author    Stephen Hines <srhines@google.com>  2014-12-01 14:51:49 -0800
committer Stephen Hines <srhines@google.com>  2014-12-02 16:08:10 -0800
commit    37ed9c199ca639565f6ce88105f9e39e898d82d0 (patch)
tree      8fb36d3910e3ee4c4e1b7422f4f017108efc52f5 /test/CodeGen/X86
parent    d2327b22152ced7bc46dc629fc908959e8a52d03 (diff)
Update aosp/master LLVM for rebase to r222494.
Change-Id: Ic787f5e0124df789bd26f3f24680f45e678eef2d
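
Most of the churn in this patch is mechanical: the @llvm.dbg.declare and @llvm.dbg.value intrinsics gained an extra metadata operand carrying the complex expression, and debug-info descriptor nodes moved from integer-tag fields to a packed "0x..." string header. A minimal sketch of the pattern, taken from the 2009-02-12-DebugInfoVLA.ll hunk below (the !{metadata !"0x102"} operand appears to be the empty expression):

-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-  call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0), !dbg !7
+  call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0, metadata !{metadata !"0x102"}), !dbg !7
-!0 = metadata !{i32 459009, metadata !1, metadata !"s1", metadata !2, i32 2, metadata !6} ; [ DW_TAG_arg_variable ]
+!0 = metadata !{metadata !"0x101\00s1\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]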
Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--  test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll | 3
-rw-r--r--  test/CodeGen/X86/2008-06-18-BadShuffle.ll | 10
-rw-r--r--  test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll | 28
-rw-r--r--  test/CodeGen/X86/2009-02-26-MachineLICMBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll | 30
-rw-r--r--  test/CodeGen/X86/2009-06-05-VZextByteShort.ll | 28
-rw-r--r--  test/CodeGen/X86/2009-10-16-Scope.ll | 16
-rw-r--r--  test/CodeGen/X86/2010-01-18-DbgValue.ll | 32
-rw-r--r--  test/CodeGen/X86/2010-02-01-DbgValueCrash.ll | 28
-rw-r--r--  test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll | 6
-rw-r--r--  test/CodeGen/X86/2010-05-25-DotDebugLoc.ll | 87
-rw-r--r--  test/CodeGen/X86/2010-05-26-DotDebugLoc.ll | 68
-rw-r--r--  test/CodeGen/X86/2010-05-28-Crash.ll | 30
-rw-r--r--  test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll | 62
-rw-r--r--  test/CodeGen/X86/2010-07-06-DbgCrash.ll | 26
-rw-r--r--  test/CodeGen/X86/2010-08-04-StackVariable.ll | 78
-rw-r--r--  test/CodeGen/X86/2010-09-16-EmptyFilename.ll | 22
-rw-r--r--  test/CodeGen/X86/2010-11-02-DbgParameter.ll | 28
-rw-r--r--  test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll | 48
-rw-r--r--  test/CodeGen/X86/2011-08-29-InitOrder.ll | 2
-rw-r--r--  test/CodeGen/X86/2012-04-26-sdglue.ll | 2
-rw-r--r--  test/CodeGen/X86/2012-07-15-broadcastfold.ll | 1
-rw-r--r--  test/CodeGen/X86/2012-10-02-DAGCycle.ll | 4
-rw-r--r--  test/CodeGen/X86/2012-11-30-handlemove-dbg.ll | 22
-rw-r--r--  test/CodeGen/X86/2012-11-30-misched-dbg.ll | 53
-rw-r--r--  test/CodeGen/X86/2012-11-30-regpres-dbg.ll | 20
-rw-r--r--  test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll | 2
-rw-r--r--  test/CodeGen/X86/2014-08-29-CompactUnwind.ll | 46
-rw-r--r--  test/CodeGen/X86/MachineSink-DbgValue.ll | 32
-rw-r--r--  test/CodeGen/X86/StackColoring-dbg.ll | 12
-rw-r--r--  test/CodeGen/X86/SwizzleShuff.ll | 2
-rw-r--r--  test/CodeGen/X86/TruncAssertZext.ll | 16
-rw-r--r--  test/CodeGen/X86/add-of-carry.ll | 2
-rw-r--r--  test/CodeGen/X86/add_shl_constant.ll | 49
-rw-r--r--  test/CodeGen/X86/addr-mode-matcher.ll | 62
-rw-r--r--  test/CodeGen/X86/address-type-promotion-constantexpr.ll | 16
-rw-r--r--  test/CodeGen/X86/adx-intrinsics.ll | 77
-rw-r--r--  test/CodeGen/X86/aliases.ll | 6
-rw-r--r--  test/CodeGen/X86/aligned-variadic.ll | 30
-rw-r--r--  test/CodeGen/X86/alloca-align-rounding.ll | 19
-rw-r--r--  test/CodeGen/X86/asm-block-labels.ll | 2
-rw-r--r--  test/CodeGen/X86/atomic-load-store-wide.ll | 10
-rw-r--r--  test/CodeGen/X86/atomic-ops-ancient-64.ll | 1
-rw-r--r--  test/CodeGen/X86/atomic_add.ll | 17
-rw-r--r--  test/CodeGen/X86/atomic_idempotent.ll | 56
-rw-r--r--  test/CodeGen/X86/atomic_mi.ll | 525
-rw-r--r--  test/CodeGen/X86/avx-basic.ll | 40
-rw-r--r--  test/CodeGen/X86/avx-blend.ll | 202
-rw-r--r--  test/CodeGen/X86/avx-intel-ocl.ll | 32
-rw-r--r--  test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll | 26
-rw-r--r--  test/CodeGen/X86/avx-intrinsics-x86.ll | 104
-rw-r--r--  test/CodeGen/X86/avx-movdup.ll | 34
-rw-r--r--  test/CodeGen/X86/avx-sext.ll | 199
-rw-r--r--  test/CodeGen/X86/avx-shuffle.ll | 336
-rw-r--r--  test/CodeGen/X86/avx-splat.ll | 17
-rw-r--r--  test/CodeGen/X86/avx-vmovddup.ll | 14
-rw-r--r--  test/CodeGen/X86/avx-vperm2f128.ll | 69
-rw-r--r--  test/CodeGen/X86/avx-vperm2x128.ll | 202
-rw-r--r--  test/CodeGen/X86/avx-vpermil.ll | 54
-rw-r--r--  test/CodeGen/X86/avx-vshufp.ll | 157
-rw-r--r--  test/CodeGen/X86/avx-zext.ll | 41
-rw-r--r--  test/CodeGen/X86/avx.ll | 2
-rw-r--r--  test/CodeGen/X86/avx1-stack-reload-folding.ll | 68
-rw-r--r--  test/CodeGen/X86/avx2-blend.ll | 11
-rw-r--r--  test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll | 33
-rw-r--r--  test/CodeGen/X86/avx2-intrinsics-x86.ll | 76
-rw-r--r--  test/CodeGen/X86/avx2-palignr.ll | 57
-rw-r--r--  test/CodeGen/X86/avx2-shuffle.ll | 127
-rw-r--r--  test/CodeGen/X86/avx2-unpack.ll | 86
-rw-r--r--  test/CodeGen/X86/avx2-vbroadcast.ll | 2
-rw-r--r--  test/CodeGen/X86/avx2-vperm2i128.ll | 47
-rw-r--r--  test/CodeGen/X86/avx512-arith.ll | 324
-rw-r--r--  test/CodeGen/X86/avx512-build-vector.ll | 35
-rw-r--r--  test/CodeGen/X86/avx512-cmp.ll | 19
-rw-r--r--  test/CodeGen/X86/avx512-cvt.ll | 53
-rw-r--r--  test/CodeGen/X86/avx512-fma-intrinsics.ll | 66
-rw-r--r--  test/CodeGen/X86/avx512-insert-extract.ll | 25
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics.ll | 555
-rw-r--r--  test/CodeGen/X86/avx512-mask-op.ll | 42
-rw-r--r--  test/CodeGen/X86/avx512-mov.ll | 298
-rw-r--r--  test/CodeGen/X86/avx512-select.ll | 53
-rw-r--r--  test/CodeGen/X86/avx512-shuffle.ll | 314
-rw-r--r--  test/CodeGen/X86/avx512-trunc-ext.ll | 34
-rw-r--r--  test/CodeGen/X86/avx512-vbroadcast.ll | 158
-rw-r--r--  test/CodeGen/X86/avx512-vec-cmp.ll | 338
-rw-r--r--  test/CodeGen/X86/avx512-zext-load-crash.ll | 14
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics.ll | 305
-rw-r--r--  test/CodeGen/X86/avx512bw-mask-op.ll | 99
-rw-r--r--  test/CodeGen/X86/avx512bw-mov.ll | 81
-rw-r--r--  test/CodeGen/X86/avx512bw-vec-cmp.ll | 135
-rw-r--r--  test/CodeGen/X86/avx512bwvl-intrinsics.ll | 613
-rw-r--r--  test/CodeGen/X86/avx512bwvl-mov.ll | 162
-rw-r--r--  test/CodeGen/X86/avx512bwvl-vec-cmp.ll | 269
-rw-r--r--  test/CodeGen/X86/avx512dq-mask-op.ll | 38
-rw-r--r--  test/CodeGen/X86/avx512er-intrinsics.ll | 79
-rw-r--r--  test/CodeGen/X86/avx512vl-intrinsics.ll | 613
-rw-r--r--  test/CodeGen/X86/avx512vl-mov.ll | 642
-rw-r--r--  test/CodeGen/X86/avx512vl-nontemporal.ll | 34
-rw-r--r--  test/CodeGen/X86/avx512vl-vec-cmp.ll | 381
-rw-r--r--  test/CodeGen/X86/blend-msb.ll | 40
-rw-r--r--  test/CodeGen/X86/block-placement.ll | 38
-rw-r--r--  test/CodeGen/X86/byval-callee-cleanup.ll | 27
-rw-r--r--  test/CodeGen/X86/cfi_enforcing.ll | 34
-rw-r--r--  test/CodeGen/X86/cfi_invoke.ll | 35
-rw-r--r--  test/CodeGen/X86/cfi_non_default_function.ll | 27
-rw-r--r--  test/CodeGen/X86/cfi_simple_indirect_call.ll | 43
-rw-r--r--  test/CodeGen/X86/cmpxchg-clobber-flags.ll | 86
-rw-r--r--  test/CodeGen/X86/codegen-prepare-addrmode-sext.ll | 213
-rw-r--r--  test/CodeGen/X86/coff-comdat.ll | 14
-rw-r--r--  test/CodeGen/X86/coff-comdat2.ll | 2
-rw-r--r--  test/CodeGen/X86/combine-and.ll | 164
-rw-r--r--  test/CodeGen/X86/combine-or.ll | 208
-rw-r--r--  test/CodeGen/X86/combine-vec-shuffle-2.ll | 164
-rw-r--r--  test/CodeGen/X86/combine-vec-shuffle.ll | 253
-rw-r--r--  test/CodeGen/X86/commute-blend-avx2.ll | 89
-rw-r--r--  test/CodeGen/X86/commute-blend-sse41.ll | 34
-rw-r--r--  test/CodeGen/X86/commuted-blend-mask.ll | 13
-rw-r--r--  test/CodeGen/X86/constant-pool-remat-0.ll | 2
-rw-r--r--  test/CodeGen/X86/constant-pool-sharing.ll | 11
-rw-r--r--  test/CodeGen/X86/constructor.ll | 12
-rw-r--r--  test/CodeGen/X86/cvt16.ll | 37
-rw-r--r--  test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll | 132
-rw-r--r--  test/CodeGen/X86/dbg-changes-codegen.ll | 14
-rw-r--r--  test/CodeGen/X86/divide-by-constant.ll | 8
-rw-r--r--  test/CodeGen/X86/divrem8_ext.ll | 100
-rw-r--r--  test/CodeGen/X86/dllexport-x86_64.ll | 2
-rw-r--r--  test/CodeGen/X86/dllexport.ll | 2
-rw-r--r--  test/CodeGen/X86/dllimport-x86_64.ll | 2
-rw-r--r--  test/CodeGen/X86/dllimport.ll | 2
-rw-r--r--  test/CodeGen/X86/dont-trunc-store-double-to-float.ll | 20
-rw-r--r--  test/CodeGen/X86/dwarf-comp-dir.ll | 8
-rw-r--r--  test/CodeGen/X86/dynamic-alloca-lifetime.ll | 44
-rw-r--r--  test/CodeGen/X86/empty-functions.ll | 31
-rw-r--r--  test/CodeGen/X86/exedepsfix-broadcast.ll | 9
-rw-r--r--  test/CodeGen/X86/extractelement-load.ll | 39
-rw-r--r--  test/CodeGen/X86/fast-isel-args-fail.ll | 1
-rw-r--r--  test/CodeGen/X86/fast-isel-cmp-branch3.ll | 10
-rw-r--r--  test/CodeGen/X86/fast-isel-constpool.ll | 36
-rw-r--r--  test/CodeGen/X86/fast-isel-mem.ll | 4
-rw-r--r--  test/CodeGen/X86/fast-isel-tls.ll | 2
-rw-r--r--  test/CodeGen/X86/fast-isel-x32.ll | 14
-rw-r--r--  test/CodeGen/X86/fast-isel-x86-64.ll | 20
-rw-r--r--  test/CodeGen/X86/fast-isel-x86.ll | 18
-rw-r--r--  test/CodeGen/X86/fastmath-optnone.ll | 35
-rw-r--r--  test/CodeGen/X86/fma-intrinsics-x86_64.ll (renamed from test/CodeGen/X86/fma4-intrinsics-x86_64.ll) | 238
-rw-r--r--  test/CodeGen/X86/fma-phi-213-to-231.ll | 246
-rw-r--r--  test/CodeGen/X86/fma.ll | 8
-rw-r--r--  test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll | 84
-rw-r--r--  test/CodeGen/X86/fma_patterns.ll | 4
-rw-r--r--  test/CodeGen/X86/fmaxnum.ll | 50
-rw-r--r--  test/CodeGen/X86/fminnum.ll | 95
-rw-r--r--  test/CodeGen/X86/fmul-combines.ll | 147
-rw-r--r--  test/CodeGen/X86/fnabs.ll | 77
-rw-r--r--  test/CodeGen/X86/fold-pcmpeqd-0.ll | 117
-rw-r--r--  test/CodeGen/X86/fold-tied-op.ll | 84
-rw-r--r--  test/CodeGen/X86/fp-load-trunc.ll | 96
-rw-r--r--  test/CodeGen/X86/fp-trunc.ll | 90
-rw-r--r--  test/CodeGen/X86/fpstack-debuginstr-kill.ll | 71
-rw-r--r--  test/CodeGen/X86/frameaddr.ll | 15
-rw-r--r--  test/CodeGen/X86/gcc_except_table_functions.ll | 53
-rw-r--r--  test/CodeGen/X86/global-sections.ll | 29
-rw-r--r--  test/CodeGen/X86/half.ll | 69
-rw-r--r--  test/CodeGen/X86/i8-umulo.ll | 24
-rw-r--r--  test/CodeGen/X86/inalloca-regparm.ll | 15
-rw-r--r--  test/CodeGen/X86/inline-asm-fpstack.ll | 62
-rw-r--r--  test/CodeGen/X86/jump_sign.ll | 2
-rw-r--r--  test/CodeGen/X86/jump_table_alias.ll | 3
-rw-r--r--  test/CodeGen/X86/jump_table_align.ll | 29
-rw-r--r--  test/CodeGen/X86/jump_table_bitcast.ll | 9
-rw-r--r--  test/CodeGen/X86/jump_tables.ll | 45
-rw-r--r--  test/CodeGen/X86/lea-2.ll | 5
-rw-r--r--  test/CodeGen/X86/lea-3.ll | 2
-rw-r--r--  test/CodeGen/X86/lea-4.ll | 5
-rw-r--r--  test/CodeGen/X86/lea-5.ll | 59
-rw-r--r--  test/CodeGen/X86/lea.ll | 2
-rw-r--r--  test/CodeGen/X86/long-extend.ll | 18
-rw-r--r--  test/CodeGen/X86/loop-strength-reduce8.ll | 5
-rw-r--r--  test/CodeGen/X86/lower-bitcast.ll | 4
-rw-r--r--  test/CodeGen/X86/mem-intrin-base-reg.ll | 100
-rw-r--r--  test/CodeGen/X86/mem-promote-integers.ll | 4
-rw-r--r--  test/CodeGen/X86/misched-matmul.ll | 2
-rw-r--r--  test/CodeGen/X86/movgs.ll | 86
-rw-r--r--  test/CodeGen/X86/ms-inline-asm.ll | 11
-rw-r--r--  test/CodeGen/X86/musttail-varargs.ll | 119
-rw-r--r--  test/CodeGen/X86/nancvt.ll | 2
-rw-r--r--  test/CodeGen/X86/narrow-shl-load.ll | 34
-rw-r--r--  test/CodeGen/X86/nonconst-static-ev.ll | 1
-rw-r--r--  test/CodeGen/X86/nonconst-static-iv.ll | 1
-rw-r--r--  test/CodeGen/X86/nontemporal-2.ll | 31
-rw-r--r--  test/CodeGen/X86/null-streamer.ll | 14
-rw-r--r--  test/CodeGen/X86/object-size.ll | 6
-rw-r--r--  test/CodeGen/X86/osx-private-labels.ll | 17
-rw-r--r--  test/CodeGen/X86/palignr.ll | 104
-rw-r--r--  test/CodeGen/X86/patchpoint-invoke.ll | 63
-rw-r--r--  test/CodeGen/X86/patchpoint-webkit_jscc.ll | 88
-rw-r--r--  test/CodeGen/X86/patchpoint.ll | 69
-rw-r--r--  test/CodeGen/X86/peep-vector-extract-concat.ll | 11
-rw-r--r--  test/CodeGen/X86/peep-vector-extract-insert.ll | 12
-rw-r--r--  test/CodeGen/X86/peephole-fold-movsd.ll | 31
-rw-r--r--  test/CodeGen/X86/pmul.ll | 100
-rw-r--r--  test/CodeGen/X86/pr11334.ll | 8
-rw-r--r--  test/CodeGen/X86/pr12359.ll | 10
-rw-r--r--  test/CodeGen/X86/pr14161.ll | 23
-rw-r--r--  test/CodeGen/X86/pr15267.ll | 33
-rw-r--r--  test/CodeGen/X86/pr18846.ll | 139
-rw-r--r--  test/CodeGen/X86/pr21099.ll | 10
-rw-r--r--  test/CodeGen/X86/pr21529.ll | 15
-rw-r--r--  test/CodeGen/X86/pshufb-mask-comments.ll | 30
-rw-r--r--  test/CodeGen/X86/ragreedy-last-chance-recoloring.ll | 6
-rw-r--r--  test/CodeGen/X86/recip-fastmath.ll | 109
-rw-r--r--  test/CodeGen/X86/return_zeroext_i2.ll | 7
-rw-r--r--  test/CodeGen/X86/segmented-stacks-dynamic.ll | 22
-rw-r--r--  test/CodeGen/X86/segmented-stacks.ll | 55
-rw-r--r--  test/CodeGen/X86/select.ll | 44
-rw-r--r--  test/CodeGen/X86/sext-i1.ll | 33
-rw-r--r--  test/CodeGen/X86/shift-parts.ll | 4
-rw-r--r--  test/CodeGen/X86/shuffle-combine-crash.ll | 30
-rw-r--r--  test/CodeGen/X86/sincos-opt.ll | 3
-rw-r--r--  test/CodeGen/X86/sink-blockfreq.ll | 45
-rw-r--r--  test/CodeGen/X86/sink-out-of-loop.ll | 23
-rw-r--r--  test/CodeGen/X86/slow-incdec.ll | 80
-rw-r--r--  test/CodeGen/X86/splat-for-size.ll | 141
-rw-r--r--  test/CodeGen/X86/splat-scalar-load.ll | 17
-rw-r--r--  test/CodeGen/X86/sqrt-fastmath.ll | 76
-rw-r--r--  test/CodeGen/X86/sse-align-12.ll | 34
-rw-r--r--  test/CodeGen/X86/sse-minmax.ll | 48
-rw-r--r--  test/CodeGen/X86/sse-scalar-fp-arith-2.ll | 423
-rw-r--r--  test/CodeGen/X86/sse-scalar-fp-arith.ll | 840
-rw-r--r--  test/CodeGen/X86/sse1.ll | 36
-rw-r--r--  test/CodeGen/X86/sse2-blend.ll | 57
-rw-r--r--  test/CodeGen/X86/sse2-intrinsics-x86.ll | 60
-rw-r--r--  test/CodeGen/X86/sse2-mul.ll | 14
-rw-r--r--  test/CodeGen/X86/sse2.ll | 255
-rw-r--r--  test/CodeGen/X86/sse3-avx-addsub-2.ll | 2
-rw-r--r--  test/CodeGen/X86/sse3-avx-addsub.ll | 155
-rw-r--r--  test/CodeGen/X86/sse3.ll | 259
-rw-r--r--  test/CodeGen/X86/sse41-blend.ll | 140
-rw-r--r--  test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll | 61
-rw-r--r--  test/CodeGen/X86/sse41-intrinsics-x86.ll | 28
-rw-r--r--  test/CodeGen/X86/sse41.ll | 1046
-rw-r--r--  test/CodeGen/X86/stack-protector-dbginfo.ll | 90
-rw-r--r--  test/CodeGen/X86/stack_guard_remat.ll | 28
-rw-r--r--  test/CodeGen/X86/stackmap-fast-isel.ll | 4
-rw-r--r--  test/CodeGen/X86/stackmap-large-constants.ll | 83
-rw-r--r--  test/CodeGen/X86/stackmap-liveness.ll | 4
-rw-r--r--  test/CodeGen/X86/stackmap-nops.ll | 6
-rw-r--r--  test/CodeGen/X86/stackmap-shadow-optimization.ll | 28
-rw-r--r--  test/CodeGen/X86/stackmap.ll | 25
-rw-r--r--  test/CodeGen/X86/store-narrow.ll | 12
-rw-r--r--  test/CodeGen/X86/swizzle-2.ll | 427
-rw-r--r--  test/CodeGen/X86/swizzle.ll | 19
-rw-r--r--  test/CodeGen/X86/tailcall-multiret.ll | 16
-rw-r--r--  test/CodeGen/X86/tls-addr-non-leaf-function.ll | 37
-rw-r--r--  test/CodeGen/X86/trunc-ext-ld-st.ll | 8
-rw-r--r--  test/CodeGen/X86/uint_to_fp-2.ll | 46
-rw-r--r--  test/CodeGen/X86/unknown-location.ll | 16
-rw-r--r--  test/CodeGen/X86/v-binop-widen.ll | 11
-rw-r--r--  test/CodeGen/X86/v-binop-widen2.ll | 47
-rw-r--r--  test/CodeGen/X86/v2f32.ll | 139
-rw-r--r--  test/CodeGen/X86/vararg-callee-cleanup.ll | 54
-rw-r--r--  test/CodeGen/X86/vararg_no_start.ll | 9
-rw-r--r--  test/CodeGen/X86/vastart-defs-eflags.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_cast2.ll | 190
-rw-r--r--  test/CodeGen/X86/vec_compare-2.ll | 30
-rw-r--r--  test/CodeGen/X86/vec_ctbits.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_extract-sse4.ll | 39
-rw-r--r--  test/CodeGen/X86/vec_extract.ll | 48
-rw-r--r--  test/CodeGen/X86/vec_fabs.ll | 47
-rw-r--r--  test/CodeGen/X86/vec_fneg.ll | 44
-rw-r--r--  test/CodeGen/X86/vec_fpext.ll | 6
-rw-r--r--  test/CodeGen/X86/vec_insert-5.ll | 95
-rw-r--r--  test/CodeGen/X86/vec_insert-6.ll | 9
-rw-r--r--  test/CodeGen/X86/vec_insert.ll | 19
-rw-r--r--  test/CodeGen/X86/vec_return.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_set-3.ll | 44
-rw-r--r--  test/CodeGen/X86/vec_set-5.ll | 28
-rw-r--r--  test/CodeGen/X86/vec_set-9.ll | 14
-rw-r--r--  test/CodeGen/X86/vec_set-E.ll | 9
-rw-r--r--  test/CodeGen/X86/vec_set-G.ll | 9
-rw-r--r--  test/CodeGen/X86/vec_set-I.ll | 13
-rw-r--r--  test/CodeGen/X86/vec_set-J.ll | 10
-rw-r--r--  test/CodeGen/X86/vec_setcc.ll | 6
-rw-r--r--  test/CodeGen/X86/vec_sext.ll | 69
-rw-r--r--  test/CodeGen/X86/vec_shuffle-11.ll | 11
-rw-r--r--  test/CodeGen/X86/vec_shuffle-14.ll | 70
-rw-r--r--  test/CodeGen/X86/vec_shuffle-15.ll | 81
-rw-r--r--  test/CodeGen/X86/vec_shuffle-16.ll | 43
-rw-r--r--  test/CodeGen/X86/vec_shuffle-17.ll | 16
-rw-r--r--  test/CodeGen/X86/vec_shuffle-18.ll | 25
-rw-r--r--  test/CodeGen/X86/vec_shuffle-19.ll | 9
-rw-r--r--  test/CodeGen/X86/vec_shuffle-20.ll | 8
-rw-r--r--  test/CodeGen/X86/vec_shuffle-22.ll | 15
-rw-r--r--  test/CodeGen/X86/vec_shuffle-23.ll | 18
-rw-r--r--  test/CodeGen/X86/vec_shuffle-24.ll | 18
-rw-r--r--  test/CodeGen/X86/vec_shuffle-25.ll | 34
-rw-r--r--  test/CodeGen/X86/vec_shuffle-26.ll | 68
-rw-r--r--  test/CodeGen/X86/vec_shuffle-27.ll | 38
-rw-r--r--  test/CodeGen/X86/vec_shuffle-28.ll | 14
-rw-r--r--  test/CodeGen/X86/vec_shuffle-30.ll | 26
-rw-r--r--  test/CodeGen/X86/vec_shuffle-31.ll | 8
-rw-r--r--  test/CodeGen/X86/vec_shuffle-34.ll | 7
-rw-r--r--  test/CodeGen/X86/vec_shuffle-35.ll | 20
-rw-r--r--  test/CodeGen/X86/vec_shuffle-36.ll | 16
-rw-r--r--  test/CodeGen/X86/vec_shuffle-37.ll | 47
-rw-r--r--  test/CodeGen/X86/vec_shuffle-38.ll | 77
-rw-r--r--  test/CodeGen/X86/vec_shuffle-39.ll | 86
-rw-r--r--  test/CodeGen/X86/vec_shuffle-40.ll | 22
-rw-r--r--  test/CodeGen/X86/vec_shuffle-41.ll | 21
-rw-r--r--  test/CodeGen/X86/vec_shuffle.ll | 50
-rw-r--r--  test/CodeGen/X86/vec_splat-2.ll | 33
-rw-r--r--  test/CodeGen/X86/vec_splat-3.ll | 230
-rw-r--r--  test/CodeGen/X86/vec_splat.ll | 68
-rw-r--r--  test/CodeGen/X86/vec_trunc_sext.ll | 30
-rw-r--r--  test/CodeGen/X86/vec_uint_to_fp.ll | 164
-rw-r--r--  test/CodeGen/X86/vec_unsafe-fp-math.ll | 23
-rw-r--r--  test/CodeGen/X86/vec_zext.ll | 69
-rw-r--r--  test/CodeGen/X86/vector-blend.ll | 708
-rw-r--r--  test/CodeGen/X86/vector-idiv.ll | 1348
-rw-r--r--  test/CodeGen/X86/vector-sext.ll | 943
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v16.ll | 1150
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v2.ll | 1117
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v4.ll | 1368
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v8.ll | 2020
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v16.ll | 1267
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v32.ll | 1562
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v4.ll | 748
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v8.ll | 1931
-rw-r--r--  test/CodeGen/X86/vector-shuffle-512-v8.ll | 1429
-rw-r--r--  test/CodeGen/X86/vector-shuffle-combining.ll | 2536
-rw-r--r--  test/CodeGen/X86/vector-shuffle-sse1.ll | 235
-rw-r--r--  test/CodeGen/X86/vector-zext.ll | 206
-rw-r--r--  test/CodeGen/X86/vectorcall.ll | 93
-rw-r--r--  test/CodeGen/X86/vselect-avx.ll | 85
-rw-r--r--  test/CodeGen/X86/vselect.ll | 278
-rw-r--r--  test/CodeGen/X86/widen_cast-1.ll | 4
-rw-r--r--  test/CodeGen/X86/widen_conv-1.ll | 2
-rw-r--r--  test/CodeGen/X86/widen_conversions.ll | 2
-rw-r--r--  test/CodeGen/X86/widen_load-2.ll | 192
-rw-r--r--  test/CodeGen/X86/widen_shuffle-1.ll | 57
-rw-r--r--  test/CodeGen/X86/win32-pic-jumptable.ll | 36
-rw-r--r--  test/CodeGen/X86/win64_call_epi.ll | 65
-rw-r--r--  test/CodeGen/X86/win64_vararg.ll | 19
-rw-r--r--  test/CodeGen/X86/win_cst_pool.ll | 66
-rw-r--r--  test/CodeGen/X86/windows-itanium-alloca.ll | 16
-rw-r--r--  test/CodeGen/X86/x32-function_pointer-1.ll | 20
-rw-r--r--  test/CodeGen/X86/x32-function_pointer-2.ll | 21
-rw-r--r--  test/CodeGen/X86/x32-function_pointer-3.ll | 30
-rw-r--r--  test/CodeGen/X86/x86-64-call.ll | 15
-rw-r--r--  test/CodeGen/X86/x86-64-pic-10.ll | 2
-rw-r--r--  test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll | 34
-rw-r--r--  test/CodeGen/X86/x86-64-tls-1.ll | 9
-rw-r--r--  test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll | 35
-rw-r--r--  test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll | 74
-rw-r--r--  test/CodeGen/X86/xaluo.ll | 477
355 files changed, 33385 insertions, 8947 deletions
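
Another recurring pattern in the hunks that follow is the migration of RUN lines from grep pipelines to FileCheck assertions; a representative sketch, mirroring the 2008-02-06-LoadFoldingBug.ll hunk below:

-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep xor | grep CPI
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; CHECK: xorpd {{.*}}{{LCPI0_0|__xmm@}}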
diff --git a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
index 638d399..62c503d 100644
--- a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
+++ b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=i686-pc-linux-gnu | FileCheck %s
-@__gthrw_pthread_once = alias weak i32 (i32*, void ()*)* @pthread_once ; <i32 (i32*, void ()*)*> [#uses=0]
+@__gthrw_pthread_once = weak alias i32 (i32*, void ()*)* @pthread_once ; <i32 (i32*, void ()*)*> [#uses=0]
define weak i32 @pthread_once(i32*, void ()*) {
ret i32 0
diff --git a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll
index d2d5149..35857b7 100644
--- a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll
+++ b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep xor | grep CPI
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; CHECK: xorpd {{.*}}{{LCPI0_0|__xmm@}}
define void @casin({ double, double }* sret %agg.result, double %z.0, double %z.1) nounwind {
entry:
%memtmp = alloca { double, double }, align 8 ; <{ double, double }*> [#uses=3]
diff --git a/test/CodeGen/X86/2008-06-18-BadShuffle.ll b/test/CodeGen/X86/2008-06-18-BadShuffle.ll
deleted file mode 100644
index 66f9065..0000000
--- a/test/CodeGen/X86/2008-06-18-BadShuffle.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=i386 -mattr=+sse2 | grep pinsrw
-
-; Test to make sure we actually insert the bottom element of the vector
-define <8 x i16> @a(<8 x i16> %a) nounwind {
-entry:
- shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> < i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8 >
- %add = add <8 x i16> %0, %a
- ret <8 x i16> %add
-}
-
diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index 296f0ca..207d122 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -14,9 +14,9 @@ entry:
%2 = alloca i64 ; <i64*> [#uses=1]
%3 = alloca i64 ; <i64*> [#uses=6]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
- call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0), !dbg !7
+ call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0, metadata !{metadata !"0x102"}), !dbg !7
store i8* %s1, i8** %s1_addr
- call void @llvm.dbg.declare(metadata !{[0 x i8]** %str.0}, metadata !8), !dbg !7
+ call void @llvm.dbg.declare(metadata !{[0 x i8]** %str.0}, metadata !8, metadata !{metadata !"0x102"}), !dbg !7
%4 = call i8* @llvm.stacksave(), !dbg !7 ; <i8*> [#uses=1]
store i8* %4, i8** %saved_stack.1, align 8, !dbg !7
%5 = load i8** %s1_addr, align 8, !dbg !13 ; <i8*> [#uses=1]
@@ -58,7 +58,7 @@ return: ; preds = %entry
ret i8 %retval12, !dbg !16
}
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
declare i8* @llvm.stacksave() nounwind
@@ -66,21 +66,21 @@ declare i64 @strlen(i8*) nounwind readonly
declare void @llvm.stackrestore(i8*) nounwind
-!0 = metadata !{i32 459009, metadata !1, metadata !"s1", metadata !2, i32 2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 458798, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 458769, metadata !17, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 458773, null, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00s1\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\000", i32 0, metadata !2, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !17, metadata !18, metadata !18, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5, metadata !6}
-!5 = metadata !{i32 458788, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
+!5 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !5} ; [ DW_TAG_pointer_type ]
!7 = metadata !{i32 2, i32 0, metadata !1, null}
-!8 = metadata !{i32 459008, metadata !1, metadata !"str.0", metadata !2, i32 3, metadata !9} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 458753, null, metadata !2, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !5, metadata !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!8 = metadata !{metadata !"0x100\00str.0\003\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", null, metadata !2, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x1\00\000\008\008\000\000", null, metadata !2, metadata !5, metadata !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
!11 = metadata !{metadata !12}
-!12 = metadata !{i32 458785, i64 0, i64 1} ; [ DW_TAG_subrange_type ]
+!12 = metadata !{metadata !"0x21\000\001"} ; [ DW_TAG_subrange_type ]
!13 = metadata !{i32 3, i32 0, metadata !14, null}
-!14 = metadata !{i32 458763, metadata !17, metadata !1, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\000\000\000", metadata !17, metadata !1} ; [ DW_TAG_lexical_block ]
!15 = metadata !{i32 4, i32 0, metadata !14, null}
!16 = metadata !{i32 5, i32 0, metadata !14, null}
!17 = metadata !{metadata !"vla.c", metadata !"/tmp/"}
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 764c2cd..e046b96 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "4 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "7 machine-licm"
; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
; rdar://6627786
; rdar://7792037
diff --git a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
deleted file mode 100644
index e1930e0..0000000
--- a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -mtriple=i386-apple-darwin10.0 -relocation-model=pic -asm-verbose=false \
-; RUN: -mcpu=generic -disable-fp-elim -mattr=-sse4.1,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
-; RUN: FileCheck %s
-; rdar://6808032
-
-; CHECK: pextrw $14
-; CHECK-NEXT: shrl $8
-; CHECK-NEXT: pinsrw
-
-define void @update(i8** %args_list) nounwind {
-entry:
- %cmp.i = icmp eq i32 0, 0 ; <i1> [#uses=1]
- br i1 %cmp.i, label %if.then.i, label %test_cl.exit
-
-if.then.i: ; preds = %entry
- %val = load <16 x i8> addrspace(1)* null ; <<16 x i8>> [#uses=8]
- %tmp10.i = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 undef, i8 0, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef>, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 4, i32 undef, i32 6, i32 undef, i32 29, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1]
- %tmp17.i = shufflevector <16 x i8> %tmp10.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 18, i32 4, i32 undef, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1]
- %tmp24.i = shufflevector <16 x i8> %tmp17.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 24, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1]
- %tmp31.i = shufflevector <16 x i8> %tmp24.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 21, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1]
- %tmp38.i = shufflevector <16 x i8> %tmp31.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 27, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 13, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1]
- %tmp45.i = shufflevector <16 x i8> %tmp38.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 13, i32 29, i32 undef> ; <<16 x i8>> [#uses=1]
- %tmp52.i = shufflevector <16 x i8> %tmp45.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 21, i32 10, i32 11, i32 12, i32 13, i32 14, i32 undef> ; <<16 x i8>> [#uses=1]
- %tmp59.i = shufflevector <16 x i8> %tmp52.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 20> ; <<16 x i8>> [#uses=1]
- store <16 x i8> %tmp59.i, <16 x i8> addrspace(1)* null
- ret void
-
-test_cl.exit: ; preds = %entry
- ret void
-}
diff --git a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
index 50c62df..ffbe02c 100644
--- a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
+++ b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 > %t1
-; RUN: grep movzwl %t1 | count 2
-; RUN: grep movzbl %t1 | count 1
-; RUN: grep movd %t1 | count 4
+; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
define <4 x i16> @a(i32* %x1) nounwind {
+; CHECK-LABEL: a:
+; CHECK: shrl %[[R:[^,]+]]
+; CHECK-NEXT: movd %[[R]], %xmm0
+; CHECK-NEXT: retl
+
%x2 = load i32* %x1
%x3 = lshr i32 %x2, 1
%x = trunc i32 %x3 to i16
@@ -12,6 +14,12 @@ define <4 x i16> @a(i32* %x1) nounwind {
}
define <8 x i16> @b(i32* %x1) nounwind {
+; CHECK-LABEL: b:
+; CHECK: shrl %e[[R:.]]x
+; CHECK-NEXT: movzwl %[[R]]x, %e[[R]]x
+; CHECK-NEXT: movd %e[[R]]x, %xmm0
+; CHECK-NEXT: retl
+
%x2 = load i32* %x1
%x3 = lshr i32 %x2, 1
%x = trunc i32 %x3 to i16
@@ -20,6 +28,12 @@ define <8 x i16> @b(i32* %x1) nounwind {
}
define <8 x i8> @c(i32* %x1) nounwind {
+; CHECK-LABEL: c:
+; CHECK: shrl %e[[R:.]]x
+; CHECK-NEXT: movzwl %[[R]]x, %e[[R]]x
+; CHECK-NEXT: movd %e[[R]]x, %xmm0
+; CHECK-NEXT: retl
+
%x2 = load i32* %x1
%x3 = lshr i32 %x2, 1
%x = trunc i32 %x3 to i8
@@ -28,6 +42,12 @@ define <8 x i8> @c(i32* %x1) nounwind {
}
define <16 x i8> @d(i32* %x1) nounwind {
+; CHECK-LABEL: d:
+; CHECK: shrl %e[[R:.]]x
+; CHECK-NEXT: movzbl %[[R]]l, %e[[R]]x
+; CHECK-NEXT: movd %e[[R]]x, %xmm0
+; CHECK-NEXT: retl
+
%x2 = load i32* %x1
%x3 = lshr i32 %x2, 1
%x = trunc i32 %x3 to i8
diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll
index a936edc..6fe2ee4 100644
--- a/test/CodeGen/X86/2009-10-16-Scope.ll
+++ b/test/CodeGen/X86/2009-10-16-Scope.ll
@@ -9,7 +9,7 @@ entry:
br label %do.body, !dbg !0
do.body: ; preds = %entry
- call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4)
+ call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4, metadata !{metadata !"0x102"})
%conv = ptrtoint i32* %count_ to i32, !dbg !0 ; <i32> [#uses=1]
%call = call i32 @foo(i32 %conv) ssp, !dbg !0 ; <i32> [#uses=0]
br label %do.end, !dbg !0
@@ -18,17 +18,17 @@ do.end: ; preds = %do.body
ret void, !dbg !7
}
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
declare i32 @foo(i32) ssp
!0 = metadata !{i32 5, i32 2, metadata !1, null}
-!1 = metadata !{i32 458763, null, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!3 = metadata !{i32 458769, metadata !8, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, null, metadata !9, null, null, null, metadata !""}; [DW_TAG_compile_unit ]
-!4 = metadata !{i32 459008, metadata !5, metadata !"count_", metadata !3, i32 5, metadata !6}; [ DW_TAG_auto_variable ]
-!5 = metadata !{i32 458763, null, metadata !1, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!6 = metadata !{i32 458788, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}; [DW_TAG_base_type ]
+!1 = metadata !{metadata !"0xb\001\001\000", null, metadata !2}; [DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", i32 0, metadata !3, null, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = metadata !{metadata !"0x11\0012\00clang 1.1\001\00\000\00\000", metadata !8, null, metadata !9, null, null, null}; [DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x100\00count_\005\000", metadata !5, metadata !3, metadata !6}; [ DW_TAG_auto_variable ]
+!5 = metadata !{metadata !"0xb\001\001\000", null, metadata !1}; [DW_TAG_lexical_block ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3}; [DW_TAG_base_type ]
!7 = metadata !{i32 6, i32 1, metadata !2, null}
!8 = metadata !{metadata !"genmodes.i", metadata !"/Users/yash/Downloads"}
!9 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index f99e682..0e2ed9d 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -12,7 +12,7 @@ entry:
%retval = alloca double ; <double*> [#uses=2]
%0 = alloca double ; <double*> [#uses=2]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
- call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0), !dbg !15
+ call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0, metadata !{metadata !"0x102"}), !dbg !15
%1 = getelementptr inbounds %struct.Rect* %my_r0, i32 0, i32 0, !dbg !16 ; <%struct.Pt*> [#uses=1]
%2 = getelementptr inbounds %struct.Pt* %1, i32 0, i32 0, !dbg !16 ; <double*> [#uses=1]
%3 = load double* %2, align 8, !dbg !16 ; <double> [#uses=1]
@@ -26,30 +26,30 @@ return: ; preds = %entry
ret double %retval1, !dbg !16
}
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
!llvm.dbg.cu = !{!3}
!llvm.module.flags = !{!21}
-!0 = metadata !{i32 786689, metadata !1, metadata !"my_r0", metadata !2, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 11, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (%struct.Rect*)* @foo, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !19, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00my_r0\0011\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0011\000\001\000\006\000\000\0011", metadata !19, metadata !2, metadata !4, null, double (%struct.Rect*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !19, metadata !20, metadata !20, metadata !18, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !19, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!5 = metadata !{metadata !6, metadata !7}
-!6 = metadata !{i32 786468, metadata !19, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !19, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x13\00Rect\006\00256\0064\000\000\000", metadata !19, metadata !2, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
!8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P1", i32 7, i64 128, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0xd\00P1\007\00128\0064\000\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0x13\00Pt\001\00128\0064\000\000\000", metadata !19, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
!11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"x", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!13 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"y", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
-!14 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P2", i32 8, i64 128, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ]
+!12 = metadata !{metadata !"0xd\00x\002\0064\0064\000\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
+!13 = metadata !{metadata !"0xd\00y\003\0064\0064\0064\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
+!14 = metadata !{metadata !"0xd\00P2\008\00128\0064\00128\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
!15 = metadata !{i32 11, i32 0, metadata !1, null}
!16 = metadata !{i32 12, i32 0, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !19, metadata !1, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!17 = metadata !{metadata !"0xb\0011\000\000", metadata !19, metadata !1} ; [ DW_TAG_lexical_block ]
!18 = metadata !{metadata !1}
!19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
!20 = metadata !{i32 0}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
index 4d4e8c1..a35efdc 100644
--- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
+++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
@@ -8,28 +8,28 @@
define i32 @"main(tart.core.String[])->int32"(i32 %args) {
entry:
- tail call void @llvm.dbg.value(metadata !14, i64 0, metadata !8)
+ tail call void @llvm.dbg.value(metadata !14, i64 0, metadata !8, metadata !{metadata !"0x102"})
tail call void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType* @.type.SwitchStmtTest) ; <%tart.core.Object*> [#uses=2]
ret i32 3
}
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType*) nounwind readnone
-!0 = metadata !{i32 458769, metadata !15, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !16, metadata !16, null, null, null, i32 0} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 458790, metadata !15, metadata !0, metadata !"", i32 0, i64 192, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_const_type ]
-!2 = metadata !{i32 458771, metadata !15, metadata !0, metadata !"C", i32 1, i64 192, i64 64, i64 0, i32 0, null, metadata !3, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
+!0 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !15, metadata !16, metadata !16, null, null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x26\00\000\00192\0064\000\000", metadata !15, metadata !0, metadata !2} ; [ DW_TAG_const_type ]
+!2 = metadata !{metadata !"0x13\00C\001\00192\0064\000\000\000", metadata !15, metadata !0, null, metadata !3, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
!3 = metadata !{metadata !4, metadata !6, metadata !7}
-!4 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"x", i32 1, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
-!5 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"y", i32 1, i64 64, i64 64, i64 64, i32 0, metadata !5} ; [ DW_TAG_member ]
-!7 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"z", i32 1, i64 64, i64 64, i64 128, i32 0, metadata !5} ; [ DW_TAG_member ]
-!8 = metadata !{i32 459008, metadata !9, metadata !"t", metadata !0, i32 5, metadata !2} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{i32 458763, null, metadata !10, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{i32 458798, i32 0, metadata !0, metadata !"foo", metadata !"foo", metadata !"foo", i32 4, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 458773, metadata !15, metadata !0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0xd\00x\001\0064\0064\000\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
+!5 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !15, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0xd\00y\001\0064\0064\0064\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
+!7 = metadata !{metadata !"0xd\00z\001\0064\0064\00128\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
+!8 = metadata !{metadata !"0x100\00t\005\000", metadata !9, metadata !0, metadata !2} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0xb\000\000\000", null, metadata !10} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0x2e\00foo\00foo\00foo\004\000\001\000\006\000\000\000", i32 0, metadata !0, metadata !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !15, metadata !0, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!12 = metadata !{metadata !13}
-!13 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !15, metadata !0} ; [ DW_TAG_base_type ]
!14 = metadata !{%tart.reflect.ComplexType* @.type.SwitchStmtTest}
!15 = metadata !{metadata !"sm.c", metadata !""}
!16 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
index 5372bc5..60025bf 100644
--- a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
+++ b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
@@ -7,7 +7,7 @@ entry:
%tmp1 = bitcast double %a to <8 x i8>
%tmp2 = bitcast double %b to <8 x i8>
%tmp3 = add <8 x i8> %tmp1, %tmp2
-; CHECK: paddw
+; CHECK: paddb
store <8 x i8> %tmp3, <8 x i8>* null
ret void
}
@@ -18,7 +18,7 @@ entry:
%tmp1 = bitcast double %a to <4 x i16>
%tmp2 = bitcast double %b to <4 x i16>
%tmp3 = add <4 x i16> %tmp1, %tmp2
-; CHECK: paddd
+; CHECK: paddw
store <4 x i16> %tmp3, <4 x i16>* null
ret void
}
@@ -29,7 +29,7 @@ entry:
%tmp1 = bitcast double %a to <2 x i32>
%tmp2 = bitcast double %b to <2 x i32>
%tmp3 = add <2 x i32> %tmp1, %tmp2
-; CHECK: paddq
+; CHECK: paddd
store <2 x i32> %tmp3, <2 x i32>* null
ret void
}
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index 7faee99..1998011 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -2,8 +2,7 @@
; RUN: llc -mtriple=x86_64-pc-linux -O2 -regalloc=basic < %s | FileCheck %s
; Test to check .debug_loc support. This test case emits many debug_loc entries.
-; CHECK: Loc expr size
-; CHECK-NEXT: .short
+; CHECK: .short {{.*}} # Loc expr size
; CHECK-NEXT: .Ltmp
; CHECK-NEXT: DW_OP_reg
@@ -11,10 +10,10 @@
define hidden %0 @__divsc3(float %a, float %b, float %c, float %d) nounwind readnone {
entry:
- tail call void @llvm.dbg.value(metadata !{float %a}, i64 0, metadata !0)
- tail call void @llvm.dbg.value(metadata !{float %b}, i64 0, metadata !11)
- tail call void @llvm.dbg.value(metadata !{float %c}, i64 0, metadata !12)
- tail call void @llvm.dbg.value(metadata !{float %d}, i64 0, metadata !13)
+ tail call void @llvm.dbg.value(metadata !{float %a}, i64 0, metadata !0, metadata !{metadata !"0x102"})
+ tail call void @llvm.dbg.value(metadata !{float %b}, i64 0, metadata !11, metadata !{metadata !"0x102"})
+ tail call void @llvm.dbg.value(metadata !{float %c}, i64 0, metadata !12, metadata !{metadata !"0x102"})
+ tail call void @llvm.dbg.value(metadata !{float %d}, i64 0, metadata !13, metadata !{metadata !"0x102"})
%0 = tail call float @fabsf(float %c) nounwind readnone, !dbg !19 ; <float> [#uses=1]
%1 = tail call float @fabsf(float %d) nounwind readnone, !dbg !19 ; <float> [#uses=1]
%2 = fcmp olt float %0, %1, !dbg !19 ; <i1> [#uses=1]
@@ -22,34 +21,34 @@ entry:
bb: ; preds = %entry
%3 = fdiv float %c, %d, !dbg !20 ; <float> [#uses=3]
- tail call void @llvm.dbg.value(metadata !{float %3}, i64 0, metadata !16), !dbg !20
+ tail call void @llvm.dbg.value(metadata !{float %3}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !20
%4 = fmul float %3, %c, !dbg !21 ; <float> [#uses=1]
%5 = fadd float %4, %d, !dbg !21 ; <float> [#uses=2]
- tail call void @llvm.dbg.value(metadata !{float %5}, i64 0, metadata !14), !dbg !21
+ tail call void @llvm.dbg.value(metadata !{float %5}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !21
%6 = fmul float %3, %a, !dbg !22 ; <float> [#uses=1]
%7 = fadd float %6, %b, !dbg !22 ; <float> [#uses=1]
%8 = fdiv float %7, %5, !dbg !22 ; <float> [#uses=1]
- tail call void @llvm.dbg.value(metadata !{float %8}, i64 0, metadata !17), !dbg !22
+ tail call void @llvm.dbg.value(metadata !{float %8}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !22
%9 = fmul float %3, %b, !dbg !23 ; <float> [#uses=1]
%10 = fsub float %9, %a, !dbg !23 ; <float> [#uses=1]
%11 = fdiv float %10, %5, !dbg !23 ; <float> [#uses=1]
- tail call void @llvm.dbg.value(metadata !{float %11}, i64 0, metadata !18), !dbg !23
+ tail call void @llvm.dbg.value(metadata !{float %11}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !23
br label %bb2, !dbg !23
bb1: ; preds = %entry
%12 = fdiv float %d, %c, !dbg !24 ; <float> [#uses=3]
- tail call void @llvm.dbg.value(metadata !{float %12}, i64 0, metadata !16), !dbg !24
+ tail call void @llvm.dbg.value(metadata !{float %12}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !24
%13 = fmul float %12, %d, !dbg !25 ; <float> [#uses=1]
%14 = fadd float %13, %c, !dbg !25 ; <float> [#uses=2]
- tail call void @llvm.dbg.value(metadata !{float %14}, i64 0, metadata !14), !dbg !25
+ tail call void @llvm.dbg.value(metadata !{float %14}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !25
%15 = fmul float %12, %b, !dbg !26 ; <float> [#uses=1]
%16 = fadd float %15, %a, !dbg !26 ; <float> [#uses=1]
%17 = fdiv float %16, %14, !dbg !26 ; <float> [#uses=1]
- tail call void @llvm.dbg.value(metadata !{float %17}, i64 0, metadata !17), !dbg !26
+ tail call void @llvm.dbg.value(metadata !{float %17}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !26
%18 = fmul float %12, %a, !dbg !27 ; <float> [#uses=1]
%19 = fsub float %b, %18, !dbg !27 ; <float> [#uses=1]
%20 = fdiv float %19, %14, !dbg !27 ; <float> [#uses=1]
- tail call void @llvm.dbg.value(metadata !{float %20}, i64 0, metadata !18), !dbg !27
+ tail call void @llvm.dbg.value(metadata !{float %20}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !27
br label %bb2, !dbg !27
bb2: ; preds = %bb1, %bb
@@ -75,9 +74,9 @@ bb6: ; preds = %bb4
bb8: ; preds = %bb6
%27 = tail call float @copysignf(float 0x7FF0000000000000, float %c) nounwind readnone, !dbg !30 ; <float> [#uses=2]
%28 = fmul float %27, %a, !dbg !30 ; <float> [#uses=1]
- tail call void @llvm.dbg.value(metadata !{float %28}, i64 0, metadata !17), !dbg !30
+ tail call void @llvm.dbg.value(metadata !{float %28}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !30
%29 = fmul float %27, %b, !dbg !31 ; <float> [#uses=1]
- tail call void @llvm.dbg.value(metadata !{float %29}, i64 0, metadata !18), !dbg !31
+ tail call void @llvm.dbg.value(metadata !{float %29}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !31
br label %bb46, !dbg !31
bb9: ; preds = %bb6, %bb4
@@ -107,24 +106,24 @@ bb15: ; preds = %bb14
bb16: ; preds = %bb15
%iftmp.0.0 = select i1 %33, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
%42 = tail call float @copysignf(float %iftmp.0.0, float %a) nounwind readnone, !dbg !33 ; <float> [#uses=2]
- tail call void @llvm.dbg.value(metadata !{float %42}, i64 0, metadata !0), !dbg !33
+ tail call void @llvm.dbg.value(metadata !{float %42}, i64 0, metadata !0, metadata !{metadata !"0x102"}), !dbg !33
%43 = fcmp ord float %b, 0.000000e+00 ; <i1> [#uses=1]
%44 = fsub float %b, %b, !dbg !34 ; <float> [#uses=1]
%45 = fcmp uno float %44, 0.000000e+00 ; <i1> [#uses=1]
%46 = and i1 %43, %45, !dbg !34 ; <i1> [#uses=1]
%iftmp.1.0 = select i1 %46, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
%47 = tail call float @copysignf(float %iftmp.1.0, float %b) nounwind readnone, !dbg !34 ; <float> [#uses=2]
- tail call void @llvm.dbg.value(metadata !{float %47}, i64 0, metadata !11), !dbg !34
+ tail call void @llvm.dbg.value(metadata !{float %47}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !34
%48 = fmul float %42, %c, !dbg !35 ; <float> [#uses=1]
%49 = fmul float %47, %d, !dbg !35 ; <float> [#uses=1]
%50 = fadd float %48, %49, !dbg !35 ; <float> [#uses=1]
%51 = fmul float %50, 0x7FF0000000000000, !dbg !35 ; <float> [#uses=1]
- tail call void @llvm.dbg.value(metadata !{float %51}, i64 0, metadata !17), !dbg !35
+ tail call void @llvm.dbg.value(metadata !{float %51}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !35
%52 = fmul float %47, %c, !dbg !36 ; <float> [#uses=1]
%53 = fmul float %42, %d, !dbg !36 ; <float> [#uses=1]
%54 = fsub float %52, %53, !dbg !36 ; <float> [#uses=1]
%55 = fmul float %54, 0x7FF0000000000000, !dbg !36 ; <float> [#uses=1]
- tail call void @llvm.dbg.value(metadata !{float %55}, i64 0, metadata !18), !dbg !36
+ tail call void @llvm.dbg.value(metadata !{float %55}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !36
br label %bb46, !dbg !36
bb27: ; preds = %bb15, %bb14, %bb11
@@ -155,24 +154,24 @@ bb34: ; preds = %bb33, %bb30
bb35: ; preds = %bb34
%iftmp.2.0 = select i1 %59, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
%67 = tail call float @copysignf(float %iftmp.2.0, float %c) nounwind readnone, !dbg !38 ; <float> [#uses=2]
- tail call void @llvm.dbg.value(metadata !{float %67}, i64 0, metadata !12), !dbg !38
+ tail call void @llvm.dbg.value(metadata !{float %67}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !38
%68 = fcmp ord float %d, 0.000000e+00 ; <i1> [#uses=1]
%69 = fsub float %d, %d, !dbg !39 ; <float> [#uses=1]
%70 = fcmp uno float %69, 0.000000e+00 ; <i1> [#uses=1]
%71 = and i1 %68, %70, !dbg !39 ; <i1> [#uses=1]
%iftmp.3.0 = select i1 %71, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
%72 = tail call float @copysignf(float %iftmp.3.0, float %d) nounwind readnone, !dbg !39 ; <float> [#uses=2]
- tail call void @llvm.dbg.value(metadata !{float %72}, i64 0, metadata !13), !dbg !39
+ tail call void @llvm.dbg.value(metadata !{float %72}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !39
%73 = fmul float %67, %a, !dbg !40 ; <float> [#uses=1]
%74 = fmul float %72, %b, !dbg !40 ; <float> [#uses=1]
%75 = fadd float %73, %74, !dbg !40 ; <float> [#uses=1]
%76 = fmul float %75, 0.000000e+00, !dbg !40 ; <float> [#uses=1]
- tail call void @llvm.dbg.value(metadata !{float %76}, i64 0, metadata !17), !dbg !40
+ tail call void @llvm.dbg.value(metadata !{float %76}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !40
%77 = fmul float %67, %b, !dbg !41 ; <float> [#uses=1]
%78 = fmul float %72, %a, !dbg !41 ; <float> [#uses=1]
%79 = fsub float %77, %78, !dbg !41 ; <float> [#uses=1]
%80 = fmul float %79, 0.000000e+00, !dbg !41 ; <float> [#uses=1]
- tail call void @llvm.dbg.value(metadata !{float %80}, i64 0, metadata !18), !dbg !41
+ tail call void @llvm.dbg.value(metadata !{float %80}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !41
br label %bb46, !dbg !41
bb46: ; preds = %bb35, %bb34, %bb33, %bb30, %bb16, %bb8, %bb2
@@ -196,30 +195,30 @@ declare float @fabsf(float)
declare float @copysignf(float, float) nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
!llvm.dbg.cu = !{!3}
!llvm.module.flags = !{!48}
-!0 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !45, metadata !2, metadata !"__divsc3", metadata !"__divsc3", metadata !"__divsc3", i32 1922, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43, i32 1922} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !45} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !45, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !47, metadata !47, metadata !44, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !45, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00a\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00__divsc3\00__divsc3\00__divsc3\001922\000\001\000\006\000\001\001922", metadata !45, metadata !2, metadata !4, null, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !45} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !45, metadata !47, metadata !47, metadata !44, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !45, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!5 = metadata !{metadata !6, metadata !9, metadata !9, metadata !9, metadata !9}
-!6 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SCtype", i32 170, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ]
-!7 = metadata !{i32 786473, metadata !46} ; [ DW_TAG_file_type ]
-!8 = metadata !{i32 786468, metadata !45, metadata !2, metadata !"complex float", i32 0, i64 64, i64 32, i64 0, i32 0, i32 3} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SFtype", i32 167, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ]
-!10 = metadata !{i32 786468, metadata !45, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 786689, metadata !1, metadata !"b", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 786689, metadata !1, metadata !"d", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!14 = metadata !{i32 786688, metadata !15, metadata !"denom", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786443, metadata !45, metadata !1, i32 1922, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786688, metadata !15, metadata !"ratio", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!17 = metadata !{i32 786688, metadata !15, metadata !"x", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{i32 786688, metadata !15, metadata !"y", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
+!6 = metadata !{metadata !"0x16\00SCtype\00170\000\000\000\000", metadata !46, metadata !7, metadata !8} ; [ DW_TAG_typedef ]
+!7 = metadata !{metadata !"0x29", metadata !46} ; [ DW_TAG_file_type ]
+!8 = metadata !{metadata !"0x24\00complex float\000\0064\0032\000\000\003", metadata !45, metadata !2} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x16\00SFtype\00167\000\000\000\000", metadata !46, metadata !7, metadata !10} ; [ DW_TAG_typedef ]
+!10 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", metadata !45, metadata !2} ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !"0x101\00b\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!12 = metadata !{metadata !"0x101\00c\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!13 = metadata !{metadata !"0x101\00d\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!14 = metadata !{metadata !"0x100\00denom\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{metadata !"0xb\001922\000\000", metadata !45, metadata !1} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x100\00ratio\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!17 = metadata !{metadata !"0x100\00x\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!18 = metadata !{metadata !"0x100\00y\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
!19 = metadata !{i32 1929, i32 0, metadata !15, null}
!20 = metadata !{i32 1931, i32 0, metadata !15, null}
!21 = metadata !{i32 1932, i32 0, metadata !15, null}
@@ -249,4 +248,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!45 = metadata !{metadata !"libgcc2.c", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
!46 = metadata !{metadata !"libgcc2.h", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
!47 = metadata !{i32 0}
-!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
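
The hunk above is representative of the whole update: each debug descriptor drops its leading version-tagged i32 (786689 appears to be (LLVMDebugVersion 12 << 16) | 0x101, i.e. DW_TAG_arg_variable) and instead packs the tag and the scalar fields into a single "\00"-separated header string, keeping only the node references as operands. A minimal before/after sketch, restating the !0 node from this hunk (!1 is the scope, !2 the file, !9 the type):

; before: version-tagged discrete operands
!0 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 1921, metadata !9, i32 0, null}
; after: tag 0x101, name "a", line 1921 and the trailing scalar packed into one header string
!0 = metadata !{metadata !"0x101\00a\001921\000", metadata !1, metadata !2, metadata !9}

The Debug Info Version module flag is bumped from 1 to 2 at the same time; the IR reader strips debug info whose version does not match the current DEBUG_METADATA_VERSION, so without the bump these tests would silently lose their metadata on load.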
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index e11b538..09120a1 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-darwin10"
define i8* @bar(%struct.a* %myvar) nounwind optsize noinline ssp {
entry:
- tail call void @llvm.dbg.value(metadata !{%struct.a* %myvar}, i64 0, metadata !8)
+ tail call void @llvm.dbg.value(metadata !{%struct.a* %myvar}, i64 0, metadata !8, metadata !{metadata !"0x102"})
%0 = getelementptr inbounds %struct.a* %myvar, i64 0, i32 0, !dbg !28 ; <i32*> [#uses=1]
%1 = load i32* %0, align 8, !dbg !28 ; <i32> [#uses=1]
tail call void @foo(i32 %1) nounwind optsize noinline ssp, !dbg !28
@@ -19,41 +19,41 @@ entry:
declare void @foo(i32) nounwind optsize noinline ssp
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!38}
-!0 = metadata !{i32 786484, i32 0, metadata !1, metadata !"ret", metadata !"ret", metadata !"", metadata !1, i32 7, metadata !3, i1 false, i1 true, null, null} ; [ DW_TAG_variable ]
-!1 = metadata !{i32 786473, metadata !36} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !36, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !37, metadata !37, metadata !32, metadata !31, metadata !37, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!4 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !1, i32 12, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
-!5 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, void (i32)* @foo, null, null, metadata !33, i32 13} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x34\00ret\00ret\00\007\000\001", metadata !1, metadata !1, metadata !3, null, null} ; [ DW_TAG_variable ]
+!1 = metadata !{metadata !"0x29", metadata !36} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !36, metadata !37, metadata !37, metadata !32, metadata !31, metadata !37} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !36, metadata !1} ; [ DW_TAG_base_type ]
+!4 = metadata !{metadata !"0x101\00x\0012\000", metadata !5, metadata !1, metadata !3} ; [ DW_TAG_arg_variable ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0013\000\001\000\006\000\001\0013", metadata !36, metadata !1, metadata !6, null, void (i32)* @foo, null, null, metadata !33} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!7 = metadata !{null, metadata !3}
-!8 = metadata !{i32 786689, metadata !9, metadata !"myvar", metadata !1, i32 17, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 17, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i8* (%struct.a*)* @bar, null, null, metadata !34, i32 17} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x101\00myvar\0017\000", metadata !9, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x2e\00bar\00bar\00bar\0017\000\001\000\006\000\001\0017", metadata !36, metadata !1, metadata !10, null, i8* (%struct.a*)* @bar, null, null, metadata !34} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786451, metadata !36, metadata !1, metadata !"a", i32 2, i64 128, i64 64, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, null} ; [ DW_TAG_pointer_type ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !14} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{metadata !"0x13\00a\002\00128\0064\000\000\000", metadata !36, metadata !1, null, metadata !15, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
!15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"c", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !3} ; [ DW_TAG_member ]
-!17 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"d", i32 4, i64 64, i64 64, i64 64, i32 0, metadata !13} ; [ DW_TAG_member ]
-!18 = metadata !{i32 786689, metadata !19, metadata !"argc", metadata !1, i32 22, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 22, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, metadata !35, i32 22} ; [ DW_TAG_subprogram ]
-!20 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0xd\00c\003\0032\0032\000\000", metadata !36, metadata !14, metadata !3} ; [ DW_TAG_member ]
+!17 = metadata !{metadata !"0xd\00d\004\0064\0064\0064\000", metadata !36, metadata !14, metadata !13} ; [ DW_TAG_member ]
+!18 = metadata !{metadata !"0x101\00argc\0022\000", metadata !19, metadata !1, metadata !3} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x2e\00main\00main\00main\0022\000\001\000\006\000\001\0022", metadata !36, metadata !1, metadata !20, null, null, null, null, metadata !35} ; [ DW_TAG_subprogram ]
+!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!21 = metadata !{metadata !3, metadata !3, metadata !22}
-!22 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ]
-!24 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!25 = metadata !{i32 786689, metadata !19, metadata !"argv", metadata !1, i32 22, metadata !22, i32 0, null} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{i32 786688, metadata !27, metadata !"e", metadata !1, i32 23, metadata !14, i32 0, null} ; [ DW_TAG_auto_variable ]
-!27 = metadata !{i32 786443, metadata !36, metadata !19, i32 22, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !24} ; [ DW_TAG_pointer_type ]
+!24 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !36, metadata !1} ; [ DW_TAG_base_type ]
+!25 = metadata !{metadata !"0x101\00argv\0022\000", metadata !19, metadata !1, metadata !22} ; [ DW_TAG_arg_variable ]
+!26 = metadata !{metadata !"0x100\00e\0023\000", metadata !27, metadata !1, metadata !14} ; [ DW_TAG_auto_variable ]
+!27 = metadata !{metadata !"0xb\0022\000\000", metadata !36, metadata !19} ; [ DW_TAG_lexical_block ]
!28 = metadata !{i32 18, i32 0, metadata !29, null}
-!29 = metadata !{i32 786443, metadata !36, metadata !9, i32 17, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
+!29 = metadata !{metadata !"0xb\0017\000\001", metadata !36, metadata !9} ; [ DW_TAG_lexical_block ]
!30 = metadata !{i32 19, i32 0, metadata !29, null}
!31 = metadata !{metadata !0}
!32 = metadata !{metadata !5, metadata !9, metadata !19}
@@ -73,18 +73,22 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
; CHECK: Ldebug_loc0:
-; CHECK-NEXT: .quad Lfunc_begin0
-; CHECK-NEXT: .quad [[LABEL]]
+; CHECK-NEXT: [[SET1:.*]] = Lfunc_begin0-Lfunc_begin0
+; CHECK-NEXT: .quad [[SET1]]
+; CHECK-NEXT: [[SET2:.*]] = [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: .quad [[SET2]]
; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}} ## Loc expr size
; CHECK-NEXT: .short Lset{{.*}}
; CHECK-NEXT: Ltmp{{.*}}:
; CHECK-NEXT: .byte 85
; CHECK-NEXT: Ltmp{{.*}}:
-; CHECK-NEXT: .quad [[LABEL]]
-; CHECK-NEXT: .quad [[CLOBBER]]
+; CHECK-NEXT: [[SET3:.*]] = [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: .quad [[SET3]]
+; CHECK-NEXT: [[SET4:.*]] = [[CLOBBER]]-Lfunc_begin0
+; CHECK-NEXT: .quad [[SET4]]
; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}} ## Loc expr size
; CHECK-NEXT: .short Lset{{.*}}
; CHECK-NEXT: Ltmp{{.*}}:
; CHECK-NEXT: .byte 83
; CHECK-NEXT: Ltmp{{.*}}:
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
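
Besides the metadata rewrite, this file's CHECK lines change shape: the .debug_loc bounds are no longer matched as absolute .quad addresses but as assembler label differences against Lfunc_begin0, matching the emitter's switch to function-entry-relative offsets (apparently so the entries need no relocations). A sketch of the assembly the new CHECKs expect, with Ltmp3/Ltmp4 as placeholder labels:

Lset0 = Ltmp3-Lfunc_begin0      ## range start, relative to the function entry
	.quad Lset0
Lset1 = Ltmp4-Lfunc_begin0      ## range end
	.quad Lset1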
diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index 1114c8d..b0a4e8d 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll
@@ -4,19 +4,19 @@
define i32 @foo(i32 %y) nounwind optsize ssp {
entry:
- tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !0)
+ tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !0, metadata !{metadata !"0x102"})
%0 = tail call i32 (...)* @zoo(i32 %y) nounwind, !dbg !9 ; <i32> [#uses=1]
ret i32 %0, !dbg !9
}
declare i32 @zoo(...)
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
define i32 @bar(i32 %x) nounwind optsize ssp {
entry:
- tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !7)
- tail call void @llvm.dbg.value(metadata !11, i64 0, metadata !0) nounwind
+ tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !7, metadata !{metadata !"0x102"})
+ tail call void @llvm.dbg.value(metadata !11, i64 0, metadata !0, metadata !{metadata !"0x102"}) nounwind
%0 = tail call i32 (...)* @zoo(i32 1) nounwind, !dbg !12 ; <i32> [#uses=1]
%1 = add nsw i32 %0, %x, !dbg !13 ; <i32> [#uses=1]
ret i32 %1, !dbg !13
@@ -25,21 +25,21 @@ entry:
!llvm.dbg.cu = !{!3}
!llvm.module.flags = !{!20}
-!0 = metadata !{i32 786689, metadata !1, metadata !"y", metadata !2, i32 2, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @foo, null, null, metadata !15, i32 2} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !18, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00y\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\001\002", metadata !18, metadata !2, metadata !4, null, i32 (i32)* @foo, null, null, metadata !15} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !18, metadata !19, metadata !19, metadata !17, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!5 = metadata !{metadata !6, metadata !6}
-!6 = metadata !{i32 786468, metadata !18, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !2, i32 6, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"bar", metadata !"bar", metadata !"bar", i32 6, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @bar, null, null, metadata !16, i32 6} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !18, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x101\00x\006\000", metadata !8, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{metadata !"0x2e\00bar\00bar\00bar\006\000\001\000\006\000\001\006", metadata !18, metadata !2, metadata !4, null, i32 (i32)* @bar, null, null, metadata !16} ; [ DW_TAG_subprogram ]
!9 = metadata !{i32 3, i32 0, metadata !10, null}
-!10 = metadata !{i32 786443, metadata !18, metadata !1, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0xb\002\000\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
!11 = metadata !{i32 1}
!12 = metadata !{i32 3, i32 0, metadata !10, metadata !13}
!13 = metadata !{i32 7, i32 0, metadata !14, null}
-!14 = metadata !{i32 786443, metadata !18, metadata !8, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\006\000\000", metadata !18, metadata !8} ; [ DW_TAG_lexical_block ]
!15 = metadata !{metadata !0}
!16 = metadata !{metadata !7}
!17 = metadata !{metadata !1, metadata !8}
@@ -49,4 +49,4 @@ entry:
;CHECK: DEBUG_VALUE: bar:x <- E
;CHECK: Ltmp
;CHECK: DEBUG_VALUE: foo:y <- 1{{$}}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
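
2010-05-28-Crash.ll shows the other recurring change: llvm.dbg.value grows a fourth operand carrying the complex address expression, which this rebase splits out of the variable descriptor into a node of its own. metadata !{metadata !"0x102"} is the empty expression, 0x102 being LLVM's own DW_TAG_expression. A sketch of the updated declaration and a call site as it appears inside the rewritten functions:

; old: declare void @llvm.dbg.value(metadata, i64, metadata)
; new: value, offset, variable, and now an explicit expression operand
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone

; every rewritten call passes an (empty) expression node
tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !7, metadata !{metadata !"0x102"})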
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index 4181c26..dea9162 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -10,51 +10,51 @@ target triple = "x86_64-apple-darwin10.2"
define i32 @_ZN3foo3bazEi(%struct.foo* nocapture %this, i32 %x) nounwind readnone optsize noinline ssp align 2 {
;CHECK: DEBUG_VALUE: baz:this <- RDI{{$}}
entry:
- tail call void @llvm.dbg.value(metadata !{%struct.foo* %this}, i64 0, metadata !15)
- tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !16)
+ tail call void @llvm.dbg.value(metadata !{%struct.foo* %this}, i64 0, metadata !15, metadata !{metadata !"0x102"})
+ tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !16, metadata !{metadata !"0x102"})
%0 = mul nsw i32 %x, 7, !dbg !29 ; <i32> [#uses=1]
%1 = add nsw i32 %0, 1, !dbg !29 ; <i32> [#uses=1]
ret i32 %1, !dbg !29
}
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
!llvm.dbg.cu = !{!4}
!llvm.module.flags = !{!34}
!llvm.dbg.lv = !{!0, !14, !15, !16, !17, !24, !25, !28}
-!0 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !3, i32 11, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEi", i32 11, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* null, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786451, metadata !31, metadata !3, metadata !"foo", i32 3, i64 32, i64 32, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
-!3 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
-!4 = metadata !{i32 786449, metadata !31, i32 4, metadata !"4.2.1 LLVM build", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !33, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x101\00this\0011\000", metadata !1, metadata !3, metadata !12} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEi\0011\000\001\000\006\000\001\0011", metadata !31, metadata !2, metadata !9, null, i32 (%struct.foo*, i32)* null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x13\00foo\003\0032\0032\000\000\000", metadata !31, metadata !3, null, metadata !5, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
+!3 = metadata !{metadata !"0x29", metadata !31} ; [ DW_TAG_file_type ]
+!4 = metadata !{metadata !"0x11\004\004.2.1 LLVM build\001\00\000\00\000", metadata !31, metadata !32, metadata !32, metadata !33, null, null} ; [ DW_TAG_compile_unit ]
!5 = metadata !{metadata !6, metadata !1, metadata !8}
-!6 = metadata !{i32 786445, metadata !31, metadata !2, metadata !"y", i32 8, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ]
-!7 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"baz", metadata !"baz", metadata !"_ZN3foo3bazEi", i32 15, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null, i32 15} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0xd\00y\008\0032\0032\000\000", metadata !31, metadata !2, metadata !7} ; [ DW_TAG_member ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !31, metadata !3} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x2e\00baz\00baz\00_ZN3foo3bazEi\0015\000\001\000\006\000\001\0015", metadata !31, metadata !2, metadata !9, null, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null} ; [ DW_TAG_subprogram ]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !3, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!10 = metadata !{metadata !7, metadata !11, metadata !7}
-!11 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !2} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 786470, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !13} ; [ DW_TAG_const_type ]
-!13 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !3, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 786689, metadata !8, metadata !"this", metadata !3, i32 15, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !3, i32 15, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786689, metadata !18, metadata !"argc", metadata !3, i32 19, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 786478, metadata !31, metadata !3, metadata !"main", metadata !"main", metadata !"main", i32 19, metadata !19, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, null, null, null, null, i32 19} ; [ DW_TAG_subprogram ]
-!19 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !31, metadata !3, metadata !2} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !31, metadata !3, metadata !13} ; [ DW_TAG_const_type ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !2} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{metadata !"0x101\00x\0011\000", metadata !1, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{metadata !"0x101\00this\0015\000", metadata !8, metadata !3, metadata !12} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x101\00x\0015\000", metadata !8, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x101\00argc\0019\000", metadata !18, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{metadata !"0x2e\00main\00main\00main\0019\000\001\000\006\000\001\0019", metadata !31, metadata !3, metadata !19, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !3, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!20 = metadata !{metadata !7, metadata !7, metadata !21}
-!21 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
-!22 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!24 = metadata !{i32 786689, metadata !18, metadata !"argv", metadata !3, i32 19, metadata !21, i32 0, null} ; [ DW_TAG_arg_variable ]
-!25 = metadata !{i32 786688, metadata !26, metadata !"a", metadata !3, i32 20, metadata !2, i32 0, null} ; [ DW_TAG_auto_variable ]
-!26 = metadata !{i32 786443, metadata !31, metadata !27, i32 19, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{i32 786443, metadata !31, metadata !18, i32 19, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!28 = metadata !{i32 786688, metadata !26, metadata !"b", metadata !3, i32 21, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
+!21 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !22} ; [ DW_TAG_pointer_type ]
+!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !31, metadata !3} ; [ DW_TAG_base_type ]
+!24 = metadata !{metadata !"0x101\00argv\0019\000", metadata !18, metadata !3, metadata !21} ; [ DW_TAG_arg_variable ]
+!25 = metadata !{metadata !"0x100\00a\0020\000", metadata !26, metadata !3, metadata !2} ; [ DW_TAG_auto_variable ]
+!26 = metadata !{metadata !"0xb\0019\000\000", metadata !31, metadata !27} ; [ DW_TAG_lexical_block ]
+!27 = metadata !{metadata !"0xb\0019\000\000", metadata !31, metadata !18} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0x100\00b\0021\000", metadata !26, metadata !3, metadata !7} ; [ DW_TAG_auto_variable ]
!29 = metadata !{i32 16, i32 0, metadata !30, null}
-!30 = metadata !{i32 786443, metadata !31, metadata !8, i32 15, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!30 = metadata !{metadata !"0xb\0015\000\000", metadata !31, metadata !8} ; [ DW_TAG_lexical_block ]
!31 = metadata !{metadata !"foo.cp", metadata !"/tmp/"}
!32 = metadata !{i32 0}
!33 = metadata !{metadata !1, metadata !8, metadata !18}
-!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
index b49aec3..9d65dc1 100644
--- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll
+++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
@@ -3,29 +3,29 @@
@.str = private constant [4 x i8] c"one\00", align 1 ; <[4 x i8]*> [#uses=1]
@.str1 = private constant [4 x i8] c"two\00", align 1 ; <[4 x i8]*> [#uses=1]
@C.9.2167 = internal constant [2 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0)]
-!38 = metadata !{i32 524329, metadata !109} ; [ DW_TAG_file_type ]
-!39 = metadata !{i32 524305, metadata !109, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", i1 true, metadata !"", i32 0, metadata !108, metadata !108, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!46 = metadata !{i32 524303, metadata !109, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !47} ; [ DW_TAG_pointer_type ]
-!47 = metadata !{i32 524324, metadata !109, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!97 = metadata !{i32 524334, i32 0, metadata !39, metadata !"main", metadata !"main", metadata !"main", i32 73, metadata !98, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!98 = metadata !{i32 524309, metadata !109, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !99, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!38 = metadata !{metadata !"0x29", metadata !109} ; [ DW_TAG_file_type ]
+!39 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", metadata !109, metadata !108, metadata !108, null, null, null} ; [ DW_TAG_compile_unit ]
+!46 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !109, null, metadata !47} ; [ DW_TAG_pointer_type ]
+!47 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !109, null} ; [ DW_TAG_base_type ]
+!97 = metadata !{metadata !"0x2e\00main\00main\00main\0073\000\001\000\006\000\000\000", i32 0, metadata !39, metadata !98, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!98 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !109, null, null, metadata !99, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!99 = metadata !{metadata !100}
-!100 = metadata !{i32 524324, metadata !109, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!100 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !109, null} ; [ DW_TAG_base_type ]
!101 = metadata !{[2 x i8*]* @C.9.2167}
-!102 = metadata !{i32 524544, metadata !103, metadata !"find_strings", metadata !38, i32 75, metadata !104, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!103 = metadata !{i32 524299, null, metadata !97, i32 73, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!104 = metadata !{i32 524289, metadata !109, null, metadata !"", i32 0, i64 85312, i64 64, i64 0, i32 0, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
+!102 = metadata !{metadata !"0x100\00find_strings\0075\000", metadata !103, metadata !38, metadata !104} ; [ DW_TAG_auto_variable ]
+!103 = metadata !{metadata !"0xb\0073\000\000", null, metadata !97} ; [ DW_TAG_lexical_block ]
+!104 = metadata !{metadata !"0x1\00\000\0085312\0064\000\000", metadata !109, null, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
!105 = metadata !{metadata !106}
-!106 = metadata !{i32 524321, i64 0, i64 1333} ; [ DW_TAG_subrange_type ]
+!106 = metadata !{metadata !"0x21\000\001333"} ; [ DW_TAG_subrange_type ]
!107 = metadata !{i32 73, i32 0, metadata !103, null}
!108 = metadata !{i32 0}
!109 = metadata !{metadata !"pbmsrch.c", metadata !"/Users/grawp/LLVM/test-suite/MultiSource/Benchmarks/MiBench/office-stringsearch"}
define i32 @main() nounwind ssp {
bb.nph:
- tail call void @llvm.dbg.declare(metadata !101, metadata !102), !dbg !107
+ tail call void @llvm.dbg.declare(metadata !101, metadata !102, metadata !{metadata !"0x102"}), !dbg !107
ret i32 0, !dbg !107
}
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
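
llvm.dbg.declare gets the same treatment here, going from two operands (address, variable) to three; a sketch under the same caveats as above:

; old: declare void @llvm.dbg.declare(metadata, metadata)
declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
tail call void @llvm.dbg.declare(metadata !101, metadata !102, metadata !{metadata !"0x102"}), !dbg !107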
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index 09e34ef..a613939 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -6,8 +6,8 @@
define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp {
entry:
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
- call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23), !dbg !24
- call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25), !dbg !24
+ call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
+ call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !24
%0 = icmp ne i32 %i, 0, !dbg !27 ; <i1> [#uses=1]
br i1 %0, label %bb, label %bb1, !dbg !27
@@ -34,7 +34,7 @@ return: ; preds = %bb2
define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2 {
entry:
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
- call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31), !dbg !34
+ call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !34
%0 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 0, !dbg !34 ; <i8**> [#uses=1]
store i8* null, i8** %0, align 8, !dbg !34
%1 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 1, !dbg !34 ; <i32*> [#uses=1]
@@ -45,14 +45,14 @@ return: ; preds = %entry
ret void, !dbg !35
}
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
define i32 @main() nounwind ssp {
entry:
%0 = alloca %struct.SVal ; <%struct.SVal*> [#uses=3]
%v = alloca %struct.SVal ; <%struct.SVal*> [#uses=4]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
- call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38), !dbg !41
+ call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38, metadata !{metadata !"0x102"}), !dbg !41
call void @_ZN4SValC1Ev(%struct.SVal* %v) nounwind, !dbg !41
%1 = getelementptr inbounds %struct.SVal* %v, i32 0, i32 1, !dbg !42 ; <i32*> [#uses=1]
store i32 1, i32* %1, align 8, !dbg !42
@@ -65,65 +65,65 @@ entry:
%7 = load i32* %6, align 8, !dbg !43 ; <i32> [#uses=1]
store i32 %7, i32* %5, align 8, !dbg !43
%8 = call i32 @_Z3fooi4SVal(i32 2, %struct.SVal* noalias %0) nounwind, !dbg !43 ; <i32> [#uses=0]
- call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44), !dbg !43
+ call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44, metadata !{metadata !"0x102"}), !dbg !43
br label %return, !dbg !45
return: ; preds = %entry
ret i32 0, !dbg !45
}
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
!llvm.dbg.cu = !{!3}
!llvm.module.flags = !{!49}
!46 = metadata !{metadata !16, metadata !17, metadata !20}
-!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786451, metadata !47, metadata !2, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
-!2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !47, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !48, metadata !48, metadata !46, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0011\000\000\000\006\000\000\0011", metadata !47, metadata !1, metadata !14, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x13\00SVal\001\00128\0064\000\000\000", metadata !47, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
+!2 = metadata !{metadata !"0x29", metadata !47} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\001", metadata !47, metadata !48, metadata !48, metadata !46, null, null} ; [ DW_TAG_compile_unit ]
!4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9}
-!5 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Data", i32 7, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!6 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Kind", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ]
-!8 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 12, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 12} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0xd\00Data\007\0064\0064\000\000", metadata !47, metadata !1, metadata !6} ; [ DW_TAG_member ]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !47, metadata !2, null} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0xd\00Kind\008\0032\0032\0064\000", metadata !47, metadata !1, metadata !8} ; [ DW_TAG_member ]
+!8 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !47, metadata !2} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00\0012\000\000\000\006\000\000\0012", metadata !47, metadata !1, metadata !10, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!11 = metadata !{null, metadata !12, metadata !13}
-!12 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !47, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!15 = metadata !{null, metadata !12}
-!16 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"_ZN4SValC1Ev", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3fooi4SVal", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null, i32 16} ; [ DW_TAG_subprogram ]
-!18 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0x2e\00SVal\00SVal\00_ZN4SValC1Ev\0011\000\001\000\006\000\000\0011", metadata !47, metadata !1, metadata !14, null, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi4SVal\0016\000\001\000\006\000\000\0016", metadata !47, metadata !2, metadata !18, null, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null} ; [ DW_TAG_subprogram ]
+!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!19 = metadata !{metadata !13, metadata !13, metadata !1}
-!20 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"main", metadata !"main", metadata !"main", i32 23, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @main, null, null, null, i32 23} ; [ DW_TAG_subprogram ]
-!21 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = metadata !{metadata !"0x2e\00main\00main\00main\0023\000\001\000\006\000\000\0023", metadata !47, metadata !2, metadata !21, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!22 = metadata !{metadata !13}
-!23 = metadata !{i32 786689, metadata !17, metadata !"i", metadata !2, i32 16, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
+!23 = metadata !{metadata !"0x101\00i\0016\000", metadata !17, metadata !2, metadata !13} ; [ DW_TAG_arg_variable ]
!24 = metadata !{i32 16, i32 0, metadata !17, null}
-!25 = metadata !{i32 786689, metadata !17, metadata !"location", metadata !2, i32 16, metadata !26, i32 0, null} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{i32 786448, metadata !47, metadata !2, metadata !"SVal", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ]
+!25 = metadata !{metadata !"0x101\00location\0016\000", metadata !17, metadata !2, metadata !26} ; [ DW_TAG_arg_variable ]
+!26 = metadata !{metadata !"0x10\00SVal\000\0064\0064\000\000", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_reference_type ]
!27 = metadata !{i32 17, i32 0, metadata !28, null}
-!28 = metadata !{i32 786443, metadata !47, metadata !17, i32 16, i32 0, i32 2} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0xb\0016\000\002", metadata !47, metadata !17} ; [ DW_TAG_lexical_block ]
!29 = metadata !{i32 18, i32 0, metadata !28, null}
!30 = metadata !{i32 20, i32 0, metadata !28, null}
-!31 = metadata !{i32 786689, metadata !16, metadata !"this", metadata !2, i32 11, metadata !32, i32 0, null} ; [ DW_TAG_arg_variable ]
-!32 = metadata !{i32 786470, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !33} ; [ DW_TAG_const_type ]
-!33 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_pointer_type ]
+!31 = metadata !{metadata !"0x101\00this\0011\000", metadata !16, metadata !2, metadata !32} ; [ DW_TAG_arg_variable ]
+!32 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !47, metadata !2, metadata !33} ; [ DW_TAG_const_type ]
+!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ]
!34 = metadata !{i32 11, i32 0, metadata !16, null}
!35 = metadata !{i32 11, i32 0, metadata !36, null}
-!36 = metadata !{i32 786443, metadata !47, metadata !37, i32 11, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
-!37 = metadata !{i32 786443, metadata !47, metadata !16, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!38 = metadata !{i32 786688, metadata !39, metadata !"v", metadata !2, i32 24, metadata !1, i32 0, null} ; [ DW_TAG_auto_variable ]
-!39 = metadata !{i32 786443, metadata !47, metadata !40, i32 23, i32 0, i32 4} ; [ DW_TAG_lexical_block ]
-!40 = metadata !{i32 786443, metadata !47, metadata !20, i32 23, i32 0, i32 3} ; [ DW_TAG_lexical_block ]
+!36 = metadata !{metadata !"0xb\0011\000\001", metadata !47, metadata !37} ; [ DW_TAG_lexical_block ]
+!37 = metadata !{metadata !"0xb\0011\000\000", metadata !47, metadata !16} ; [ DW_TAG_lexical_block ]
+!38 = metadata !{metadata !"0x100\00v\0024\000", metadata !39, metadata !2, metadata !1} ; [ DW_TAG_auto_variable ]
+!39 = metadata !{metadata !"0xb\0023\000\004", metadata !47, metadata !40} ; [ DW_TAG_lexical_block ]
+!40 = metadata !{metadata !"0xb\0023\000\003", metadata !47, metadata !20} ; [ DW_TAG_lexical_block ]
!41 = metadata !{i32 24, i32 0, metadata !39, null}
!42 = metadata !{i32 25, i32 0, metadata !39, null}
!43 = metadata !{i32 26, i32 0, metadata !39, null}
-!44 = metadata !{i32 786688, metadata !39, metadata !"k", metadata !2, i32 26, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ]
+!44 = metadata !{metadata !"0x100\00k\0026\000", metadata !39, metadata !2, metadata !13} ; [ DW_TAG_auto_variable ]
!45 = metadata !{i32 27, i32 0, metadata !39, null}
!47 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
!48 = metadata !{i32 0}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index a65b632..f52e922 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
@@ -15,21 +15,21 @@ entry:
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!17}
-!0 = metadata !{i32 786478, metadata !14, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 53, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !15, i32 12, metadata !"clang version 2.9 (trunk 114084)", i1 false, metadata !"", i32 0, metadata !16, metadata !16, metadata !13, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0053\000\001\000\006\000\000\000", metadata !14, metadata !1, metadata !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !14} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 114084)\000\00\000\00\000", metadata !15, metadata !16, metadata !16, metadata !13, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !14, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, metadata !14, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !15, metadata !7, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !14, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", metadata !15, metadata !7, metadata !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = metadata !{metadata !"0x29", metadata !15} ; [ DW_TAG_file_type ]
!8 = metadata !{i32 53, i32 13, metadata !9, null}
-!9 = metadata !{i32 786443, metadata !14, metadata !0, i32 53, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
+!9 = metadata !{metadata !"0xb\0053\0011\000", metadata !14, metadata !0} ; [ DW_TAG_lexical_block ]
!10 = metadata !{i32 4, i32 13, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !15, metadata !12, i32 4, i32 13, i32 2} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 786443, metadata !15, metadata !6, i32 4, i32 11, i32 1} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{metadata !"0xb\004\0013\002", metadata !15, metadata !12} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{metadata !"0xb\004\0011\001", metadata !15, metadata !6} ; [ DW_TAG_lexical_block ]
!13 = metadata !{metadata !0, metadata !6}
!14 = metadata !{metadata !"", metadata !"/private/tmp"}
!15 = metadata !{metadata !"bug.c", metadata !"/private/tmp"}
!16 = metadata !{i32 0}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
index 21ac7c9..53fb0af 100644
--- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll
+++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
@@ -9,32 +9,32 @@ target triple = "i386-apple-darwin11.0.0"
define i32 @foo(%struct.bar* nocapture %i) nounwind readnone optsize noinline ssp {
; CHECK: TAG_formal_parameter
entry:
- tail call void @llvm.dbg.value(metadata !{%struct.bar* %i}, i64 0, metadata !6), !dbg !12
+ tail call void @llvm.dbg.value(metadata !{%struct.bar* %i}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12
ret i32 1, !dbg !13
}
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!19}
-!0 = metadata !{i32 786478, metadata !17, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.bar*)* @foo, null, null, metadata !16, i32 3} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !17} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !17, i32 12, metadata !"clang version 2.9 (trunk 117922)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, metadata !15, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !17, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\00256\001\003", metadata !17, metadata !1, metadata !3, null, i32 (%struct.bar*)* @foo, null, null, metadata !16} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 117922)\001\00\000\00\000", metadata !17, metadata !18, metadata !18, metadata !15, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !17, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, metadata !17, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786689, metadata !0, metadata !"i", metadata !1, i32 3, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 786447, metadata !17, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 786451, metadata !17, metadata !1, metadata !"bar", i32 2, i64 64, i64 32, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !17, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x101\00i\003\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!7 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !17, metadata !1, metadata !8} ; [ DW_TAG_pointer_type ]
+!8 = metadata !{metadata !"0x13\00bar\002\0064\0032\000\000\000", metadata !17, metadata !1, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ]
!9 = metadata !{metadata !10, metadata !11}
-!10 = metadata !{i32 786445, metadata !17, metadata !1, metadata !"x", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
-!11 = metadata !{i32 786445, metadata !17, metadata !1, metadata !"y", i32 2, i64 32, i64 32, i64 32, i32 0, metadata !5} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0xd\00x\002\0032\0032\000\000", metadata !17, metadata !1, metadata !5} ; [ DW_TAG_member ]
+!11 = metadata !{metadata !"0xd\00y\002\0032\0032\0032\000", metadata !17, metadata !1, metadata !5} ; [ DW_TAG_member ]
!12 = metadata !{i32 3, i32 47, metadata !0, null}
!13 = metadata !{i32 4, i32 2, metadata !14, null}
-!14 = metadata !{i32 786443, metadata !17, metadata !0, i32 3, i32 50, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\003\0050\000", metadata !17, metadata !0} ; [ DW_TAG_lexical_block ]
!15 = metadata !{metadata !0}
!16 = metadata !{metadata !6}
!17 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
!18 = metadata !{i32 0}
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index 625a351..ac7fbf2 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -22,8 +22,8 @@ target triple = "x86_64-apple-darwin10.0.0"
define i64 @gcd(i64 %a, i64 %b) nounwind readnone optsize noinline ssp {
entry:
- tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !10), !dbg !18
- tail call void @llvm.dbg.value(metadata !{i64 %b}, i64 0, metadata !11), !dbg !19
+ tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !18
+ tail call void @llvm.dbg.value(metadata !{i64 %b}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !19
br label %while.body, !dbg !20
while.body: ; preds = %while.body, %entry
@@ -34,14 +34,14 @@ while.body: ; preds = %while.body, %entry
br i1 %cmp, label %if.then, label %while.body, !dbg !23
if.then: ; preds = %while.body
- tail call void @llvm.dbg.value(metadata !{i64 %rem}, i64 0, metadata !12), !dbg !21
+ tail call void @llvm.dbg.value(metadata !{i64 %rem}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !21
ret i64 %b.addr.0, !dbg !23
}
define i32 @main() nounwind optsize ssp {
entry:
%call = tail call i32 @rand() nounwind optsize, !dbg !24
- tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !14), !dbg !24
+ tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !24
%cmp = icmp ugt i32 %call, 21, !dbg !25
br i1 %cmp, label %cond.true, label %cond.end, !dbg !25
@@ -51,7 +51,7 @@ cond.true: ; preds = %entry
cond.end: ; preds = %entry, %cond.true
%cond = phi i32 [ %call1, %cond.true ], [ %call, %entry ], !dbg !25
- tail call void @llvm.dbg.value(metadata !{i32 %cond}, i64 0, metadata !17), !dbg !25
+ tail call void @llvm.dbg.value(metadata !{i32 %cond}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !25
%conv = sext i32 %cond to i64, !dbg !26
%conv5 = zext i32 %call to i64, !dbg !26
%call6 = tail call i64 @gcd(i64 %conv, i64 %conv5) optsize, !dbg !26
@@ -71,36 +71,36 @@ declare i32 @rand() optsize
declare i32 @printf(i8* nocapture, ...) nounwind optsize
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
declare i32 @puts(i8* nocapture) nounwind
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!33}
-!0 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"gcd", metadata !"gcd", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i64 (i64, i64)* @gcd, null, null, metadata !29, i32 0} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd]
-!1 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !31, i32 12, metadata !"clang version 2.9 (trunk 124117)", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !28, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00gcd\00gcd\00\005\000\001\000\006\00256\001\000", metadata !31, metadata !1, metadata !3, null, i64 (i64, i64)* @gcd, null, null, metadata !29} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd]
+!1 = metadata !{metadata !"0x29", metadata !31} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 124117)\001\00\000\00\001", metadata !31, metadata !32, metadata !32, metadata !28, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !2, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 25, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @main, null, null, metadata !30, i32 0} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main]
-!7 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00main\00main\00\0025\000\001\000\006\000\001\000", metadata !31, metadata !1, metadata !7, null, i32 ()* @main, null, null, metadata !30} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 786689, metadata !0, metadata !"b", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 786688, metadata !13, metadata !"c", metadata !1, i32 6, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!13 = metadata !{i32 786443, metadata !31, metadata !0, i32 5, i32 52, i32 0} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{i32 786688, metadata !15, metadata !"m", metadata !1, i32 26, metadata !16, i32 0, null} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786443, metadata !31, metadata !6, i32 25, i32 12, i32 2} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!17 = metadata !{i32 786688, metadata !15, metadata !"z_s", metadata !1, i32 27, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x101\00a\005\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{metadata !"0x101\00b\005\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!12 = metadata !{metadata !"0x100\00c\006\000", metadata !13, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!13 = metadata !{metadata !"0xb\005\0052\000", metadata !31, metadata !0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0x100\00m\0026\000", metadata !15, metadata !1, metadata !16} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{metadata !"0xb\0025\0012\002", metadata !31, metadata !6} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !2} ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0x100\00z_s\0027\000", metadata !15, metadata !1, metadata !9} ; [ DW_TAG_auto_variable ]
!18 = metadata !{i32 5, i32 41, metadata !0, null}
!19 = metadata !{i32 5, i32 49, metadata !0, null}
!20 = metadata !{i32 7, i32 5, metadata !13, null}
!21 = metadata !{i32 8, i32 9, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !31, metadata !13, i32 7, i32 14, i32 1} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xb\007\0014\001", metadata !31, metadata !13} ; [ DW_TAG_lexical_block ]
!23 = metadata !{i32 9, i32 9, metadata !22, null}
!24 = metadata !{i32 26, i32 38, metadata !15, null}
!25 = metadata !{i32 27, i32 38, metadata !15, null}
@@ -111,4 +111,4 @@ declare i32 @puts(i8* nocapture) nounwind
!30 = metadata !{metadata !14, metadata !17}
!31 = metadata !{metadata !"rem_small.c", metadata !"/private/tmp"}
!32 = metadata !{i32 0}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2011-08-29-InitOrder.ll b/test/CodeGen/X86/2011-08-29-InitOrder.ll
index a95dcb5..b278ad6 100644
--- a/test/CodeGen/X86/2011-08-29-InitOrder.ll
+++ b/test/CodeGen/X86/2011-08-29-InitOrder.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s --check-prefix=CHECK-DEFAULT
+; RUN: llc < %s -mtriple=i386-linux-gnu -use-ctors | FileCheck %s --check-prefix=CHECK-DEFAULT
; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s --check-prefix=CHECK-DARWIN
; PR5329
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 16706ae..6651af7 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -8,7 +8,7 @@
;CHECK: vpxor
;CHECK: vinserti128
;CHECK: vpshufd
-;CHECK: vpshufd
+;CHECK: vpbroadcastd
;CHECK: vmulps
;CHECK: vmulps
;CHECK: ret
diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
index 1c39c74..519c7ca 100644
--- a/test/CodeGen/X86/2012-07-15-broadcastfold.ll
+++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s
declare x86_fastcallcc i64 @barrier()
diff --git a/test/CodeGen/X86/2012-10-02-DAGCycle.ll b/test/CodeGen/X86/2012-10-02-DAGCycle.ll
index 8d914db..403d21a 100644
--- a/test/CodeGen/X86/2012-10-02-DAGCycle.ll
+++ b/test/CodeGen/X86/2012-10-02-DAGCycle.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s
-; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s
+; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s > /dev/null
+; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s > /dev/null
; rdar://12393897
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index 62ee1e1..1a5efda 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -12,11 +12,11 @@
%struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] }
%struct.bnode.1.28 = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode.1.28*, %struct.bnode.1.28* }
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp {
entry:
- call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4)
+ call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4, metadata !{metadata !"0x102"})
%type = getelementptr inbounds %struct.node.0.27* %p, i64 0, i32 0
%0 = load i16* %type, align 2
%cmp = icmp eq i16 %0, 1
@@ -33,16 +33,20 @@ return: ; preds = %for.cond.preheader,
ret i16 %retval.0
}
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!12}
-!0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", metadata !11, metadata !2, metadata !2, metadata !13, metadata !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
!2 = metadata !{}
-!4 = metadata !{i32 786689, null, metadata !"hg", metadata !5, i32 67109589, metadata !6, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [hg] [line 725]
-!5 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ]
-!6 = metadata !{i32 786454, metadata !11, null, metadata !"hgstruct", i32 492, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 786451, metadata !11, null, metadata !"", i32 487, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x101\00hg\0067109589\000", null, metadata !5, metadata !6} ; [ DW_TAG_arg_variable ] [hg] [line 725]
+!5 = metadata !{metadata !"0x29", metadata !11} ; [ DW_TAG_file_type ]
+!6 = metadata !{metadata !"0x16\00hgstruct\00492\000\000\000\000", metadata !11, null, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x13\00\00487\00512\0064\000\000\000", metadata !11, null, null, null, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ]
!11 = metadata !{metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh"}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!13 = metadata !{metadata !14}
+!14 = metadata !{metadata !"0x2e\00subdivp\00subdivp\00\000\000\001\000\006\00256\001\001", metadata !11, metadata !5, metadata !15, null, i16 (%struct.node.0.27*, double, double, %struct.hgstruct.2.29* )* @subdivp, null, null, null} ; [ DW_TAG_subprogram ] [def] [subdivp]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{null}
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index 36667de..083aacd 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -12,7 +12,7 @@
@.str15 = external hidden unnamed_addr constant [6 x i8], align 1
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
define i32 @AttachGalley(%union.rec** nocapture %suspend_pt) nounwind uwtable ssp {
entry:
@@ -43,7 +43,7 @@ if.then3344:
br label %if.then4073
if.then4073: ; preds = %if.then3344
- call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4)
+ call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4, metadata !{metadata !"0x102"})
%arraydecay4078 = getelementptr inbounds [20 x i8]* %num14075, i64 0, i64 0
%0 = load i32* undef, align 4
%add4093 = add nsw i32 %0, 0
@@ -65,26 +65,31 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!35}
-!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", metadata !19, metadata !2, metadata !2, metadata !20, metadata !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
!1 = metadata !{metadata !2}
!2 = metadata !{}
-!4 = metadata !{i32 786688, metadata !5, metadata !"num1", metadata !14, i32 815, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [num1] [line 815]
-!5 = metadata !{i32 786443, metadata !14, metadata !6, i32 815, i32 0, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!6 = metadata !{i32 786443, metadata !14, metadata !7, i32 812, i32 0, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!7 = metadata !{i32 786443, metadata !14, metadata !8, i32 807, i32 0, i32 175} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!8 = metadata !{i32 786443, metadata !14, metadata !9, i32 440, i32 0, i32 94} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!9 = metadata !{i32 786443, metadata !14, metadata !10, i32 435, i32 0, i32 91} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!10 = metadata !{i32 786443, metadata !14, metadata !11, i32 434, i32 0, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!11 = metadata !{i32 786443, metadata !14, metadata !12, i32 250, i32 0, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!12 = metadata !{i32 786443, metadata !14, metadata !13, i32 249, i32 0, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!13 = metadata !{i32 786443, metadata !14, metadata !2, i32 221, i32 0, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!14 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
-!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!4 = metadata !{metadata !"0x100\00num1\00815\000", metadata !5, metadata !14, metadata !15} ; [ DW_TAG_auto_variable ] [num1] [line 815]
+!5 = metadata !{metadata !"0xb\00815\000\00177", metadata !14, metadata !6} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!6 = metadata !{metadata !"0xb\00812\000\00176", metadata !14, metadata !7} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!7 = metadata !{metadata !"0xb\00807\000\00175", metadata !14, metadata !8} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!8 = metadata !{metadata !"0xb\00440\000\0094", metadata !14, metadata !9} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!9 = metadata !{metadata !"0xb\00435\000\0091", metadata !14, metadata !10} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!10 = metadata !{metadata !"0xb\00434\000\0090", metadata !14, metadata !11} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!11 = metadata !{metadata !"0xb\00250\000\0024", metadata !14, metadata !12} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!12 = metadata !{metadata !"0xb\00249\000\0023", metadata !14, metadata !13} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!13 = metadata !{metadata !"0xb\00221\000\0019", metadata !14, metadata !2} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!14 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
+!15 = metadata !{metadata !"0x1\00\000\00160\008\000\000", null, null, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!16 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
!17 = metadata !{metadata !18}
-!18 = metadata !{i32 786465, i64 0, i64 20} ; [ DW_TAG_subrange_type ] [0, 19]
+!18 = metadata !{metadata !"0x21\000\0020"} ; [ DW_TAG_subrange_type ] [0, 19]
!19 = metadata !{metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset"}
+!20 = metadata !{metadata !21}
+!21 = metadata !{metadata !"0x2e\00AttachGalley\00AttachGalley\00\000\000\001\000\006\00256\001\001", metadata !19, metadata !14, metadata !22, null, i32 (%union.rec**)* @AttachGalley, null, null, null} ; [ DW_TAG_subprogram ] [def] [AttachGalley]
+!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{null}
+
; Test DebugValue uses visited by RegisterPressureTracker findUseBetween().
;
; CHECK: @main
@@ -103,7 +108,7 @@ cond.true: ; preds = %entry
unreachable
cond.end: ; preds = %entry
- call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !31)
+ call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !31, metadata !{metadata !"0x102"})
%_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5
invoke void @_Znwm()
to label %exit.i unwind label %lpad2.i.i.i.i
@@ -129,9 +134,11 @@ declare void @_Znwm()
!llvm.dbg.cu = !{!30}
-!30 = metadata !{i32 786449, metadata !34, i32 4, metadata !"clang version 3.3 (trunk 169129) (llvm/trunk 169135)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus]
-!31 = metadata !{i32 786688, null, metadata !"X", null, i32 29, metadata !32, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [X] [line 29]
-!32 = metadata !{i32 786454, metadata !34, null, metadata !"HM", i32 28, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
-!33 = metadata !{i32 786473, metadata !34} ; [ DW_TAG_file_type ]
+!30 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 169129) (llvm/trunk 169135)\001\00\000\00\000", metadata !34, metadata !2, metadata !2, metadata !36, null, null} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus]
+!31 = metadata !{metadata !"0x100\00X\0029\000", null, null, metadata !32} ; [ DW_TAG_auto_variable ] [X] [line 29]
+!32 = metadata !{metadata !"0x16\00HM\0028\000\000\000\000", metadata !34, null, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
+!33 = metadata !{metadata !"0x29", metadata !34} ; [ DW_TAG_file_type ]
!34 = metadata !{metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++"}
-!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!36 = metadata !{metadata !37}
+!37 = metadata !{metadata !"0x2e\00main\00main\00\000\000\001\000\006\00256\001\001", metadata !19, metadata !14, metadata !22, null, void ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [def] [main]
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
index 5aec3d9..458ce4f 100644
--- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
@@ -9,7 +9,7 @@
%struct.btCompoundLeafCallback = type { i32, i32 }
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
define void @test() unnamed_addr uwtable ssp align 2 {
entry:
@@ -20,7 +20,7 @@ if.then: ; preds = %entry
unreachable
if.end: ; preds = %entry
- call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3)
+ call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3, metadata !{metadata !"0x102"})
%m = getelementptr inbounds %struct.btCompoundLeafCallback* %callback, i64 0, i32 1
store i32 0, i32* undef, align 8
%cmp12447 = icmp sgt i32 undef, 0
@@ -36,11 +36,13 @@ invoke.cont44: ; preds = %if.end
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8}
-!0 = metadata !{i32 786449, metadata !6, i32 4, metadata !"clang version 3.3 (trunk 168984) (llvm/trunk 168983)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
-!2 = metadata !{null}
-!3 = metadata !{i32 786688, null, metadata !"callback", null, i32 214, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [callback] [line 214]
-!4 = metadata !{i32 786451, metadata !6, null, metadata !"btCompoundLeafCallback", i32 90, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ]
-!5 = metadata !{i32 786473, metadata !6} ; [ DW_TAG_file_type ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 168984) (llvm/trunk 168983)\001\00\000\00\000", metadata !6, null, null, metadata !1, null, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{metadata !"0x2e\00test\00test\00\000\000\001\000\006\00256\001\001", metadata !6, metadata !5, metadata !7, null, void ()* @test, null, null, null} ; [ DW_TAG_subprogram ] [def] [test]
+!3 = metadata !{metadata !"0x100\00callback\00214\000", null, null, metadata !4} ; [ DW_TAG_auto_variable ] [callback] [line 214]
+!4 = metadata !{metadata !"0x13\00btCompoundLeafCallback\0090\00512\0064\000\000\000", metadata !6, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x29", metadata !6} ; [ DW_TAG_file_type ]
!6 = metadata !{metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet"}
-!7 = metadata !{i32 0}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!9 = metadata !{null}
diff --git a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
index bbba796..10dc927 100644
--- a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
+++ b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
@@ -6,7 +6,7 @@
; we may reference variables that were not live across basic blocks
; resulting in undefined virtual registers.
;
-; In this example, this is illustrated by a the spill/reload of the
+; In this example, this is illustrated by a spill/reload of the
; LOADED_PTR_SLOT.
;
; Before this patch, the compiler was accessing two different spill
diff --git a/test/CodeGen/X86/2014-08-29-CompactUnwind.ll b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll
new file mode 100644
index 0000000..f65d7c9
--- /dev/null
+++ b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 -filetype=obj -o - | llvm-objdump -d -unwind-info -s - | FileCheck %s
+; Regression test for http://llvm.org/bugs/show_bug.cgi?id=20800.
+
+; ModuleID = 'asan_report.ii'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+@.str = private unnamed_addr constant [3 x i8] c"=>\00", align 1
+@.str1 = private unnamed_addr constant [3 x i8] c" \00", align 1
+@.str2 = private unnamed_addr constant [6 x i8] c"%s%p:\00", align 1
+
+; CHECK: ___asan_report_error:
+
+; The subq instruction starts at 0x0a, so the second byte of the compact encoding
+; (UNWIND_X86_64_FRAMELESS_STACK_SIZE in mach-o/compact_unwind_encoding.h)
+; must be 0x0d.
+; CHECK: {{a:.*subq.*%rsp}}
+
+; CHECK: Contents of __compact_unwind section
+; CHECK: ___asan_report_error
+
+; Because of an incorrect push instruction size in X86AsmBackend.cpp, the stack
+; size was also calculated incorrectly.
+; CHECK-NOT: {{compact encoding:.*0x0309f800}}
+; CHECK: {{compact encoding:.*0x030df800}}
+
+define void @__asan_report_error() #0 {
+ %str.i = alloca i64, align 8
+ %stack = alloca [256 x i64], align 8
+ br label %print_shadow_bytes.exit.i
+
+print_shadow_bytes.exit.i: ; preds = %print_shadow_bytes.exit.i, %0
+ %iv.i = phi i64 [ -5, %0 ], [ %iv.next.i, %print_shadow_bytes.exit.i ]
+ %reg15 = icmp eq i64 %iv.i, 0
+ %.str..str1.i = select i1 %reg15, [3 x i8]* @.str, [3 x i8]* @.str1
+ %reg16 = getelementptr inbounds [3 x i8]* %.str..str1.i, i64 0, i64 0
+ %reg17 = shl i64 %iv.i, 1
+ %reg19 = inttoptr i64 %reg17 to i8*
+ call void (i64*, i8*, ...)* @append(i64* %str.i, i8* getelementptr inbounds ([6 x i8]* @.str2, i64 0, i64 0), i8* %reg16, i8* %reg19)
+ %iv.next.i = add nsw i64 %iv.i, 0
+ br label %print_shadow_bytes.exit.i
+}
+
+declare void @append(i64*, i8*, ...)
+
+attributes #0 = { "no-frame-pointer-elim"="false" }
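As an aside, the expected encoding above can be decoded mechanically. Below is a minimal C++ sketch, assuming the field masks from mach-o/compact_unwind_encoding.h (the constant names come from that header; reading the stack-size field as an instruction offset in stack-indirect mode follows the comment in the test):

    #include <cstdint>
    #include <cstdio>

    // Field masks as defined in mach-o/compact_unwind_encoding.h.
    constexpr uint32_t UNWIND_X86_64_MODE_MASK            = 0x0F000000;
    constexpr uint32_t UNWIND_X86_64_MODE_STACK_IND       = 0x03000000;
    constexpr uint32_t UNWIND_X86_64_FRAMELESS_STACK_SIZE = 0x00FF0000;

    int main() {
      const uint32_t enc = 0x030df800; // the encoding the test expects
      if ((enc & UNWIND_X86_64_MODE_MASK) == UNWIND_X86_64_MODE_STACK_IND) {
        // In stack-indirect mode this field holds the offset of the subq
        // immediate within the function, not a stack size.
        uint32_t off = (enc & UNWIND_X86_64_FRAMELESS_STACK_SIZE) >> 16;
        // subq $imm32, %rsp encodes as 48 81 EC <imm32>: three opcode bytes,
        // so an instruction starting at 0x0a has its immediate at 0x0a + 3 = 0x0d.
        std::printf("immediate offset = 0x%02x\n", off); // prints 0x0d
      }
      return 0;
    }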
diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index 4ce2fb3..54d8f65 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll
@@ -4,10 +4,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-apple-macosx10.7.0"
define i32 @foo(i32 %i, i32* nocapture %c) nounwind uwtable readonly ssp {
- tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6), !dbg !12
+ tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12
%ab = load i32* %c, align 1, !dbg !14
- tail call void @llvm.dbg.value(metadata !{i32* %c}, i64 0, metadata !7), !dbg !13
- tail call void @llvm.dbg.value(metadata !{i32 %ab}, i64 0, metadata !10), !dbg !14
+ tail call void @llvm.dbg.value(metadata !{i32* %c}, i64 0, metadata !7, metadata !{metadata !"0x102"}), !dbg !13
+ tail call void @llvm.dbg.value(metadata !{i32 %ab}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !14
%cd = icmp eq i32 %i, 42, !dbg !15
br i1 %cd, label %bb1, label %bb2, !dbg !15
@@ -23,23 +23,23 @@ bb2:
ret i32 %.0, !dbg !17
}
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22}
-!0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, metadata !18, null, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i32*)* @foo, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)\001\00\000\00\001", metadata !20, metadata !21, metadata !21, metadata !18, null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\00256\001\000", metadata !20, metadata !2, metadata !3, null, i32 (i32, i32*)* @foo, null, null, metadata !19} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777218, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 33554434, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{i32 786468, null, metadata !0, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786688, metadata !11, metadata !"a", metadata !2, i32 3, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !20, metadata !1, i32 2, i32 25, i32 0} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x101\00i\0016777218\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!7 = metadata !{metadata !"0x101\00c\0033554434\000", metadata !1, metadata !2, metadata !8} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !0, metadata !9} ; [ DW_TAG_pointer_type ]
+!9 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !0} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x100\00a\003\000", metadata !11, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{metadata !"0xb\002\0025\000", metadata !20, metadata !1} ; [ DW_TAG_lexical_block ]
!12 = metadata !{i32 2, i32 13, metadata !1, null}
!13 = metadata !{i32 2, i32 22, metadata !1, null}
!14 = metadata !{i32 3, i32 14, metadata !11, null}
@@ -50,4 +50,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!19 = metadata !{metadata !6, metadata !7, metadata !10}
!20 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
!21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
index 51d0d17..6865873 100644
--- a/test/CodeGen/X86/StackColoring-dbg.ll
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -5,7 +5,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
define void @foo() nounwind uwtable ssp {
entry:
@@ -17,7 +17,7 @@ entry:
for.body:
call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind
- call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22) nounwind
+ call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22, metadata !{metadata !"0x102"}) nounwind
br label %for.body
}
@@ -27,9 +27,9 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!23}
-!0 = metadata !{i32 524305, metadata !1, i32 1, metadata !"clang", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\001\00clang\001\00\000\00\000", metadata !1, metadata !2, metadata !2, null, null, null} ; [ DW_TAG_compile_unit ]
!1 = metadata !{metadata !"t.c", metadata !""}
-!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6}
+!16 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
!2 = metadata !{i32 0}
-!22 = metadata !{i32 786688, null, metadata !"x", metadata !2, i32 16, metadata !16, i32 0, i32 0}
-!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{metadata !"0x100\00x\0016\000", null, metadata !2, metadata !16} ; [ DW_TAG_auto_variable ]
+!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/SwizzleShuff.ll b/test/CodeGen/X86/SwizzleShuff.ll
index 100817a..a435272 100644
--- a/test/CodeGen/X86/SwizzleShuff.ll
+++ b/test/CodeGen/X86/SwizzleShuff.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s
; Check that we perform a scalar XOR on i32.
diff --git a/test/CodeGen/X86/TruncAssertZext.ll b/test/CodeGen/X86/TruncAssertZext.ll
new file mode 100644
index 0000000..8c66412
--- /dev/null
+++ b/test/CodeGen/X86/TruncAssertZext.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -O2 -march=x86-64 | FileCheck %s
+; Checks that a zeroing mov is inserted for the trunc/zext pair even when
+; the source of the zext is an AssertSext node
+; PR20494
+
+define i64 @main(i64 %a) {
+; CHECK-LABEL: main
+; CHECK: movl %e{{..}}, %eax
+; CHECK: ret
+ %or = or i64 %a, -2
+ %trunc = trunc i64 %or to i32
+ br label %l
+l:
+ %ext = zext i32 %trunc to i64
+ ret i64 %ext
+}
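For intuition, here is a rough source-level analog of the IR above; a hypothetical C++ sketch, not part of the test, with trunc_then_zext standing in for @main:

    #include <cstdint>

    uint64_t trunc_then_zext(uint64_t a) {
      // or with -2, then trunc i64 -> i32.
      uint32_t t = static_cast<uint32_t>(a | static_cast<uint64_t>(-2));
      // zext i32 -> i64 must clear bits 63:32. If codegen trusted the AssertSext
      // on the source and reused the 64-bit register as-is, the sign-extension
      // garbage would survive; hence the zeroing movl the CHECK lines demand.
      return static_cast<uint64_t>(t);
    }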
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index 1513fcb..9c24be4 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -4,7 +4,7 @@
define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
entry:
; CHECK-LABEL: test1:
-; CHECK: cmpl %ecx, %eax
+; CHECK: cmpl %ecx, %eax
; CHECK-NOT: addl
; CHECK: adcl $0, %eax
%add4 = add i32 %x, %sum
diff --git a/test/CodeGen/X86/add_shl_constant.ll b/test/CodeGen/X86/add_shl_constant.ll
new file mode 100644
index 0000000..33074e4
--- /dev/null
+++ b/test/CodeGen/X86/add_shl_constant.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+; CHECK-LABEL: add_shl_add_constant_1_i32
+; CHECK: leal 984(%rsi,%rdi,8), %eax
+; CHECK-NEXT: retq
+define i32 @add_shl_add_constant_1_i32(i32 %x, i32 %y) nounwind {
+ %add.0 = add i32 %x, 123
+ %shl = shl i32 %add.0, 3
+ %add.1 = add i32 %shl, %y
+ ret i32 %add.1
+}
+
+; CHECK-LABEL: add_shl_add_constant_2_i32
+; CHECK: leal 984(%rsi,%rdi,8), %eax
+; CHECK-NEXT: retq
+define i32 @add_shl_add_constant_2_i32(i32 %x, i32 %y) nounwind {
+ %add.0 = add i32 %x, 123
+ %shl = shl i32 %add.0, 3
+ %add.1 = add i32 %y, %shl
+ ret i32 %add.1
+}
+
+; CHECK: LCPI2_0:
+; CHECK: .long 984
+; CHECK: _add_shl_add_constant_1_v4i32
+; CHECK: pslld $3, %[[REG:xmm[0-9]+]]
+; CHECK: paddd %xmm1, %[[REG]]
+; CHECK: paddd LCPI2_0(%rip), %[[REG:xmm[0-9]+]]
+; CHECK: retq
+define <4 x i32> @add_shl_add_constant_1_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+ %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123>
+ %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3>
+ %add.1 = add <4 x i32> %shl, %y
+ ret <4 x i32> %add.1
+}
+
+; CHECK: LCPI3_0:
+; CHECK: .long 984
+; CHECK: _add_shl_add_constant_2_v4i32
+; CHECK: pslld $3, %[[REG:xmm[0-9]+]]
+; CHECK: paddd %xmm1, %[[REG]]
+; CHECK: paddd LCPI3_0(%rip), %[[REG:xmm[0-9]+]]
+; CHECK: retq
+define <4 x i32> @add_shl_add_constant_2_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+ %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123>
+ %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3>
+ %add.1 = add <4 x i32> %y, %shl
+ ret <4 x i32> %add.1
+}
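The folded leal above is plain algebra: ((x + 123) << 3) + y equals 8*x + y + 984, since 123 * 8 = 984, which is exactly the disp(base,index,scale) shape an x86 address can express. A standalone sanity check of that identity (illustration only):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int32_t x = -100; x <= 100; ++x)
        for (int32_t y = -100; y <= 100; ++y)
          // ((x + 123) << 3) + y == y + 8*x + 984, the leal 984(%rsi,%rdi,8) form.
          assert((((x + 123) << 3) + y) == (y + 8 * x + 984));
      return 0;
    }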
diff --git a/test/CodeGen/X86/addr-mode-matcher.ll b/test/CodeGen/X86/addr-mode-matcher.ll
new file mode 100644
index 0000000..d592091
--- /dev/null
+++ b/test/CodeGen/X86/addr-mode-matcher.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s | FileCheck %s
+
+; This testcase used to hit an assert during ISel. For details, see the big
+; comment inside the function.
+
+; CHECK-LABEL: foo:
+; The AND should be turned into a subreg access.
+; CHECK-NOT: and
+; The shift (leal) should be folded into the scale of the address in the load.
+; CHECK-NOT: leal
+; CHECK: movl {{.*}},4),
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.6.0"
+
+define void @foo(i32 %a) {
+bb:
+ br label %bb1692
+
+bb1692:
+ %tmp1694 = phi i32 [ 0, %bb ], [ %tmp1745, %bb1692 ]
+ %xor = xor i32 0, %tmp1694
+
+; %load1 = (load (and (shl %xor, 2), 1020))
+ %tmp1701 = shl i32 %xor, 2
+ %tmp1702 = and i32 %tmp1701, 1020
+ %tmp1703 = getelementptr inbounds [1028 x i8]* null, i32 0, i32 %tmp1702
+ %tmp1704 = bitcast i8* %tmp1703 to i32*
+ %load1 = load i32* %tmp1704, align 4
+
+; %load2 = (load (shl (and %xor, 255), 2))
+ %tmp1698 = and i32 %xor, 255
+ %tmp1706 = shl i32 %tmp1698, 2
+ %tmp1707 = getelementptr inbounds [1028 x i8]* null, i32 0, i32 %tmp1706
+ %tmp1708 = bitcast i8* %tmp1707 to i32*
+ %load2 = load i32* %tmp1708, align 4
+
+ %tmp1710 = or i32 %load2, %a
+
+; While matching the xor we address-match %load1. The and-of-shift reassociation
+; in address matching transforms this into a shift-of-and, and the resulting
+; node becomes identical to %load2. CSE replaces %load1, which leaves its
+; references in MatchScope and RecordedNodes stale.
+ %tmp1711 = xor i32 %load1, %tmp1710
+
+ %tmp1744 = getelementptr inbounds [256 x i32]* null, i32 0, i32 %tmp1711
+ store i32 0, i32* %tmp1744, align 4
+ %tmp1745 = add i32 %tmp1694, 1
+ indirectbr i8* undef, [label %bb1756, label %bb1692]
+
+bb1756:
+ br label %bb2705
+
+bb2705:
+ indirectbr i8* undef, [label %bb5721, label %bb5736]
+
+bb5721:
+ br label %bb2705
+
+bb5736:
+ ret void
+}
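The reassociation described in the comment rests on the mask identity ((x << 2) & 1020) == ((x & 255) << 2), which holds because 1020 == 255 << 2; that is what makes the rewritten %load1 address collide with %load2 and trip CSE. A standalone check (illustration only):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x = 0; x < (1u << 20); ++x)
        // and-of-shift equals shift-of-and when the masks line up: 1020 == 255 << 2.
        assert(((x << 2) & 1020u) == ((x & 255u) << 2));
      return 0;
    }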
diff --git a/test/CodeGen/X86/address-type-promotion-constantexpr.ll b/test/CodeGen/X86/address-type-promotion-constantexpr.ll
new file mode 100644
index 0000000..32f29bd
--- /dev/null
+++ b/test/CodeGen/X86/address-type-promotion-constantexpr.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux
+
+; PR20314 is a crashing bug. This program does nothing with the load, so just check that the return is 0.
+
+@c = common global [2 x i32] zeroinitializer, align 4
+@a = common global i32 0, align 4
+@b = internal unnamed_addr constant [2 x i8] c"\01\00", align 1
+
+; CHECK-LABEL: main
+; CHECK: xor %eax, %eax
+define i32 @main() {
+entry:
+ %foo = load i8* getelementptr ([2 x i8]* @b, i64 0, i64 sext (i8 or (i8 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32]* @c, i64 0, i64 1), i32* @a) to i8), i8 1) to i64)), align 1
+ ret i32 0
+}
+
diff --git a/test/CodeGen/X86/adx-intrinsics.ll b/test/CodeGen/X86/adx-intrinsics.ll
new file mode 100644
index 0000000..0498177
--- /dev/null
+++ b/test/CodeGen/X86/adx-intrinsics.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 --show-mc-encoding| FileCheck %s --check-prefix=NOADX --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=broadwell --show-mc-encoding| FileCheck %s --check-prefix=ADX --check-prefix=CHECK
+
+declare i8 @llvm.x86.addcarryx.u32(i8, i32, i32, i8*)
+
+define i8 @test_addcarryx_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarryx_u32
+; CHECK: addb
+; ADX: adcxl
+; CHECK: setb
+; CHECK: retq
+ %ret = tail call i8 @llvm.x86.addcarryx.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
+ ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarryx.u64(i8, i64, i64, i8*)
+
+define i8 @test_addcarryx_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarryx_u64
+; CHECK: addb
+; ADX: adcxq
+; CHECK: setb
+; CHECK: retq
+ %ret = tail call i8 @llvm.x86.addcarryx.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
+ ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarry.u32(i8, i32, i32, i8*)
+
+define i8 @test_addcarry_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_u32
+; CHECK: addb
+; ADX: adcxl
+; NOADX: adcl
+; CHECK: setb
+; CHECK: retq
+ %ret = tail call i8 @llvm.x86.addcarry.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
+ ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarry.u64(i8, i64, i64, i8*)
+
+define i8 @test_addcarry_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_u64
+; CHECK: addb
+; ADX: adcxq
+; NOADX: adcq
+; CHECK: setb
+; CHECK: retq
+ %ret = tail call i8 @llvm.x86.addcarry.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
+ ret i8 %ret;
+}
+
+declare i8 @llvm.x86.subborrow.u32(i8, i32, i32, i8*)
+
+define i8 @test_subborrow_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_subborrow_u32
+; CHECK: addb
+; CHECK: sbbl
+; CHECK: setb
+; CHECK: retq
+ %ret = tail call i8 @llvm.x86.subborrow.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
+ ret i8 %ret;
+}
+
+declare i8 @llvm.x86.subborrow.u64(i8, i64, i64, i8*)
+
+define i8 @test_subborrow_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_subborrow_u64
+; CHECK: addb
+; CHECK: sbbq
+; CHECK: setb
+; CHECK: retq
+ %ret = tail call i8 @llvm.x86.subborrow.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
+ ret i8 %ret;
+}
+
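At the C level these map to the add-with-carry intrinsics in <immintrin.h>. Below is a minimal sketch of a two-limb carry chain; assumptions: _addcarryx_u32 is available (it needs an ADX-capable target, e.g. -mcpu=broadwell as in the RUN line above), and the limb values are only for illustration:

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t a[2] = {0xFFFFFFFFu, 1u}; // low/high limbs of 0x1FFFFFFFF
      uint32_t b[2] = {1u, 0u};
      uint32_t sum[2];
      // Thread the carry through both limbs; on ADX targets this can select
      // adcxl instead of the generic adcl.
      unsigned char c = _addcarryx_u32(0, a[0], b[0], &sum[0]);
      c = _addcarryx_u32(c, a[1], b[1], &sum[1]);
      std::printf("0x%08x%08x carry=%u\n", sum[1], sum[0], c); // 0x0000000200000000 carry=0
      return 0;
    }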
diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll
index bf55644..82a8e48 100644
--- a/test/CodeGen/X86/aliases.ll
+++ b/test/CodeGen/X86/aliases.ll
@@ -30,12 +30,12 @@ define i32 @foo_f() {
ret i32 0
}
; CHECK-DAG: .weak bar_f
-@bar_f = alias weak %FunTy* @foo_f
+@bar_f = weak alias %FunTy* @foo_f
-@bar_l = alias linkonce_odr i32* @bar
+@bar_l = linkonce_odr alias i32* @bar
; CHECK-DAG: .weak bar_l
-@bar_i = alias internal i32* @bar
+@bar_i = internal alias i32* @bar
; CHECK-DAG: .globl A
@A = alias bitcast (i32* @bar to i64*)
diff --git a/test/CodeGen/X86/aligned-variadic.ll b/test/CodeGen/X86/aligned-variadic.ll
new file mode 100644
index 0000000..e2155fe
--- /dev/null
+++ b/test/CodeGen/X86/aligned-variadic.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=X32
+
+%struct.Baz = type { [17 x i8] }
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+
+; Function Attrs: nounwind uwtable
+define void @bar(%struct.Baz* byval nocapture readnone align 8 %x, ...) {
+entry:
+ %va = alloca [1 x %struct.__va_list_tag], align 16
+ %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag]* %va, i64 0, i64 0
+ %arraydecay1 = bitcast [1 x %struct.__va_list_tag]* %va to i8*
+ call void @llvm.va_start(i8* %arraydecay1)
+ %overflow_arg_area_p = getelementptr inbounds [1 x %struct.__va_list_tag]* %va, i64 0, i64 0, i32 2
+ %overflow_arg_area = load i8** %overflow_arg_area_p, align 8
+ %overflow_arg_area.next = getelementptr i8* %overflow_arg_area, i64 24
+ store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8
+; X32: leal 68(%esp), [[REG:%.*]]
+; X32: movl [[REG]], 16(%esp)
+; X64: leaq 232(%rsp), [[REG:%.*]]
+; X64: movq [[REG]], 184(%rsp)
+; X64: leaq 176(%rsp), %rdi
+ call void @qux(%struct.__va_list_tag* %arraydecay)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.va_start(i8*)
+
+declare void @qux(%struct.__va_list_tag*)
diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll
index 74b9470..9d8b6cf 100644
--- a/test/CodeGen/X86/alloca-align-rounding.ll
+++ b/test/CodeGen/X86/alloca-align-rounding.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux -enable-misched=false | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnux32 -enable-misched=false | FileCheck %s -check-prefix=X32ABI
declare void @bar(<2 x i64>* %n)
@@ -6,15 +7,29 @@ define void @foo(i64 %h) {
%p = alloca <2 x i64>, i64 %h
call void @bar(<2 x i64>* %p)
ret void
-; CHECK: foo
+; CHECK-LABEL: foo
; CHECK-NOT: andq $-32, %rax
+; X32ABI-LABEL: foo
+; X32ABI-NOT: andl $-32, %eax
}
define void @foo2(i64 %h) {
%p = alloca <2 x i64>, i64 %h, align 32
call void @bar(<2 x i64>* %p)
ret void
-; CHECK: foo2
+; CHECK-LABEL: foo2
; CHECK: andq $-32, %rsp
; CHECK: andq $-32, %rax
+; X32ABI-LABEL: foo2
+; X32ABI: andl $-32, %esp
+; X32ABI: andl $-32, %eax
+}
+
+define void @foo3(i64 %h) {
+ %p = alloca <2 x i64>, i64 %h
+ ret void
+; CHECK-LABEL: foo3
+; CHECK: movq %rbp, %rsp
+; X32ABI-LABEL: foo3
+; X32ABI: movl %ebp, %esp
}
diff --git a/test/CodeGen/X86/asm-block-labels.ll b/test/CodeGen/X86/asm-block-labels.ll
index 6dbfb16..9352438 100644
--- a/test/CodeGen/X86/asm-block-labels.ll
+++ b/test/CodeGen/X86/asm-block-labels.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | llc -no-integrated-as
+; RUN: opt < %s -O3 | llc -no-integrated-as
; ModuleID = 'block12.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin8"
diff --git a/test/CodeGen/X86/atomic-load-store-wide.ll b/test/CodeGen/X86/atomic-load-store-wide.ll
index 7352d5a..ad1a5c6 100644
--- a/test/CodeGen/X86/atomic-load-store-wide.ll
+++ b/test/CodeGen/X86/atomic-load-store-wide.ll
@@ -4,16 +4,18 @@
; FIXME: The generated code can be substantially improved.
define void @test1(i64* %ptr, i64 %val1) {
-; CHECK: test1
-; CHECK: cmpxchg8b
+; CHECK-LABEL: test1
+; CHECK: lock
+; CHECK-NEXT: cmpxchg8b
; CHECK-NEXT: jne
store atomic i64 %val1, i64* %ptr seq_cst, align 8
ret void
}
define i64 @test2(i64* %ptr) {
-; CHECK: test2
-; CHECK: cmpxchg8b
+; CHECK-LABEL: test2
+; CHECK: lock
+; CHECK-NEXT: cmpxchg8b
%val = load atomic i64* %ptr seq_cst, align 8
ret i64 %val
}
diff --git a/test/CodeGen/X86/atomic-ops-ancient-64.ll b/test/CodeGen/X86/atomic-ops-ancient-64.ll
index 18749b9..508d83b 100644
--- a/test/CodeGen/X86/atomic-ops-ancient-64.ll
+++ b/test/CodeGen/X86/atomic-ops-ancient-64.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s
+; XFAIL: *
define i64 @test_add(i64* %addr, i64 %inc) {
; CHECK-LABEL: test_add:
diff --git a/test/CodeGen/X86/atomic_add.ll b/test/CodeGen/X86/atomic_add.ll
index bdd25e6..f60212d 100644
--- a/test/CodeGen/X86/atomic_add.ll
+++ b/test/CodeGen/X86/atomic_add.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC
; rdar://7103704
@@ -14,6 +15,8 @@ define void @inc4(i64* nocapture %p) nounwind ssp {
entry:
; CHECK-LABEL: inc4:
; CHECK: incq
+; SLOW_INC-LABEL: inc4:
+; SLOW_INC-NOT: incq
%0 = atomicrmw add i64* %p, i64 1 monotonic
ret void
}
@@ -39,6 +42,8 @@ define void @inc3(i8* nocapture %p) nounwind ssp {
entry:
; CHECK-LABEL: inc3:
; CHECK: incb
+; SLOW_INC-LABEL: inc3:
+; SLOW_INC-NOT: incb
%0 = atomicrmw add i8* %p, i8 1 monotonic
ret void
}
@@ -64,6 +69,8 @@ define void @inc2(i16* nocapture %p) nounwind ssp {
entry:
; CHECK-LABEL: inc2:
; CHECK: incw
+; SLOW_INC-LABEL: inc2:
+; SLOW_INC-NOT: incw
%0 = atomicrmw add i16* %p, i16 1 monotonic
ret void
}
@@ -89,6 +96,8 @@ define void @inc1(i32* nocapture %p) nounwind ssp {
entry:
; CHECK-LABEL: inc1:
; CHECK: incl
+; SLOW_INC-LABEL: inc1:
+; SLOW_INC-NOT: incl
%0 = atomicrmw add i32* %p, i32 1 monotonic
ret void
}
@@ -113,6 +122,8 @@ define void @dec4(i64* nocapture %p) nounwind ssp {
entry:
; CHECK-LABEL: dec4:
; CHECK: decq
+; SLOW_INC-LABEL: dec4:
+; SLOW_INC-NOT: decq
%0 = atomicrmw sub i64* %p, i64 1 monotonic
ret void
}
@@ -138,6 +149,8 @@ define void @dec3(i8* nocapture %p) nounwind ssp {
entry:
; CHECK-LABEL: dec3:
; CHECK: decb
+; SLOW_INC-LABEL: dec3:
+; SLOW_INC-NOT: decb
%0 = atomicrmw sub i8* %p, i8 1 monotonic
ret void
}
@@ -163,6 +176,8 @@ define void @dec2(i16* nocapture %p) nounwind ssp {
entry:
; CHECK-LABEL: dec2:
; CHECK: decw
+; SLOW_INC-LABEL: dec2:
+; SLOW_INC-NOT: decw
%0 = atomicrmw sub i16* %p, i16 1 monotonic
ret void
}
@@ -189,6 +204,8 @@ define void @dec1(i32* nocapture %p) nounwind ssp {
entry:
; CHECK-LABEL: dec1:
; CHECK: decl
+; SLOW_INC-LABEL: dec1:
+; SLOW_INC-NOT: decl
%0 = atomicrmw sub i32* %p, i32 1 monotonic
ret void
}
diff --git a/test/CodeGen/X86/atomic_idempotent.ll b/test/CodeGen/X86/atomic_idempotent.ll
new file mode 100644
index 0000000..1afc535
--- /dev/null
+++ b/test/CodeGen/X86/atomic_idempotent.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -march=x86 -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+
+; On x86, an atomic rmw operation that does not modify the value in memory
+; (such as atomic add 0) can be replaced by an mfence followed by a mov.
+; This is explained (with the motivation for such an optimization) in
+; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+
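A hedged C++ sketch of the source pattern that yields such idempotent RMW operations (illustration only; the lowering itself is what the CHECK lines below pin down):

    #include <atomic>

    // fetch_add(0) leaves memory unchanged but still carries RMW ordering,
    // so it can be lowered to an mfence followed by a plain load.
    int read_with_rmw_ordering(std::atomic<int> &x) {
      return x.fetch_add(0, std::memory_order_acquire);
    }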
+define i8 @add8(i8* %p) {
+; CHECK-LABEL: add8
+; CHECK: mfence
+; CHECK: movb
+ %1 = atomicrmw add i8* %p, i8 0 monotonic
+ ret i8 %1
+}
+
+define i16 @or16(i16* %p) {
+; CHECK-LABEL: or16
+; CHECK: mfence
+; CHECK: movw
+ %1 = atomicrmw or i16* %p, i16 0 acquire
+ ret i16 %1
+}
+
+define i32 @xor32(i32* %p) {
+; CHECK-LABEL: xor32
+; CHECK: mfence
+; CHECK: movl
+ %1 = atomicrmw xor i32* %p, i32 0 release
+ ret i32 %1
+}
+
+define i64 @sub64(i64* %p) {
+; CHECK-LABEL: sub64
+; X64: mfence
+; X64: movq
+; X32-NOT: mfence
+ %1 = atomicrmw sub i64* %p, i64 0 seq_cst
+ ret i64 %1
+}
+
+define i128 @or128(i128* %p) {
+; CHECK-LABEL: or128
+; CHECK-NOT: mfence
+ %1 = atomicrmw or i128* %p, i128 0 monotonic
+ ret i128 %1
+}
+
+; For 'and', the idempotent value is (-1)
+define i32 @and32 (i32* %p) {
+; CHECK-LABEL: and32
+; CHECK: mfence
+; CHECK: movl
+ %1 = atomicrmw and i32* %p, i32 -1 acq_rel
+ ret i32 %1
+}
diff --git a/test/CodeGen/X86/atomic_mi.ll b/test/CodeGen/X86/atomic_mi.ll
new file mode 100644
index 0000000..19e019e
--- /dev/null
+++ b/test/CodeGen/X86/atomic_mi.ll
@@ -0,0 +1,525 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix X64
+; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s --check-prefix X32
+; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC
+
+; This file checks that atomic (non-seq_cst) stores of immediate values are
+; done in one mov instruction and not two. More precisely, it makes sure that
+; the immediate is not first needlessly copied into a register.
+
+; Similarly, it checks that a binary operation of an immediate with an atomic
+; variable that is stored back in that variable is done as a single instruction.
+; For example: x.store(42 + x.load(memory_order_acquire), memory_order_release)
+; should be just an add instruction, instead of loading x into a register, doing
+; an add and storing the result back.
+; The binary operations currently supported are add, and, or, and xor.
+; sub is not supported because it is translated into an addition of the
+; negated immediate.
+; Finally, we also check the same kind of pattern for inc/dec
+
+; seq_cst stores are left as (lock) xchgl, but we try to check every other
+; attribute at least once.
+
+; Please note that these operations do not require the lock prefix: only
+; sequentially consistent stores require this kind of protection on X86.
+; And even for seq_cst operations, llvm uses the xchg instruction, which has
+; an implicit lock prefix, so making it explicit is not required.
+
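A C++ sketch of the load-op-store pattern described above, modeled on the x.store(42 + x.load(...)) example in the comment (illustration only):

    #include <atomic>

    // Non-seq_cst load, add of an immediate, non-seq_cst store back: the tests
    // below expect this to select a single add-to-memory with no lock prefix.
    void add_in_place(std::atomic<int> &x) {
      x.store(x.load(std::memory_order_acquire) + 2, std::memory_order_release);
    }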
+define void @store_atomic_imm_8(i8* %p) {
+; X64-LABEL: store_atomic_imm_8
+; X64: movb
+; X64-NOT: movb
+; X32-LABEL: store_atomic_imm_8
+; X32: movb
+; X32-NOT: movb
+ store atomic i8 42, i8* %p release, align 1
+ ret void
+}
+
+define void @store_atomic_imm_16(i16* %p) {
+; X64-LABEL: store_atomic_imm_16
+; X64: movw
+; X64-NOT: movw
+; X32-LABEL: store_atomic_imm_16
+; X32: movw
+; X32-NOT: movw
+ store atomic i16 42, i16* %p monotonic, align 2
+ ret void
+}
+
+define void @store_atomic_imm_32(i32* %p) {
+; X64-LABEL: store_atomic_imm_32
+; X64: movl
+; X64-NOT: movl
+; On 32-bit targets, there is an extra movl for each of these functions
+; (probably for alignment reasons).
+; X32-LABEL: store_atomic_imm_32
+; X32: movl 4(%esp), %eax
+; X32: movl
+; X32-NOT: movl
+ store atomic i32 42, i32* %p release, align 4
+ ret void
+}
+
+define void @store_atomic_imm_64(i64* %p) {
+; X64-LABEL: store_atomic_imm_64
+; X64: movq
+; X64-NOT: movq
+; These are implemented with a CAS loop on 32-bit architectures, and thus
+; cannot be optimized in the same way as the others.
+; X32-LABEL: store_atomic_imm_64
+; X32: cmpxchg8b
+ store atomic i64 42, i64* %p release, align 8
+ ret void
+}
+
+; If an immediate is too big to fit in 32 bits, it cannot be stored in one mov,
+; even on X64; one must use movabsq, which can only target a register.
+define void @store_atomic_imm_64_big(i64* %p) {
+; X64-LABEL: store_atomic_imm_64_big
+; X64: movabsq
+; X64: movq
+ store atomic i64 100000000000, i64* %p monotonic, align 8
+ ret void
+}
+
+; It would be incorrect to replace a lock xchgl by a movl
+define void @store_atomic_imm_32_seq_cst(i32* %p) {
+; X64-LABEL: store_atomic_imm_32_seq_cst
+; X64: xchgl
+; X32-LABEL: store_atomic_imm_32_seq_cst
+; X32: xchgl
+ store atomic i32 42, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- ADD -----
+
+define void @add_8(i8* %p) {
+; X64-LABEL: add_8
+; X64-NOT: lock
+; X64: addb
+; X64-NOT: movb
+; X32-LABEL: add_8
+; X32-NOT: lock
+; X32: addb
+; X32-NOT: movb
+ %1 = load atomic i8* %p seq_cst, align 1
+ %2 = add i8 %1, 2
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @add_16(i16* %p) {
+; Currently the transformation is not done on 16-bit accesses, as the backend
+; treats 16-bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: add_16
+; X64-NOT: addw
+; X32-LABEL: add_16
+; X32-NOT: addw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = add i16 %1, 2
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @add_32(i32* %p) {
+; X64-LABEL: add_32
+; X64-NOT: lock
+; X64: addl
+; X64-NOT: movl
+; X32-LABEL: add_32
+; X32-NOT: lock
+; X32: addl
+; X32-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = add i32 %1, 2
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret void
+}
+
+define void @add_64(i64* %p) {
+; X64-LABEL: add_64
+; X64-NOT: lock
+; X64: addq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'addq'.
+; X32-LABEL: add_64
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = add i64 %1, 2
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @add_32_seq_cst(i32* %p) {
+; X64-LABEL: add_32_seq_cst
+; X64: xchgl
+; X32-LABEL: add_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = add i32 %1, 2
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- AND -----
+
+define void @and_8(i8* %p) {
+; X64-LABEL: and_8
+; X64-NOT: lock
+; X64: andb
+; X64-NOT: movb
+; X32-LABEL: and_8
+; X32-NOT: lock
+; X32: andb
+; X32-NOT: movb
+ %1 = load atomic i8* %p monotonic, align 1
+ %2 = and i8 %1, 2
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @and_16(i16* %p) {
+; Currently the transformation is not done on 16-bit accesses, as the backend
+; treats 16-bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: and_16
+; X64-NOT: andw
+; X32-LABEL: and_16
+; X32-NOT: andw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = and i16 %1, 2
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @and_32(i32* %p) {
+; X64-LABEL: and_32
+; X64-NOT: lock
+; X64: andl
+; X64-NOT: movl
+; X32-LABEL: and_32
+; X32-NOT: lock
+; X32: andl
+; X32-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = and i32 %1, 2
+ store atomic i32 %2, i32* %p release, align 4
+ ret void
+}
+
+define void @and_64(i64* %p) {
+; X64-LABEL: and_64
+; X64-NOT: lock
+; X64: andq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'andq'.
+; X32-LABEL: and_64
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = and i64 %1, 2
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @and_32_seq_cst(i32* %p) {
+; X64-LABEL: and_32_seq_cst
+; X64: xchgl
+; X32-LABEL: and_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = and i32 %1, 2
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- OR -----
+
+define void @or_8(i8* %p) {
+; X64-LABEL: or_8
+; X64-NOT: lock
+; X64: orb
+; X64-NOT: movb
+; X32-LABEL: or_8
+; X32-NOT: lock
+; X32: orb
+; X32-NOT: movb
+ %1 = load atomic i8* %p acquire, align 1
+ %2 = or i8 %1, 2
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @or_16(i16* %p) {
+; X64-LABEL: or_16
+; X64-NOT: orw
+; X32-LABEL: or_16
+; X32-NOT: orw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = or i16 %1, 2
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @or_32(i32* %p) {
+; X64-LABEL: or_32
+; X64-NOT: lock
+; X64: orl
+; X64-NOT: movl
+; X32-LABEL: or_32
+; X32-NOT: lock
+; X32: orl
+; X32-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = or i32 %1, 2
+ store atomic i32 %2, i32* %p release, align 4
+ ret void
+}
+
+define void @or_64(i64* %p) {
+; X64-LABEL: or_64
+; X64-NOT: lock
+; X64: orq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'orq'.
+; X32-LABEL: or_64
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = or i64 %1, 2
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @or_32_seq_cst(i32* %p) {
+; X64-LABEL: or_32_seq_cst
+; X64: xchgl
+; X32-LABEL: or_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = or i32 %1, 2
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- XOR -----
+
+define void @xor_8(i8* %p) {
+; X64-LABEL: xor_8
+; X64-NOT: lock
+; X64: xorb
+; X64-NOT: movb
+; X32-LABEL: xor_8
+; X32-NOT: lock
+; X32: xorb
+; X32-NOT: movb
+ %1 = load atomic i8* %p acquire, align 1
+ %2 = xor i8 %1, 2
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @xor_16(i16* %p) {
+; X64-LABEL: xor_16
+; X64-NOT: xorw
+; X32-LABEL: xor_16
+; X32-NOT: xorw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = xor i16 %1, 2
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @xor_32(i32* %p) {
+; X64-LABEL: xor_32
+; X64-NOT: lock
+; X64: xorl
+; X64-NOT: movl
+; X32-LABEL: xor_32
+; X32-NOT: lock
+; X32: xorl
+; X32-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = xor i32 %1, 2
+ store atomic i32 %2, i32* %p release, align 4
+ ret void
+}
+
+define void @xor_64(i64* %p) {
+; X64-LABEL: xor_64
+; X64-NOT: lock
+; X64: xorq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'xorq'.
+; X32-LABEL: xor_64
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = xor i64 %1, 2
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @xor_32_seq_cst(i32* %p) {
+; X64-LABEL: xor_32_seq_cst
+; X64: xchgl
+; X32-LABEL: xor_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = xor i32 %1, 2
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- INC -----
+
+define void @inc_8(i8* %p) {
+; X64-LABEL: inc_8
+; X64-NOT: lock
+; X64: incb
+; X64-NOT: movb
+; X32-LABEL: inc_8
+; X32-NOT: lock
+; X32: incb
+; X32-NOT: movb
+; SLOW_INC-LABEL: inc_8
+; SLOW_INC-NOT: incb
+; SLOW_INC-NOT: movb
+ %1 = load atomic i8* %p seq_cst, align 1
+ %2 = add i8 %1, 1
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
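+
+; The SLOW_INC prefix covers targets where inc/dec are considered slow: they
+; leave CF untouched, and that partial flags update costs extra on some
+; cores, so an add of 1 is expected instead. Sketch (hypothetical register):
+;     addb $1, (%rdi)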
+
+define void @inc_16(i16* %p) {
+; Currently the transformation is not done on 16-bit accesses, as the backend
+; treats 16-bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: inc_16
+; X64-NOT: incw
+; X32-LABEL: inc_16
+; X32-NOT: incw
+; SLOW_INC-LABEL: inc_16
+; SLOW_INC-NOT: incw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = add i16 %1, 1
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @inc_32(i32* %p) {
+; X64-LABEL: inc_32
+; X64-NOT: lock
+; X64: incl
+; X64-NOT: movl
+; X32-LABEL: inc_32
+; X32-NOT: lock
+; X32: incl
+; X32-NOT: movl
+; SLOW_INC-LABEL: inc_32
+; SLOW_INC-NOT: incl
+; SLOW_INC-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = add i32 %1, 1
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret void
+}
+
+define void @inc_64(i64* %p) {
+; X64-LABEL: inc_64
+; X64-NOT: lock
+; X64: incq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'incq'.
+; X32-LABEL: inc_64
+; SLOW_INC-LABEL: inc_64
+; SLOW_INC-NOT: incq
+; SLOW_INC-NOT: movq
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = add i64 %1, 1
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @inc_32_seq_cst(i32* %p) {
+; X64-LABEL: inc_32_seq_cst
+; X64: xchgl
+; X32-LABEL: inc_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = add i32 %1, 1
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- DEC -----
+
+define void @dec_8(i8* %p) {
+; X64-LABEL: dec_8
+; X64-NOT: lock
+; X64: decb
+; X64-NOT: movb
+; X32-LABEL: dec_8
+; X32-NOT: lock
+; X32: decb
+; X32-NOT: movb
+; SLOW_INC-LABEL: dec_8
+; SLOW_INC-NOT: decb
+; SLOW_INC-NOT: movb
+ %1 = load atomic i8* %p seq_cst, align 1
+ %2 = sub i8 %1, 1
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @dec_16(i16* %p) {
+; Currently the transformation is not done on 16-bit accesses, as the backend
+; treats 16-bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: dec_16
+; X64-NOT: decw
+; X32-LABEL: dec_16
+; X32-NOT: decw
+; SLOW_INC-LABEL: dec_16
+; SLOW_INC-NOT: decw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = sub i16 %1, 1
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @dec_32(i32* %p) {
+; X64-LABEL: dec_32
+; X64-NOT: lock
+; X64: decl
+; X64-NOT: movl
+; X32-LABEL: dec_32
+; X32-NOT: lock
+; X32: decl
+; X32-NOT: movl
+; SLOW_INC-LABEL: dec_32
+; SLOW_INC-NOT: decl
+; SLOW_INC-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = sub i32 %1, 1
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret void
+}
+
+define void @dec_64(i64* %p) {
+; X64-LABEL: dec_64
+; X64-NOT: lock
+; X64: decq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'decq'.
+; X32-LABEL: dec_64
+; SLOW_INC-LABEL: dec_64
+; SLOW_INC-NOT: decq
+; SLOW_INC-NOT: movq
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = sub i64 %1, 1
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @dec_32_seq_cst(i32* %p) {
+; X64-LABEL: dec_32_seq_cst
+; X64: xchgl
+; X32-LABEL: dec_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = sub i32 %1, 1
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 1fd9085..02ea173 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -51,46 +51,6 @@ entry:
ret <4 x i64> %shuffle
}
-;;;
-;;; Check that some 256-bit vectors are xformed into 128 ops
-; CHECK: _A
-; CHECK: vshufpd $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vshufpd $1
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @A(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
- ret <4 x i64> %shuffle
-}
-
-; CHECK: _B
-; CHECK: vshufpd $1, %ymm
-define <4 x i64> @B(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 undef, i32 undef, i32 6>
- ret <4 x i64> %shuffle
-}
-
-; CHECK: movlhps
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: movlhps
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 undef, i32 0, i32 undef, i32 6>
- ret <4 x i64> %shuffle
-}
-
-; CHECK: vpshufd $-96
-; CHECK: vpshufd $-6
-; CHECK: vinsertf128 $1
-define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 10, i32 10, i32 11, i32 11>
- ret <8 x i32> %shuffle
-}
-
;;; Don't crash on movd
; CHECK: _VMOVZQI2PQI
; CHECK: vmovd (%
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
deleted file mode 100644
index d2a22d7..0000000
--- a/test/CodeGen/X86/avx-blend.ll
+++ /dev/null
@@ -1,202 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; AVX128 tests:
-
-;CHECK-LABEL: vsel_float:
-; select mask is <i1 true, i1 false, i1 true, i1 false>.
-; Big endian representation is 0101 = 5.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; the inverted mask: 1010 = 10.
-; According to the ABI:
-; v1 is in xmm0 => first argument is xmm0.
-; v2 is in xmm1 => second argument is xmm1.
-; result is in xmm0 => destination argument.
-;CHECK: vblendps $10, %xmm1, %xmm0, %xmm0
-;CHECK: ret
-define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
- ret <4 x float> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i32:
-;CHECK: vblendps $10, %xmm1, %xmm0, %xmm0
-;CHECK: ret
-define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
- ret <4 x i32> %vsel
-}
-
-
-;CHECK-LABEL: vsel_double:
-;CHECK: vmovsd
-;CHECK: ret
-define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
- %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
- ret <2 x double> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i64:
-;CHECK: vmovsd
-;CHECK: ret
-define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
- %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
- ret <2 x i64> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i8:
-;CHECK: vpblendvb
-;CHECK: ret
-define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
- %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
- ret <16 x i8> %vsel
-}
-
-
-; AVX256 tests:
-
-
-;CHECK-LABEL: vsel_float8:
-;CHECK-NOT: vinsertf128
-; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
-; which translates into the boolean mask (big endian representation):
-; 00010001 = 17.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; the inverted mask: 11101110 = 238.
-;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0
-;CHECK: ret
-define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
- %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
- ret <8 x float> %vsel
-}
-
-;CHECK-LABEL: vsel_i328:
-;CHECK-NOT: vinsertf128
-;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0
-;CHECK-NEXT: ret
-define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
- %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
- ret <8 x i32> %vsel
-}
-
-;CHECK-LABEL: vsel_double8:
-; select mask is 2x: 0001 => intel mask: ~0001 = 14
-; ABI:
-; v1 is in ymm0 and ymm1.
-; v2 is in ymm2 and ymm3.
-; result is in ymm0 and ymm1.
-; Compute the low part: res.low = blend v1.low, v2.low, blendmask
-;CHECK: vblendpd $14, %ymm2, %ymm0, %ymm0
-; Compute the high part.
-;CHECK: vblendpd $14, %ymm3, %ymm1, %ymm1
-;CHECK: ret
-define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
- %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
- ret <8 x double> %vsel
-}
-
-;CHECK-LABEL: vsel_i648:
-;CHECK: vblendpd $14, %ymm2, %ymm0, %ymm0
-;CHECK: vblendpd $14, %ymm3, %ymm1, %ymm1
-;CHECK: ret
-define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
- %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
- ret <8 x i64> %vsel
-}
-
-;CHECK-LABEL: vsel_double4:
-;CHECK-NOT: vinsertf128
-;CHECK: vblendpd $10
-;CHECK-NEXT: ret
-define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
- ret <4 x double> %vsel
-}
-
-;; TEST blend + compares
-; CHECK: testa
-define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
- ; CHECK: vcmplepd
- ; CHECK: vblendvpd
- %max_is_x = fcmp oge <2 x double> %x, %y
- %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y
- ret <2 x double> %max
-}
-
-; CHECK: testb
-define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
- ; CHECK: vcmpnlepd
- ; CHECK: vblendvpd
- %min_is_x = fcmp ult <2 x double> %x, %y
- %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
- ret <2 x double> %min
-}
-
-; If we can figure out a blend has a constant mask, we should emit the
-; blend instruction with an immediate mask
-define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
-; CHECK-LABEL: constant_blendvpd_avx:
-; CHECK-NOT: mov
-; CHECK: vblendpd
-; CHECK: ret
- %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
- ret <4 x double> %1
-}
-
-define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
-; CHECK-LABEL: constant_blendvps_avx:
-; CHECK-NOT: mov
-; CHECK: vblendps
-; CHECK: ret
- %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd
- ret <8 x float> %1
-}
-
-declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
-
-;; 4 tests for shufflevectors that optimize to blend + immediate
-; CHECK-LABEL: @blend_shufflevector_4xfloat
-define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
-; Equivalent select mask is <i1 true, i1 false, i1 true, i1 false>.
-; Big endian representation is 0101 = 5.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; Inverted mask: 1010 = 10.
-; According to the ABI:
-; a is in xmm0 => first argument is xmm0.
-; b is in xmm1 => second argument is xmm1.
-; Result is in xmm0 => destination argument.
-; CHECK: vblendps $10, %xmm1, %xmm0, %xmm0
-; CHECK: ret
- %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x float> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_8xfloat
-define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) {
-; CHECK: vblendps $190, %ymm1, %ymm0, %ymm0
-; CHECK: ret
- %1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15>
- ret <8 x float> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_4xdouble
-define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) {
-; CHECK: vblendpd $2, %ymm1, %ymm0, %ymm0
-; CHECK: ret
- %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
- ret <4 x double> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_4xi64
-define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK: vblendpd $13, %ymm1, %ymm0, %ymm0
-; CHECK: ret
- %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
- ret <4 x i64> %1
-}
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
index 3e051bf..70ec124 100644
--- a/test/CodeGen/X86/avx-intel-ocl.ll
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -89,23 +89,23 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
; X64-LABEL: test_prolog_epilog
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill
; X64: call
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
%c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
ret <16 x float> %c
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
new file mode 100644
index 0000000..d2b44cd
--- /dev/null
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
+ ; CHECK: vblendpd
+ %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ ; CHECK: vblendps
+ %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ ; CHECK: vdpps
+ %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+
+
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index ce31161..ef3e83f 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -455,21 +455,21 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
- ; CHECK: vpslldq
- %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
- ; CHECK: vpslldq
- %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+ ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+ ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
@@ -551,21 +551,21 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
- ; CHECK: vpsrldq
- %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
- ; CHECK: vpsrldq
- %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+ ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+ ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
@@ -818,18 +818,18 @@ declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK: vblendpd
- %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+ %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
-declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
; CHECK: vblendps
- %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
@@ -850,35 +850,35 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
; CHECK: vdppd
- %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+ %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
-declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
; CHECK: vdpps
- %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
; CHECK: vinsertps
- %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vmpsadbw
- %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+ %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
-declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
@@ -899,10 +899,10 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun
define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK: vpblendw
- %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+ %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
-declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
@@ -1770,18 +1770,18 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi
define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK: vblendpd
- %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
+ %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
-declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone
+declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK: vblendps
- %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+ %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
-declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
@@ -1950,10 +1950,10 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK: vdpps
- %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+ %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
-declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) {
@@ -2309,7 +2309,7 @@ declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) noun
define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) {
; CHECK: vpermilpd
- %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 7) ; <<2 x double>> [#uses=1]
+ %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnone
@@ -2324,7 +2324,7 @@ declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind rea
define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
- ; CHECK: vpshufd
+ ; CHECK: vpermilps
%res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
diff --git a/test/CodeGen/X86/avx-movdup.ll b/test/CodeGen/X86/avx-movdup.ll
deleted file mode 100644
index 42d84de..0000000
--- a/test/CodeGen/X86/avx-movdup.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vmovsldup
-define <8 x float> @movdupA(<8 x float> %src) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
- ret <8 x float> %shuffle.i
-}
-
-; CHECK: vmovshdup
-define <8 x float> @movdupB(<8 x float> %src) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
- ret <8 x float> %shuffle.i
-}
-
-; CHECK: vmovsldup
-define <4 x i64> @movdupC(<4 x i64> %src) nounwind uwtable readnone ssp {
-entry:
- %0 = bitcast <4 x i64> %src to <8 x float>
- %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
- %1 = bitcast <8 x float> %shuffle.i to <4 x i64>
- ret <4 x i64> %1
-}
-
-; CHECK: vmovshdup
-define <4 x i64> @movdupD(<4 x i64> %src) nounwind uwtable readnone ssp {
-entry:
- %0 = bitcast <4 x i64> %src to <8 x float>
- %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
- %1 = bitcast <8 x float> %shuffle.i to <4 x i64>
- ret <4 x i64> %1
-}
-
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
deleted file mode 100644
index fb2287f..0000000
--- a/test/CodeGen/X86/avx-sext.ll
+++ /dev/null
@@ -1,199 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=pentium4 | FileCheck %s -check-prefix=SSE2
-
-define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-; AVX: sext_8i16_to_8i32
-; AVX: vpmovsxwd
-
- %B = sext <8 x i16> %A to <8 x i32>
- ret <8 x i32>%B
-}
-
-define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-; AVX: sext_4i32_to_4i64
-; AVX: vpmovsxdq
-
- %B = sext <4 x i32> %A to <4 x i64>
- ret <4 x i64>%B
-}
-
-; AVX: load_sext_test1
-; AVX: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test1
-; SSSE3: movq
-; SSSE3: punpcklwd %xmm{{.*}}, %xmm{{.*}}
-; SSSE3: psrad $16
-; SSSE3: ret
-
-; SSE2: load_sext_test1
-; SSE2: movq
-; SSE2: punpcklwd %xmm{{.*}}, %xmm{{.*}}
-; SSE2: psrad $16
-; SSE2: ret
-define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
- %X = load <4 x i16>* %ptr
- %Y = sext <4 x i16> %X to <4 x i32>
- ret <4 x i32>%Y
-}
-
-; AVX: load_sext_test2
-; AVX: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test2
-; SSSE3: movd
-; SSSE3: pshufb
-; SSSE3: psrad $24
-; SSSE3: ret
-
-; SSE2: load_sext_test2
-; SSE2: movl
-; SSE2: psrad $24
-; SSE2: ret
-define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
- %X = load <4 x i8>* %ptr
- %Y = sext <4 x i8> %X to <4 x i32>
- ret <4 x i32>%Y
-}
-
-; AVX: load_sext_test3
-; AVX: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test3
-; SSSE3: movsbq
-; SSSE3: movsbq
-; SSSE3: punpcklqdq
-; SSSE3: ret
-
-; SSE2: load_sext_test3
-; SSE2: movsbq
-; SSE2: movsbq
-; SSE2: punpcklqdq
-; SSE2: ret
-define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
- %X = load <2 x i8>* %ptr
- %Y = sext <2 x i8> %X to <2 x i64>
- ret <2 x i64>%Y
-}
-
-; AVX: load_sext_test4
-; AVX: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test4
-; SSSE3: movswq
-; SSSE3: movswq
-; SSSE3: punpcklqdq
-; SSSE3: ret
-
-; SSE2: load_sext_test4
-; SSE2: movswq
-; SSE2: movswq
-; SSE2: punpcklqdq
-; SSE2: ret
-define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
- %X = load <2 x i16>* %ptr
- %Y = sext <2 x i16> %X to <2 x i64>
- ret <2 x i64>%Y
-}
-
-; AVX: load_sext_test5
-; AVX: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test5
-; SSSE3: movslq
-; SSSE3: movslq
-; SSSE3: punpcklqdq
-; SSSE3: ret
-
-; SSE2: load_sext_test5
-; SSE2: movslq
-; SSE2: movslq
-; SSE2: punpcklqdq
-; SSE2: ret
-define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
- %X = load <2 x i32>* %ptr
- %Y = sext <2 x i32> %X to <2 x i64>
- ret <2 x i64>%Y
-}
-
-; AVX: load_sext_test6
-; AVX: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test6
-; SSSE3: movq
-; SSSE3: punpcklbw
-; SSSE3: psraw $8
-; SSSE3: ret
-
-; SSE2: load_sext_test6
-; SSE2: movq
-; SSE2: punpcklbw
-; SSE2: psraw $8
-; SSE2: ret
-define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
- %X = load <8 x i8>* %ptr
- %Y = sext <8 x i8> %X to <8 x i16>
- ret <8 x i16>%Y
-}
-
-; AVX: sext_4i1_to_4i64
-; AVX: vpslld $31
-; AVX: vpsrad $31
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
- %extmask = sext <4 x i1> %mask to <4 x i64>
- ret <4 x i64> %extmask
-}
-
-; AVX-LABEL: sext_16i8_to_16i16
-; AVX: vpmovsxbw
-; AVX: vmovhlps
-; AVX: vpmovsxbw
-; AVX: ret
-define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
- %X = load <16 x i8>* %ptr
- %Y = sext <16 x i8> %X to <16 x i16>
- ret <16 x i16> %Y
-}
-
-; AVX: sext_4i8_to_4i64
-; AVX: vpslld $24
-; AVX: vpsrad $24
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
- %extmask = sext <4 x i8> %mask to <4 x i64>
- ret <4 x i64> %extmask
-}
-
-; AVX: sext_4i8_to_4i64
-; AVX: vpmovsxbd
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
- %X = load <4 x i8>* %ptr
- %Y = sext <4 x i8> %X to <4 x i64>
- ret <4 x i64>%Y
-}
-
-; AVX: sext_4i16_to_4i64
-; AVX: vpmovsxwd
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
- %X = load <4 x i16>* %ptr
- %Y = sext <4 x i16> %X to <4 x i64>
- ret <4 x i64>%Y
-}
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
deleted file mode 100644
index 4a996d7..0000000
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ /dev/null
@@ -1,336 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; PR11102
-define <4 x float> @test1(<4 x float> %a) nounwind {
- %b = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
- ret <4 x float> %b
-; CHECK-LABEL: test1:
-;; TODO: This test could be improved by removing the xor instruction and
-;; having vinsertps zero out the needed elements.
-; CHECK: vxorps
-; CHECK: vinsertps
-}
-
-; rdar://10538417
-define <3 x i64> @test2(<2 x i64> %v) nounwind readnone {
-; CHECK-LABEL: test2:
-; CHECK: vinsertf128
- %1 = shufflevector <2 x i64> %v, <2 x i64> %v, <3 x i32> <i32 0, i32 1, i32 undef>
- %2 = shufflevector <3 x i64> zeroinitializer, <3 x i64> %1, <3 x i32> <i32 3, i32 4, i32 2>
- ret <3 x i64> %2
-; CHECK: ret
-}
-
-define <4 x i64> @test3(<4 x i64> %a, <4 x i64> %b) nounwind {
- %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 undef>
- ret <4 x i64> %c
-; CHECK-LABEL: test3:
-; CHECK: vblendpd
-; CHECK: ret
-}
-
-define <8 x float> @test4(float %a) nounwind {
- %b = insertelement <8 x float> zeroinitializer, float %a, i32 0
- ret <8 x float> %b
-; CHECK-LABEL: test4:
-; CHECK: vinsertf128
-}
-
-; rdar://10594409
-define <8 x float> @test5(float* nocapture %f) nounwind uwtable readonly ssp {
-entry:
- %0 = bitcast float* %f to <4 x float>*
- %1 = load <4 x float>* %0, align 16
-; CHECK: test5
-; CHECK: vmovaps
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
- %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
- ret <8 x float> %shuffle.i
-}
-
-define <4 x double> @test6(double* nocapture %d) nounwind uwtable readonly ssp {
-entry:
- %0 = bitcast double* %d to <2 x double>*
- %1 = load <2 x double>* %0, align 16
-; CHECK: test6
-; CHECK: vmovaps
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
- %shuffle.i = shufflevector <2 x double> %1, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
- ret <4 x double> %shuffle.i
-}
-
-define <16 x i16> @test7(<4 x i16> %a) nounwind {
-; CHECK: test7
- %b = shufflevector <4 x i16> %a, <4 x i16> undef, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK: ret
- ret <16 x i16> %b
-}
-
-; CHECK: test8
-define void @test8() {
-entry:
- %0 = load <16 x i64> addrspace(1)* null, align 128
- %1 = shufflevector <16 x i64> <i64 undef, i64 undef, i64 0, i64 undef, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 undef, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> %0, <16 x i32> <i32 17, i32 18, i32 2, i32 undef, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 26>
- %2 = shufflevector <16 x i64> %1, <16 x i64> %0, <16 x i32> <i32 0, i32 1, i32 2, i32 30, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 22, i32 20, i32 15>
- store <16 x i64> %2, <16 x i64> addrspace(1)* undef, align 128
-; CHECK: ret
- ret void
-}
-
-; Extract a value from a shufflevector..
-define i32 @test9(<4 x i32> %a) nounwind {
-; CHECK: test9
-; CHECK: vpextrd
- %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4>
- %r = extractelement <8 x i32> %b, i32 2
-; CHECK: ret
- ret i32 %r
-}
-
-; Extract a value which is the result of an undef mask.
-define i32 @test10(<4 x i32> %a) nounwind {
-; CHECK: @test10
-; CHECK-NOT: {{^[^#]*[a-z]}}
-; CHECK: ret
- %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %r = extractelement <8 x i32> %b, i32 2
- ret i32 %r
-}
-
-define <4 x float> @test11(<4 x float> %a) nounwind {
-; CHECK: test11
-; CHECK: vpshufd $27
- %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
- ret <4 x float> %tmp1
-}
-
-define <4 x float> @test12(<4 x float>* %a) nounwind {
-; CHECK: test12
-; CHECK: vpshufd
- %tmp0 = load <4 x float>* %a
- %tmp1 = shufflevector <4 x float> %tmp0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
- ret <4 x float> %tmp1
-}
-
-define <4 x i32> @test13(<4 x i32> %a) nounwind {
-; CHECK: test13
-; CHECK: vpshufd $27
- %tmp1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test14(<4 x i32>* %a) nounwind {
-; CHECK: test14
-; CHECK: vpshufd $27, (
- %tmp0 = load <4 x i32>* %a
- %tmp1 = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
- ret <4 x i32> %tmp1
-}
-
-; CHECK: test15
-; CHECK: vpshufd $8
-; CHECK: ret
-define <4 x i32> @test15(<2 x i32>%x) nounwind readnone {
- %x1 = shufflevector <2 x i32> %x, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- ret <4 x i32>%x1
-}
-
-; rdar://10974078
-define <8 x float> @test16(float* nocapture %f) nounwind uwtable readonly ssp {
-entry:
- %0 = bitcast float* %f to <4 x float>*
- %1 = load <4 x float>* %0, align 8
-; CHECK: test16
-; CHECK: vmovups
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
- %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
- ret <8 x float> %shuffle.i
-}
-
-; PR12413
-; CHECK: shuf1
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpshufb
-define <32 x i8> @shuf1(<32 x i8> %inval1, <32 x i8> %inval2) {
-entry:
- %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
- ret <32 x i8> %0
-}
-
-; handle the case where only half of the 256-bits is splittable
-; CHECK: shuf2
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpextrb
-; CHECK: vpextrb
-define <32 x i8> @shuf2(<32 x i8> %inval1, <32 x i8> %inval2) {
-entry:
- %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 31, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
- ret <32 x i8> %0
-}
-
-; CHECK: blend1
-; CHECK: vblendps
-; CHECK: ret
-define <4 x i32> @blend1(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
- %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
- ret <4 x i32> %t
-}
-
-; CHECK: blend2
-; CHECK: vblendps
-; CHECK: ret
-define <4 x i32> @blend2(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
- %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x i32> %t
-}
-
-; CHECK: blend2a
-; CHECK: vblendps
-; CHECK: ret
-define <4 x float> @blend2a(<4 x float> %a, <4 x float> %b) nounwind alwaysinline {
- %t = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x float> %t
-}
-
-; CHECK: blend3
-; CHECK-NOT: vblendps
-; CHECK: ret
-define <4 x i32> @blend3(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
- %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 2, i32 7>
- ret <4 x i32> %t
-}
-
-; CHECK: blend4
-; CHECK: vblendpd
-; CHECK: ret
-define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
- %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
- ret <4 x i64> %t
-}
-
-; CHECK: narrow
-; CHECK: vpermilps
-; CHECK: ret
-define <16 x i16> @narrow(<16 x i16> %a) nounwind alwaysinline {
- %t = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 undef, i32 14, i32 15, i32 undef, i32 undef>
- ret <16 x i16> %t
-}
-
-;CHECK-LABEL: test17:
-;CHECK-NOT: vinsertf128
-;CHECK: ret
-define <8 x float> @test17(<4 x float> %y) {
- %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x float> %x
-}
-
-; CHECK: test18
-; CHECK: vmovshdup
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
- %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
- ret <8 x float>%S
-}
-
-; CHECK: test19
-; CHECK: vmovsldup
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
- %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
- ret <8 x float>%S
-}
-
-; rdar://12684358
-; Make sure loads happen before stores.
-; CHECK: swap8doubles
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
-; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
-; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
-; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
-; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
-; CHECK: vextractf128
-; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
-; CHECK: vextractf128
-; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
-; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
-define void @swap8doubles(double* nocapture %A, double* nocapture %C) nounwind uwtable ssp {
-entry:
- %add.ptr = getelementptr inbounds double* %A, i64 2
- %v.i = bitcast double* %A to <2 x double>*
- %0 = load <2 x double>* %v.i, align 1
- %shuffle.i.i = shufflevector <2 x double> %0, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
- %v1.i = bitcast double* %add.ptr to <2 x double>*
- %1 = load <2 x double>* %v1.i, align 1
- %2 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i, <2 x double> %1, i8 1) nounwind
- %add.ptr1 = getelementptr inbounds double* %A, i64 6
- %add.ptr2 = getelementptr inbounds double* %A, i64 4
- %v.i27 = bitcast double* %add.ptr2 to <2 x double>*
- %3 = load <2 x double>* %v.i27, align 1
- %shuffle.i.i28 = shufflevector <2 x double> %3, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
- %v1.i29 = bitcast double* %add.ptr1 to <2 x double>*
- %4 = load <2 x double>* %v1.i29, align 1
- %5 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i28, <2 x double> %4, i8 1) nounwind
- %6 = bitcast double* %C to <4 x double>*
- %7 = load <4 x double>* %6, align 32
- %add.ptr5 = getelementptr inbounds double* %C, i64 4
- %8 = bitcast double* %add.ptr5 to <4 x double>*
- %9 = load <4 x double>* %8, align 32
- %shuffle.i26 = shufflevector <4 x double> %7, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- %10 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %7, i8 1)
- %shuffle.i = shufflevector <4 x double> %9, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- %11 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %9, i8 1)
- store <2 x double> %shuffle.i26, <2 x double>* %v.i, align 16
- store <2 x double> %10, <2 x double>* %v1.i, align 16
- store <2 x double> %shuffle.i, <2 x double>* %v.i27, align 16
- store <2 x double> %11, <2 x double>* %v1.i29, align 16
- store <4 x double> %2, <4 x double>* %6, align 32
- store <4 x double> %5, <4 x double>* %8, align 32
- ret void
-}
-declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-; this test case just should not fail
-define void @test20() {
- %a0 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double 0.000000e+00, i32 2
- store <3 x double> %a0, <3 x double>* undef, align 1
- %a1 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double undef, i32 2
- store <3 x double> %a1, <3 x double>* undef, align 1
- ret void
-}
-
-define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
-; CHECK-LABEL: test_insert_64_zext
-; CHECK-NOT: xor
-; CHECK: vmovq
- %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
- ret <2 x i64> %1
-}
-
-;; Ensure we don't use insertps from non v4x32 vectors.
-;; On SSE4.1 it works because bigger vectors use more than 1 register.
-;; On AVX they get passed in a single register.
-;; FIXME: We could probably optimize this case, if we're only using the
-;; first 4 indices.
-define <4 x i32> @insert_from_diff_size(<8 x i32> %x) {
-; CHECK-LABEL: insert_from_diff_size:
-; CHECK-NOT: insertps
-; CHECK: ret
- %vecext = extractelement <8 x i32> %x, i32 0
- %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
- %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
- %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
- %a.0 = extractelement <8 x i32> %x, i32 0
- %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a.0, i32 3
- ret <4 x i32> %vecinit3
-}
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index b1b2f8b..98c1645 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -1,9 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; CHECK: vpunpcklbw %xmm
-; CHECK-NEXT: vpunpckhbw %xmm
-; CHECK-NEXT: vpshufd $85
+; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; CHECK-NEXT: vinsertf128 $1
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
entry:
@@ -11,8 +9,7 @@ entry:
ret <32 x i8> %shuffle
}
-; CHECK: vpunpckhwd %xmm
-; CHECK-NEXT: vpshufd $85
+; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11]
; CHECK-NEXT: vinsertf128 $1
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
entry:
@@ -21,7 +18,7 @@ entry:
}
; CHECK: vmovq
-; CHECK-NEXT: vmovlhps %xmm
+; CHECK-NEXT: vunpcklpd %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
entry:
@@ -32,7 +29,7 @@ entry:
ret <4 x i64> %vecinit6.i
}
-; CHECK: vpermilpd $0
+; CHECK: vunpcklpd %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
entry:
@@ -72,7 +69,7 @@ __load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_ex
ret <8 x float> %load_broadcast12281250
}
-; CHECK: vpshufd $0
+; CHECK: vpermilps $4
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcF(i32 %val) nounwind {
%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
@@ -81,7 +78,7 @@ define <8 x float> @funcF(i32 %val) nounwind {
ret <8 x float> %tmp
}
-; CHECK: vpshufd $0
+; CHECK: vpermilps $0
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
@@ -90,7 +87,7 @@ entry:
}
; CHECK: vextractf128 $1
-; CHECK-NEXT: vpshufd
+; CHECK-NEXT: vpermilps $85
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
diff --git a/test/CodeGen/X86/avx-vmovddup.ll b/test/CodeGen/X86/avx-vmovddup.ll
deleted file mode 100644
index 1c56fe2..0000000
--- a/test/CodeGen/X86/avx-vmovddup.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vmovddup %ymm
-define <4 x i64> @A(<4 x i64> %a) {
- %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
- ret <4 x i64> %c
-}
-
-; CHECK: vmovddup (%
-define <4 x i64> @B(<4 x i64>* %ptr) {
- %a = load <4 x i64>* %ptr
- %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
- ret <4 x i64> %c
-}
diff --git a/test/CodeGen/X86/avx-vperm2f128.ll b/test/CodeGen/X86/avx-vperm2f128.ll
deleted file mode 100644
index c20775b..0000000
--- a/test/CodeGen/X86/avx-vperm2f128.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: _A
-; CHECK: vperm2f128 $1
-define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
- ret <8 x float> %shuffle
-}
-
-; CHECK: _B
-; CHECK: vblendps $240
-define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
- ret <8 x float> %shuffle
-}
-
-; CHECK: _C
-; CHECK: vperm2f128 $0
-define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- ret <8 x float> %shuffle
-}
-
-; CHECK: _D
-; CHECK: vperm2f128 $17
-define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- ret <8 x float> %shuffle
-}
-
-; CHECK: _E
-; CHECK: vperm2f128 $17
-define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
- ret <32 x i8> %shuffle
-}
-
-; CHECK: _E2
-; CHECK: vperm2f128 $3
-define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
- ret <4 x i64> %shuffle
-}
-
-;;;; Cases with undef indicies mixed in the mask
-
-; CHECK: _F
-; CHECK: vperm2f128 $33
-define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
- ret <8 x float> %shuffle
-}
-
-;;;; Cases we must not select vperm2f128
-
-; CHECK: _G
-; CHECK-NOT: vperm2f128
-define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
- ret <8 x float> %shuffle
-}
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
new file mode 100644
index 0000000..a103405
--- /dev/null
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -0,0 +1,202 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+
+define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: A:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: B:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: C:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: D:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; ALL-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %shuffle
+}
+
+define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: E:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; ALL-NEXT: retq
+entry:
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %shuffle
+}
+
+define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: E2:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; ALL-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x i64> %shuffle
+}
+
+define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: Ei:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: Ei:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: retq
+entry:
+ ; add forces execution domain
+ %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %shuffle
+}
+
+define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E2i:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: E2i:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX2-NEXT: retq
+entry:
+ ; add forces execution domain
+ %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
+ %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x i64> %shuffle
+}
+
+define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E3i:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: E3i:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: retq
+entry:
+ ; add forces execution domain
+ %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i32> %shuffle
+}
+
+define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E4i:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: E4i:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+entry:
+ ; add forces execution domain
+ %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E5i:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps (%rsi), %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: E5i:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa (%rsi), %ymm1
+; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %c = load <16 x i16>* %a
+ %d = load <16 x i16>* %b
+ %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i16> %shuffle
+}
+
+;;;; Cases with undef indices mixed in the mask
+
+define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: F:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1,0,1]
+; ALL-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
+ ret <8 x float> %shuffle
+}
+
+;;;; Cases where we must not select vperm2f128
+
+define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: G:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: G:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; AVX2-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
+ ret <8 x float> %shuffle
+}
diff --git a/test/CodeGen/X86/avx-vpermil.ll b/test/CodeGen/X86/avx-vpermil.ll
deleted file mode 100644
index b7f8d72..0000000
--- a/test/CodeGen/X86/avx-vpermil.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vpermilps
-define <8 x float> @funcA(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5>
- ret <8 x float> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x double> @funcB(<4 x double> %a) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
- ret <4 x double> %shuffle
-}
-
-; CHECK: vpermilps
-define <8 x i32> @funcC(<8 x i32> %a) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5>
- ret <8 x i32> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x i64> @funcD(<4 x i64> %a) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
- ret <4 x i64> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x i64> @funcQ(<4 x i64>* %a) nounwind uwtable readnone ssp {
-entry:
- %a2 = load <4 x i64>* %a
- %shuffle = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
- ret <4 x i64> %shuffle
-}
-
-; vpermil should match masks like this: <u,3,1,2,4,u,5,6>. Check that the
-; target specific mask was correctly generated.
-; CHECK: vpermilps $-100
-define <8 x float> @funcE(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 8, i32 3, i32 1, i32 2, i32 4, i32 8, i32 5, i32 6>
- ret <8 x float> %shuffle
-}
-
-; CHECK: palignr $8
-; CHECK: palignr $8
-define <8 x float> @funcF(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
- ret <8 x float> %shuffle
-}
diff --git a/test/CodeGen/X86/avx-vshufp.ll b/test/CodeGen/X86/avx-vshufp.ll
deleted file mode 100644
index ad3dbc1..0000000
--- a/test/CodeGen/X86/avx-vshufp.ll
+++ /dev/null
@@ -1,157 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vshufps $-53, %ymm
-define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
- ret <8 x float> %shuffle
-}
-
-; CHECK: vshufps $-53, (%{{.*}}), %ymm
-define <8 x float> @A2(<8 x float>* %a, <8 x float>* %b) nounwind uwtable readnone ssp {
-entry:
- %a2 = load <8 x float>* %a
- %b2 = load <8 x float>* %b
- %shuffle = shufflevector <8 x float> %a2, <8 x float> %b2, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
- ret <8 x float> %shuffle
-}
-
-; CHECK: vshufps $-53, %ymm
-define <8 x i32> @A3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
- ret <8 x i32> %shuffle
-}
-
-; CHECK: vshufps $-53, (%{{.*}}), %ymm
-define <8 x i32> @A4(<8 x i32>* %a, <8 x i32>* %b) nounwind uwtable readnone ssp {
-entry:
- %a2 = load <8 x i32>* %a
- %b2 = load <8 x i32>* %b
- %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b2, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
- ret <8 x i32> %shuffle
-}
-
-; CHECK: vblendpd $10, %ymm
-define <4 x double> @B(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x double> %shuffle
-}
-
-; CHECK: vblendpd $10, (%{{.*}}), %ymm
-define <4 x double> @B2(<4 x double>* %a, <4 x double>* %b) nounwind uwtable readnone ssp {
-entry:
- %a2 = load <4 x double>* %a
- %b2 = load <4 x double>* %b
- %shuffle = shufflevector <4 x double> %a2, <4 x double> %b2, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x double> %shuffle
-}
-
-; CHECK: vblendpd $10, %ymm
-define <4 x i64> @B3(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x i64> %shuffle
-}
-
-; CHECK: vblendpd $10, (%{{.*}}), %ymm
-define <4 x i64> @B4(<4 x i64>* %a, <4 x i64>* %b) nounwind uwtable readnone ssp {
-entry:
- %a2 = load <4 x i64>* %a
- %b2 = load <4 x i64>* %b
- %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b2, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x i64> %shuffle
-}
-
-; CHECK: vshufps $-53, %ymm
-define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 undef, i32 undef, i32 11, i32 undef, i32 6, i32 12, i32 undef>
- ret <8 x float> %shuffle
-}
-
-; CHECK: vblendpd $2, %ymm
-define <4 x double> @D(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 undef>
- ret <4 x double> %shuffle
-}
-
-; CHECK: vshufps $-55, %ymm
-define <8 x float> @E(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 10, i32 0, i32 3, i32 13, i32 14, i32 4, i32 7>
- ret <8 x float> %shuffle
-}
-
-; CHECK: vshufpd $8, %ymm
-define <4 x double> @F(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 7>
- ret <4 x double> %shuffle
-}
-
-; CHECK: vshufps $-53, %xmm
-define <4 x float> @A128(<4 x float> %a, <4 x float> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
- ret <4 x float> %shuffle
-}
-
-; CHECK: vshufps $-53, (%{{.*}}), %xmm
-define <4 x float> @A2128(<4 x float>* %a, <4 x float>* %b) nounwind uwtable readnone ssp {
-entry:
- %a2 = load <4 x float>* %a
- %b2 = load <4 x float>* %b
- %shuffle = shufflevector <4 x float> %a2, <4 x float> %b2, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
- ret <4 x float> %shuffle
-}
-
-; CHECK: vshufps $-53, %xmm
-define <4 x i32> @A3128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
- ret <4 x i32> %shuffle
-}
-
-; CHECK: vshufps $-53, (%{{.*}}), %xmm
-define <4 x i32> @A4128(<4 x i32>* %a, <4 x i32>* %b) nounwind uwtable readnone ssp {
-entry:
- %a2 = load <4 x i32>* %a
- %b2 = load <4 x i32>* %b
- %shuffle = shufflevector <4 x i32> %a2, <4 x i32> %b2, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
- ret <4 x i32> %shuffle
-}
-
-; CHECK: vshufpd $1, %xmm
-define <2 x double> @B128(<2 x double> %a, <2 x double> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2>
- ret <2 x double> %shuffle
-}
-
-; CHECK: vshufpd $1, (%{{.*}}), %xmm
-define <2 x double> @B2128(<2 x double>* %a, <2 x double>* %b) nounwind uwtable readnone ssp {
-entry:
- %a2 = load <2 x double>* %a
- %b2 = load <2 x double>* %b
- %shuffle = shufflevector <2 x double> %a2, <2 x double> %b2, <2 x i32> <i32 1, i32 2>
- ret <2 x double> %shuffle
-}
-
-; CHECK: vshufpd $1, %xmm
-define <2 x i64> @B3128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
-entry:
- %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
- ret <2 x i64> %shuffle
-}
-
-; CHECK: vshufpd $1, (%{{.*}}), %xmm
-define <2 x i64> @B4128(<2 x i64>* %a, <2 x i64>* %b) nounwind uwtable readnone ssp {
-entry:
- %a2 = load <2 x i64>* %a
- %b2 = load <2 x i64>* %b
- %shuffle = shufflevector <2 x i64> %a2, <2 x i64> %b2, <2 x i32> <i32 1, i32 2>
- ret <2 x i64> %shuffle
-}
diff --git a/test/CodeGen/X86/avx-zext.ll b/test/CodeGen/X86/avx-zext.ll
deleted file mode 100644
index 7511746..0000000
--- a/test/CodeGen/X86/avx-zext.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-;CHECK-LABEL: zext_8i16_to_8i32:
-;CHECK: vpunpckhwd
-;CHECK: ret
-
- %B = zext <8 x i16> %A to <8 x i32>
- ret <8 x i32>%B
-}
-
-define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-;CHECK-LABEL: zext_4i32_to_4i64:
-;CHECK: vpunpckhdq
-;CHECK: ret
-
- %B = zext <4 x i32> %A to <4 x i64>
- ret <4 x i64>%B
-}
-
-define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
-;CHECK-LABEL: zext_8i8_to_8i32:
-;CHECK: vpunpckhwd
-;CHECK: vpmovzxwd
-;CHECK: vinsertf128
-;CHECK: ret
- %t = zext <8 x i8> %z to <8 x i32>
- ret <8 x i32> %t
-}
-
-; PR17654
-define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
-; CHECK-LABEL: zext_16i8_to_16i16:
-; CHECK: vpxor
-; CHECK: vpunpckhbw
-; CHECK: vpunpcklbw
-; CHECK: vinsertf128
-; CHECK: ret
- %t = zext <16 x i8> %z to <16 x i16>
- ret <16 x i16> %t
-}
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
index 6069c14..cba6d98 100644
--- a/test/CodeGen/X86/avx.ll
+++ b/test/CodeGen/X86/avx.ll
@@ -60,7 +60,7 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
; X32: movl 8(%esp), %ecx
; CHECK-NOT: mov
;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK: vinsertps $-64, 12(%{{...}},%{{...}}), %
; CHECK-NEXT: ret
%1 = getelementptr inbounds <4 x float>* %pb, i64 %index
%2 = load <4 x float>* %1, align 16
diff --git a/test/CodeGen/X86/avx1-stack-reload-folding.ll b/test/CodeGen/X86/avx1-stack-reload-folding.ll
new file mode 100644
index 0000000..2e669b0
--- /dev/null
+++ b/test/CodeGen/X86/avx1-stack-reload-folding.ll
@@ -0,0 +1,68 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests - we use the 'big vectors' pattern to guarantee spilling to stack.
+;
+; Many of these tests are primarily to check memory folding with specific instructions. Using a basic
+; load/cvt/store pattern to test for this would mean that it wouldn't be the memory folding code that's
+; being tested - the load-execute version of the instruction from the tables would be matched instead.
+
+define void @stack_fold_vmulpd(<64 x double>* %a, <64 x double>* %b, <64 x double>* %c) {
+ ;CHECK-LABEL: stack_fold_vmulpd
+ ;CHECK: vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+
+ %1 = load <64 x double>* %a
+ %2 = load <64 x double>* %b
+ %3 = fadd <64 x double> %1, %2
+ %4 = fsub <64 x double> %1, %2
+ %5 = fmul <64 x double> %3, %4
+ store <64 x double> %5, <64 x double>* %c
+ ret void
+}
+
+define void @stack_fold_cvtdq2ps(<128 x i32>* %a, <128 x i32>* %b, <128 x float>* %c) {
+ ;CHECK-LABEL: stack_fold_cvtdq2ps
+ ;CHECK: vcvtdq2ps {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+
+ %1 = load <128 x i32>* %a
+ %2 = load <128 x i32>* %b
+ %3 = and <128 x i32> %1, %2
+ %4 = xor <128 x i32> %1, %2
+ %5 = sitofp <128 x i32> %3 to <128 x float>
+ %6 = sitofp <128 x i32> %4 to <128 x float>
+ %7 = fadd <128 x float> %5, %6
+ store <128 x float> %7, <128 x float>* %c
+ ret void
+}
+
+define void @stack_fold_cvttpd2dq(<64 x double>* %a, <64 x double>* %b, <64 x i32>* %c) #0 {
+ ;CHECK-LABEL: stack_fold_cvttpd2dq
+ ;CHECK: vcvttpd2dqy {{[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+
+ %1 = load <64 x double>* %a
+ %2 = load <64 x double>* %b
+ %3 = fadd <64 x double> %1, %2
+ %4 = fsub <64 x double> %1, %2
+ %5 = fptosi <64 x double> %3 to <64 x i32>
+ %6 = fptosi <64 x double> %4 to <64 x i32>
+ %7 = or <64 x i32> %5, %6
+ store <64 x i32> %7, <64 x i32>* %c
+ ret void
+}
+
+define void @stack_fold_cvttps2dq(<128 x float>* %a, <128 x float>* %b, <128 x i32>* %c) #0 {
+ ;CHECK-LABEL: stack_fold_cvttps2dq
+ ;CHECK: vcvttps2dq {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+
+ %1 = load <128 x float>* %a
+ %2 = load <128 x float>* %b
+ %3 = fadd <128 x float> %1, %2
+ %4 = fsub <128 x float> %1, %2
+ %5 = fptosi <128 x float> %3 to <128 x i32>
+ %6 = fptosi <128 x float> %4 to <128 x i32>
+ %7 = or <128 x i32> %5, %6
+ store <128 x i32> %7, <128 x i32>* %c
+ ret void
+}
diff --git a/test/CodeGen/X86/avx2-blend.ll b/test/CodeGen/X86/avx2-blend.ll
deleted file mode 100644
index b02442b..0000000
--- a/test/CodeGen/X86/avx2-blend.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
-
-define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
-; CHECK-LABEL: constant_pblendvb_avx2:
-; CHECK: vmovdqa
-; CHECK: vpblendvb
- %1 = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd
- ret <32 x i8> %1
-}
-
-declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
new file mode 100644
index 0000000..ac2c73b
--- /dev/null
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=core-avx2 -mattr=avx2 | FileCheck %s
+
+define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+ ; CHECK: vpblendw
+ %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone
+
+
+define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: vpblendd
+ %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
+ ; CHECK: vpblendd
+ %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
+ ; CHECK: vmpsadbw
+ %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone
+
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index ab3d591..84b22b7 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -158,21 +158,21 @@ define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) {
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
- ; CHECK: vpslldq
- %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
- ; CHECK: vpslldq
- %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
+
+
+define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
+ ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+ %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
+ ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+ %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
@@ -254,21 +254,21 @@ define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) {
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
- ; CHECK: vpsrldq
- %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
- ; CHECK: vpsrldq
- %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
+
+
+define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
+ ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+ %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
+ ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+ %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
@@ -475,10 +475,10 @@ declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK: vmpsadbw
- %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+ %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
-declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
@@ -499,10 +499,10 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw
define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK: vpblendw
- %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+ %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
-declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone
define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
@@ -706,18 +706,18 @@ declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind re
define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK: vpblendd
- %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
+ %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone
+declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone
define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK: vpblendd
- %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
+ %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
-declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone
+declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) {
diff --git a/test/CodeGen/X86/avx2-palignr.ll b/test/CodeGen/X86/avx2-palignr.ll
deleted file mode 100644
index 83573dc..0000000
--- a/test/CodeGen/X86/avx2-palignr.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-define <8 x i32> @test1(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test1:
-; CHECK: vpalignr $4
- %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12>
- ret <8 x i32> %C
-}
-
-define <8 x i32> @test2(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test2:
-; CHECK: vpalignr $4
- %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 undef, i32 12>
- ret <8 x i32> %C
-}
-
-define <8 x i32> @test3(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test3:
-; CHECK: vpalignr $4
- %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 undef, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12>
- ret <8 x i32> %C
-}
-;
-define <8 x i32> @test4(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test4:
-; CHECK: vpalignr $8
- %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 10, i32 11, i32 undef, i32 1, i32 14, i32 15, i32 4, i32 5>
- ret <8 x i32> %C
-}
-
-define <16 x i16> @test5(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test5:
-; CHECK: vpalignr $6
- %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 3, i32 4, i32 undef, i32 6, i32 7, i32 16, i32 17, i32 18, i32 11, i32 12, i32 13, i32 undef, i32 15, i32 24, i32 25, i32 26>
- ret <16 x i16> %C
-}
-
-define <16 x i16> @test6(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test6:
-; CHECK: vpalignr $6
- %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 11, i32 12, i32 13, i32 undef, i32 15, i32 24, i32 25, i32 26>
- ret <16 x i16> %C
-}
-
-define <16 x i16> @test7(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test7:
-; CHECK: vpalignr $6
- %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i16> %C
-}
-
-define <32 x i8> @test8(<32 x i8> %A, <32 x i8> %B) nounwind {
-; CHECK-LABEL: test8:
-; CHECK: vpalignr $5
- %C = shufflevector <32 x i8> %A, <32 x i8> %B, <32 x i32> <i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52>
- ret <32 x i8> %C
-}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
deleted file mode 100644
index 185b989..0000000
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ /dev/null
@@ -1,127 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; Make sure that we don't match this shuffle using the vpblendw YMM instruction.
-; The mask for the vpblendw instruction needs to be identical for both halves
-; of the YMM. Need to use two vpblendw instructions.
-
-; CHECK: vpblendw_test1
-; mask = 10010110,b = 150,d
-; CHECK: vpblendw $150, %ymm
-; CHECK: ret
-define <16 x i16> @vpblendw_test1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
- %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 23,
- i32 8, i32 25, i32 26, i32 11, i32 28, i32 13, i32 14, i32 31>
- ret <16 x i16> %t
-}
-
-; CHECK: vpblendw_test2
-; mask1 = 00010110 = 22
-; mask2 = 10000000 = 128
-; CHECK: vpblendw $128, %xmm
-; CHECK: vpblendw $22, %xmm
-; CHECK: vinserti128
-; CHECK: ret
-define <16 x i16> @vpblendw_test2(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
- %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7,
- i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
- ret <16 x i16> %t
-}
-
-; CHECK: blend_test1
-; CHECK: vpblendd
-; CHECK: ret
-define <8 x i32> @blend_test1(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
- %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
- ret <8 x i32> %t
-}
-
-; CHECK: blend_test2
-; CHECK: vpblendd
-; CHECK: ret
-define <8 x i32> @blend_test2(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
- %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
- ret <8 x i32> %t
-}
-
-
-; CHECK: blend_test3
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @blend_test3(<8 x float> %a, <8 x float> %b) nounwind alwaysinline {
- %t = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
- ret <8 x float> %t
-}
-
-; CHECK: blend_test4
-; CHECK: vblendpd
-; CHECK: ret
-define <4 x i64> @blend_test4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
- %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
- ret <4 x i64> %t
-}
-
-;; 2 tests for shufflevectors that optimize to blend + immediate
-; CHECK-LABEL: @blend_test5
-; CHECK: vpblendd $10, %xmm1, %xmm0, %xmm0
-; CHECK: ret
-define <4 x i32> @blend_test5(<4 x i32> %a, <4 x i32> %b) {
- %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x i32> %1
-}
-
-; CHECK-LABEL: @blend_test6
-; CHECK: vpblendw $134, %ymm1, %ymm0, %ymm0
-; CHECK: ret
-define <16 x i16> @blend_test6(<16 x i16> %a, <16 x i16> %b) {
- %1 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 23,
- i32 8, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 31>
- ret <16 x i16> %1
-}
-
-; CHECK: vpshufhw $27, %ymm
-define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
- ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpshuflw $27, %ymm
-define <16 x i16> @vpshuflw(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpshufb_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
- %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
- i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
- i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
- i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
- ret <32 x i8>%S
-}
-
-; CHECK: vpshufb1_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
- %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
- i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,
- i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
- i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
- ret <32 x i8>%S
-}
-
-
-; CHECK: vpshufb2_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind {
- %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
- i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,
- i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
- i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
- ret <32 x i8>%S
-}
diff --git a/test/CodeGen/X86/avx2-unpack.ll b/test/CodeGen/X86/avx2-unpack.ll
deleted file mode 100644
index 6d17443..0000000
--- a/test/CodeGen/X86/avx2-unpack.ll
+++ /dev/null
@@ -1,86 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; CHECK: vpunpckhdq
-define <8 x i32> @unpackhidq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
- ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpckhqdq
-define <4 x i64> @unpackhiqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
- ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckldq
-define <8 x i32> @unpacklodq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
- ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpcklqdq
-define <4 x i64> @unpacklqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
- ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckhwd
-define <16 x i16> @unpackhwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
- ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpcklwd
-define <16 x i16> @unpacklwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
- ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpckhbw
-define <32 x i8> @unpackhbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
- ret <32 x i8> %shuffle.i
-}
-
-; CHECK: vpunpcklbw
-define <32 x i8> @unpacklbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
- ret <32 x i8> %shuffle.i
-}
-
-; CHECK: vpunpckhdq
-define <8 x i32> @unpackhidq1_undef(<8 x i32> %src1) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
- ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpckhqdq
-define <4 x i64> @unpackhiqdq1_undef(<4 x i64> %src1) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
- ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckhwd
-define <16 x i16> @unpackhwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
- ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpcklwd
-define <16 x i16> @unpacklwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
- %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
- ret <16 x i16> %shuffle.i
-}
-
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 66f586d..924c06e 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -317,7 +317,7 @@ define <4 x double> @_inreg4xdouble(<4 x double> %a) {
}
;CHECK-LABEL: _inreg2xdouble:
-;CHECK: vpbroadcastq
+;CHECK: vunpcklpd
;CHECK: ret
define <2 x double> @_inreg2xdouble(<2 x double> %a) {
%b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
diff --git a/test/CodeGen/X86/avx2-vperm2i128.ll b/test/CodeGen/X86/avx2-vperm2i128.ll
deleted file mode 100644
index 1937db5..0000000
--- a/test/CodeGen/X86/avx2-vperm2i128.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; CHECK: vperm2i128 $17
-define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-entry:
- ; add forces execution domain
- %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
- %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
- ret <32 x i8> %shuffle
-}
-
-; CHECK: vperm2i128 $3
-define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
- ; add forces execution domain
- %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
- %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
- ret <4 x i64> %shuffle
-}
-
-; CHECK: vperm2i128 $49
-define <8 x i32> @E3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-entry:
- ; add forces execution domain
- %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
- %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
- ret <8 x i32> %shuffle
-}
-
-; CHECK: vperm2i128 $2
-define <16 x i16> @E4(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
-entry:
- ; add forces execution domain
- %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
- %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <16 x i16> %shuffle
-}
-
-; CHECK: vperm2i128 $2, (%
-define <16 x i16> @E5(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
-entry:
- %c = load <16 x i16>* %a
- %d = load <16 x i16>* %b
- %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
- %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <16 x i16> %shuffle
-}
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index 4d1c9f7..c43da9c 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -1,189 +1,217 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-; CHECK-LABEL: addpd512
-; CHECK: vaddpd
-; CHECK: ret
define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: addpd512:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
entry:
%add.i = fadd <8 x double> %x, %y
ret <8 x double> %add.i
}
-; CHECK-LABEL: addpd512fold
-; CHECK: vaddpd LCP{{.*}}(%rip)
-; CHECK: ret
define <8 x double> @addpd512fold(<8 x double> %y) {
+; CHECK-LABEL: addpd512fold:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: retq
entry:
%add.i = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.800000e+00, double 2.300000e+00, double 1.200000e+00>
ret <8 x double> %add.i
}
-; CHECK-LABEL: addps512
-; CHECK: vaddps
-; CHECK: ret
define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: addps512:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
entry:
%add.i = fadd <16 x float> %x, %y
ret <16 x float> %add.i
}
-; CHECK-LABEL: addps512fold
-; CHECK: vaddps LCP{{.*}}(%rip)
-; CHECK: ret
define <16 x float> @addps512fold(<16 x float> %y) {
+; CHECK-LABEL: addps512fold:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: retq
entry:
%add.i = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 4.500000e+00, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
ret <16 x float> %add.i
}
-; CHECK-LABEL: subpd512
-; CHECK: vsubpd
-; CHECK: ret
define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: subpd512:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vsubpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
entry:
%sub.i = fsub <8 x double> %x, %y
ret <8 x double> %sub.i
}
-; CHECK-LABEL: @subpd512fold
-; CHECK: vsubpd (%
-; CHECK: ret
define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) {
+; CHECK-LABEL: subpd512fold:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vsubpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
entry:
%tmp2 = load <8 x double>* %x, align 8
%sub.i = fsub <8 x double> %y, %tmp2
ret <8 x double> %sub.i
}
-; CHECK-LABEL: @subps512
-; CHECK: vsubps
-; CHECK: ret
define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: subps512:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vsubps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
entry:
%sub.i = fsub <16 x float> %x, %y
ret <16 x float> %sub.i
}
-; CHECK-LABEL: subps512fold
-; CHECK: vsubps (%
-; CHECK: ret
define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) {
+; CHECK-LABEL: subps512fold:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vsubps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
entry:
%tmp2 = load <16 x float>* %x, align 4
%sub.i = fsub <16 x float> %y, %tmp2
ret <16 x float> %sub.i
}
-; CHECK-LABEL: imulq512
-; CHECK: vpmuludq
-; CHECK: vpmuludq
-; CHECK: ret
define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
+; CHECK-LABEL: imulq512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm2
+; CHECK-NEXT: vpsrlq $32, %zmm0, %zmm3
+; CHECK-NEXT: vpmuludq %zmm3, %zmm1, %zmm3
+; CHECK-NEXT: vpsllq $32, %zmm3, %zmm3
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; CHECK-NEXT: vpsrlq $32, %zmm1, %zmm1
+; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%z = mul <8 x i64>%x, %y
ret <8 x i64>%z
}
-; CHECK-LABEL: mulpd512
-; CHECK: vmulpd
-; CHECK: ret
define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: mulpd512:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmulpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
entry:
%mul.i = fmul <8 x double> %x, %y
ret <8 x double> %mul.i
}
-; CHECK-LABEL: mulpd512fold
-; CHECK: vmulpd LCP{{.*}}(%rip)
-; CHECK: ret
define <8 x double> @mulpd512fold(<8 x double> %y) {
+; CHECK-LABEL: mulpd512fold:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: retq
entry:
%mul.i = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
ret <8 x double> %mul.i
}
-; CHECK-LABEL: mulps512
-; CHECK: vmulps
-; CHECK: ret
define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: mulps512:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmulps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
entry:
%mul.i = fmul <16 x float> %x, %y
ret <16 x float> %mul.i
}
-; CHECK-LABEL: mulps512fold
-; CHECK: vmulps LCP{{.*}}(%rip)
-; CHECK: ret
define <16 x float> @mulps512fold(<16 x float> %y) {
+; CHECK-LABEL: mulps512fold:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: retq
entry:
%mul.i = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
ret <16 x float> %mul.i
}
-; CHECK-LABEL: divpd512
-; CHECK: vdivpd
-; CHECK: ret
define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: divpd512:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
entry:
%div.i = fdiv <8 x double> %x, %y
ret <8 x double> %div.i
}
-; CHECK-LABEL: divpd512fold
-; CHECK: vdivpd LCP{{.*}}(%rip)
-; CHECK: ret
define <8 x double> @divpd512fold(<8 x double> %y) {
+; CHECK-LABEL: divpd512fold:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: retq
entry:
%div.i = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
ret <8 x double> %div.i
}
-; CHECK-LABEL: divps512
-; CHECK: vdivps
-; CHECK: ret
define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: divps512:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
entry:
%div.i = fdiv <16 x float> %x, %y
ret <16 x float> %div.i
}
-; CHECK-LABEL: divps512fold
-; CHECK: vdivps LCP{{.*}}(%rip)
-; CHECK: ret
define <16 x float> @divps512fold(<16 x float> %y) {
+; CHECK-LABEL: divps512fold:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: retq
entry:
%div.i = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000>
ret <16 x float> %div.i
}
-; CHECK-LABEL: vpaddq_test
-; CHECK: vpaddq %zmm
-; CHECK: ret
define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+; CHECK-LABEL: vpaddq_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%x = add <8 x i64> %i, %j
ret <8 x i64> %x
}
-; CHECK-LABEL: vpaddq_fold_test
-; CHECK: vpaddq (%
-; CHECK: ret
define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind {
+; CHECK-LABEL: vpaddq_fold_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
%tmp = load <8 x i64>* %j, align 4
%x = add <8 x i64> %i, %tmp
ret <8 x i64> %x
}
-; CHECK-LABEL: vpaddq_broadcast_test
-; CHECK: vpaddq LCP{{.*}}(%rip){1to8}
-; CHECK: ret
define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
+; CHECK-LABEL: vpaddq_broadcast_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT: retq
%x = add <8 x i64> %i, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
ret <8 x i64> %x
}
-; CHECK-LABEL: vpaddq_broadcast2_test
-; CHECK: vpaddq (%rdi){1to8}
-; CHECK: ret
define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
+; CHECK-LABEL: vpaddq_broadcast2_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT: retq
%tmp = load i64* %j
%j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0
%j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1
@@ -197,55 +225,67 @@ define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
ret <8 x i64> %x
}
-; CHECK-LABEL: vpaddd_test
-; CHECK: vpaddd %zmm
-; CHECK: ret
define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+; CHECK-LABEL: vpaddd_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%x = add <16 x i32> %i, %j
ret <16 x i32> %x
}
-; CHECK-LABEL: vpaddd_fold_test
-; CHECK: vpaddd (%
-; CHECK: ret
define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind {
+; CHECK-LABEL: vpaddd_fold_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
%tmp = load <16 x i32>* %j, align 4
%x = add <16 x i32> %i, %tmp
ret <16 x i32> %x
}
-; CHECK-LABEL: vpaddd_broadcast_test
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}
-; CHECK: ret
define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
+; CHECK-LABEL: vpaddd_broadcast_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT: retq
%x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i32> %x
}
-; CHECK-LABEL: vpaddd_mask_test
-; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
-; CHECK: ret
define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_mask_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%x = add <16 x i32> %i, %j
%r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
ret <16 x i32> %r
}
-; CHECK-LABEL: vpaddd_maskz_test
-; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z} }}
-; CHECK: ret
define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_maskz_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%x = add <16 x i32> %i, %j
%r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
ret <16 x i32> %r
}
-; CHECK-LABEL: vpaddd_mask_fold_test
-; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
-; CHECK: ret
define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_mask_fold_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%j = load <16 x i32>* %j.ptr
%x = add <16 x i32> %i, %j
@@ -253,20 +293,26 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
ret <16 x i32> %r
}
-; CHECK-LABEL: vpaddd_mask_broadcast_test
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
-; CHECK: ret
define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_mask_broadcast_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
ret <16 x i32> %r
}
-; CHECK-LABEL: vpaddd_maskz_fold_test
-; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z}
-; CHECK: ret
define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_maskz_fold_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%j = load <16 x i32>* %j.ptr
%x = add <16 x i32> %i, %j
@@ -274,125 +320,141 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
ret <16 x i32> %r
}
-; CHECK-LABEL: vpaddd_maskz_broadcast_test
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z}
-; CHECK: ret
define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_maskz_broadcast_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
ret <16 x i32> %r
}
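+
+; Note: the "{1to16}"/"{1to8}" operand suffix is AVX-512 embedded broadcast:
+; a single 32- or 64-bit memory element is splat to every lane, so the splat
+; constants above fold into one scalar constant-pool load instead of a full
+; 64-byte vector load.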
-; CHECK-LABEL: vpsubq_test
-; CHECK: vpsubq %zmm
-; CHECK: ret
define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+; CHECK-LABEL: vpsubq_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%x = sub <8 x i64> %i, %j
ret <8 x i64> %x
}
-; CHECK-LABEL: vpsubd_test
-; CHECK: vpsubd
-; CHECK: ret
define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+; CHECK-LABEL: vpsubd_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%x = sub <16 x i32> %i, %j
ret <16 x i32> %x
}
-; CHECK-LABEL: vpmulld_test
-; CHECK: vpmulld %zmm
-; CHECK: ret
define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
+; CHECK-LABEL: vpmulld_test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%x = mul <16 x i32> %i, %j
ret <16 x i32> %x
}
-; CHECK-LABEL: sqrtA
-; CHECK: vsqrtss {{.*}} encoding: [0x62
-; CHECK: ret
declare float @sqrtf(float) readnone
define float @sqrtA(float %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: sqrtA:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
entry:
%conv1 = tail call float @sqrtf(float %a) nounwind readnone
ret float %conv1
}
-; CHECK-LABEL: sqrtB
-; CHECK: vsqrtsd {{.*}}## encoding: [0x62
-; CHECK: ret
declare double @sqrt(double) readnone
define double @sqrtB(double %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: sqrtB:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
entry:
%call = tail call double @sqrt(double %a) nounwind readnone
ret double %call
}
-; CHECK-LABEL: sqrtC
-; CHECK: vsqrtss {{.*}}## encoding: [0x62
-; CHECK: ret
declare float @llvm.sqrt.f32(float)
define float @sqrtC(float %a) nounwind {
+; CHECK-LABEL: sqrtC:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
%b = call float @llvm.sqrt.f32(float %a)
ret float %b
}
-; CHECK-LABEL: sqrtD
-; CHECK: vsqrtps {{.*}}
-; CHECK: ret
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
define <16 x float> @sqrtD(<16 x float> %a) nounwind {
+; CHECK-LABEL: sqrtD:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vsqrtps %zmm0, %zmm0
+; CHECK-NEXT: retq
%b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
ret <16 x float> %b
}
-; CHECK-LABEL: sqrtE
-; CHECK: vsqrtpd {{.*}}
-; CHECK: ret
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
define <8 x double> @sqrtE(<8 x double> %a) nounwind {
+; CHECK-LABEL: sqrtE:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
+; CHECK-NEXT: retq
%b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
ret <8 x double> %b
}
-; CHECK-LABEL: fadd_broadcast
-; CHECK: LCP{{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK: ret
define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
+; CHECK-LABEL: fadd_broadcast:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT: retq
%b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
ret <16 x float> %b
}
-; CHECK-LABEL: addq_broadcast
-; CHECK: vpaddq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK: ret
define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
+; CHECK-LABEL: addq_broadcast:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT: retq
%b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
ret <8 x i64> %b
}
-; CHECK-LABEL: orq_broadcast
-; CHECK: vporq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK: ret
define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
+; CHECK-LABEL: orq_broadcast:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT: retq
%b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
ret <8 x i64> %b
}
-; CHECK-LABEL: andd512fold
-; CHECK: vpandd (%
-; CHECK: ret
define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
+; CHECK-LABEL: andd512fold:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpandd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
entry:
%a = load <16 x i32>* %x, align 4
%b = and <16 x i32> %y, %a
ret <16 x i32> %b
}
-; CHECK-LABEL: andqbrst
-; CHECK: vpandq (%rdi){1to8}, %zmm
-; CHECK: ret
define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
+; CHECK-LABEL: andqbrst:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT: retq
entry:
%a = load i64* %ap, align 8
%b = insertelement <8 x i64> undef, i64 %a, i32 0
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index b5a2aa8..9e9ad31 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -1,30 +1,43 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-; CHECK-LABEL: test1
-; CHECK: vpxord
-; CHECK: ret
define <16 x i32> @test1(i32* %x) {
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd (%rdi), %xmm0
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7]
+; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%y = load i32* %x, align 4
%res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4
ret <16 x i32>%res
}
-; CHECK-LABEL: test2
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}
-; CHECK: ret
define <16 x i32> @test2(<16 x i32> %x) {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT: retq
%res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x
ret <16 x i32>%res
}
-; CHECK-LABEL: test3
-; CHECK: vinsertf128
-; CHECK: vinsertf64x4
-; CHECK: ret
define <16 x float> @test3(<4 x float> %a) {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vmovss %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,0],xmm0[0,1]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%b = extractelement <4 x float> %a, i32 2
%c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
%b1 = extractelement <4 x float> %a, i32 0
%c1 = insertelement <16 x float> %c, float %b1, i32 6
ret <16 x float>%c1
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index 47e50a9..6e0d185 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -28,10 +28,9 @@ l2:
ret float %c1
}
+; FIXME: Can use vcmpeqss and extract from the mask here in AVX512.
; CHECK-LABEL: test3
-; CHECK: vcmpeqss
-; CHECK: kmov
-; CHECK: ret
+; CHECK: vucomiss {{.*}}encoding: [0x62
define i32 @test3(float %a, float %b) {
%cmp10.i = fcmp oeq float %a, %b
@@ -86,3 +85,17 @@ define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
%res = select i1 %tmp5, i32 1, i32 %a3
ret i32 %res
}
+
+; CHECK-LABEL: test9
+; CHECK: testb
+; CHECK-NOT: kmov
+; CHECK: ret
+define i32 @test9(i64 %a) {
+ %b = and i64 %a, 1
+ %cmp10.i = icmp eq i64 %b, 0
+ br i1 %cmp10.i, label %A, label %B
+A:
+ ret i32 6
+B:
+ ret i32 7
+}
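+
+; Note: test9 checks that a scalar i1 branch condition stays in a GPR (testb)
+; and is not round-tripped through an AVX-512 mask register (CHECK-NOT: kmov).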
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index f5cda96..2b672a7 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -255,3 +255,56 @@ define double @uitofp03(i32 %a) nounwind {
%b = uitofp i32 %a to double
ret double %b
}
+
+; CHECK-LABEL: @sitofp_16i1_float
+; CHECK: vpbroadcastd
+; CHECK: vcvtdq2ps
+define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
+ %mask = icmp slt <16 x i32> %a, zeroinitializer
+ %1 = sitofp <16 x i1> %mask to <16 x float>
+ ret <16 x float> %1
+}
+
+; CHECK-LABEL: @sitofp_16i8_float
+; CHECK: vpmovsxbd
+; CHECK: vcvtdq2ps
+define <16 x float> @sitofp_16i8_float(<16 x i8> %a) {
+ %1 = sitofp <16 x i8> %a to <16 x float>
+ ret <16 x float> %1
+}
+
+; CHECK-LABEL: @sitofp_16i16_float
+; CHECK: vpmovsxwd
+; CHECK: vcvtdq2ps
+define <16 x float> @sitofp_16i16_float(<16 x i16> %a) {
+ %1 = sitofp <16 x i16> %a to <16 x float>
+ ret <16 x float> %1
+}
+
+; CHECK-LABEL: @sitofp_8i16_double
+; CHECK: vpmovsxwd
+; CHECK: vcvtdq2pd
+define <8 x double> @sitofp_8i16_double(<8 x i16> %a) {
+ %1 = sitofp <8 x i16> %a to <8 x double>
+ ret <8 x double> %1
+}
+
+; CHECK-LABEL: sitofp_8i8_double
+; CHECK: vpmovzxwd
+; CHECK: vpslld
+; CHECK: vpsrad
+; CHECK: vcvtdq2pd
+define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
+ %1 = sitofp <8 x i8> %a to <8 x double>
+ ret <8 x double> %1
+}
+
+; CHECK-LABEL: @sitofp_8i1_double
+; CHECK: vpbroadcastq
+; CHECK: vcvtdq2pd
+define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
+ %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
+ %1 = sitofp <8 x i1> %cmpres to <8 x double>
+ ret <8 x double> %1
+}
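+
+; Note: per the CHECK lines above, narrow integer sources are first widened
+; to i32 lanes (vpmovsx*/vpmovzx* plus shifts for i8/i16, a masked broadcast
+; for i1 compare results) and the conversion itself is then done with the
+; ordinary vcvtdq2ps/vcvtdq2pd.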
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
index ce3d759..366d324 100644
--- a/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -1,97 +1,113 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
define <16 x float> @test_x86_vfmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfmadd_ps_z
; CHECK: vfmadd213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfmadd_pd_z
; CHECK: vfmadd213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <8 x double> @test_mask_fmadd_pd(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmadd_pd:
+; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
+ %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4)
+ ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
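+
+; Note: relative to the old fma.* intrinsics, the fma.mask.* forms take two
+; extra operands: a lane mask (i16/i8; -1 enables all lanes, as in the
+; unmasked tests here) and an i32 rounding-mode operand, where the value 4
+; appears to request the default (current) rounding rather than an embedded
+; static rounding mode.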
define <16 x float> @test_x86_vfmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfmsubps_z
; CHECK: vfmsub213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfmsubpd_z
; CHECK: vfmsub213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfnmadd_ps_z
; CHECK: vfnmadd213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfnmadd_pd_z
; CHECK: vfnmadd213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfnmsubps_z
; CHECK: vfnmsub213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfnmsubpd_z
; CHECK: vfnmsub213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfmaddsubps_z
; CHECK: vfmaddsub213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: test_mask_fmaddsub_ps:
+; CHECK: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa6,0xc2]
+ %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfmaddsubpd_z
; CHECK: vfmaddsub213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfmsubaddps_z
; CHECK: vfmsubadd213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfmsubaddpd_z
; CHECK: vfmsubadd213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index b360c71..eba895e 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL --check-prefix=CHECK %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=CHECK %s
;CHECK-LABEL: test1:
;CHECK: vinsertps
@@ -12,9 +13,11 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
}
;CHECK-LABEL: test2:
-;CHECK: vinsertf32x4
-;CHECK: vextractf32x4
-;CHECK: vinsertf32x4
+;KNL: vinsertf32x4 $0
+;SKX: vinsertf64x2 $0
+;CHECK: vextractf32x4 $3
+;KNL: vinsertf32x4 $3
+;SKX: vinsertf64x2 $3
;CHECK: ret
define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
%rrr = load double* %br
@@ -24,8 +27,8 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
}
;CHECK-LABEL: test3:
-;CHECK: vextractf32x4
-;CHECK: vinsertf32x4
+;CHECK: vextractf32x4 $1
+;CHECK: vinsertf32x4 $0
;CHECK: ret
define <16 x float> @test3(<16 x float> %x) nounwind {
%eee = extractelement <16 x float> %x, i32 4
@@ -34,8 +37,9 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
}
;CHECK-LABEL: test4:
-;CHECK: vextracti32x4
-;CHECK: vinserti32x4
+;CHECK: vextracti32x4 $2
+;KNL: vinserti32x4 $0
+;SKX: vinserti64x2 $0
;CHECK: ret
define <8 x i64> @test4(<8 x i64> %x) nounwind {
%eee = extractelement <8 x i64> %x, i32 4
@@ -186,12 +190,13 @@ define i16 @test16(i1 *%addr, i16 %a) {
;CHECK-LABEL: test17
;CHECK: kshiftlw
;CHECK: kshiftrw
-;CHECK: korw
+;KNL: korw
+;SKX: korb
;CHECK: ret
define i8 @test17(i1 *%addr, i8 %a) {
%x = load i1 * %addr, align 128
%a1 = bitcast i8 %a to <8 x i1>
- %x1 = insertelement <8 x i1> %a1, i1 %x, i32 10
+ %x1 = insertelement <8 x i1> %a1, i1 %x, i32 4
%x2 = bitcast <8 x i1>%x1 to i8
ret i8 %x2
}
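+
+; Note: the index change from 10 to 4 keeps the insertelement in range for an
+; <8 x i1> vector; an out-of-range index makes the result undefined and the
+; test meaningless.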
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 18cfcfe..691d1fb 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -60,20 +60,6 @@ define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
}
declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
-define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
- ; CHECK: vrcp28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
- %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
- ; CHECK: vrcp28pd {sae}, {{.*}}encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
- %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) ; <<8 x double>> [#uses=1]
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-
declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
define <8 x double> @test7(<8 x double> %a) {
@@ -97,13 +83,6 @@ define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
}
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
-define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) {
- ; CHECK: vrsqrt28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
; CHECK: vrsqrt14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4f,0xc0]
%res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
@@ -111,13 +90,6 @@ define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
}
declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
- ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
; CHECK: vrcp14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4d,0xc0]
%res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
@@ -125,26 +97,19 @@ define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
}
declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
- ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
- %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
; CHECK: vsqrtpd
- %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+ %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) ; <<8 x double>> [#uses=1]
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
; CHECK: vsqrtps
- %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) ; <<16 x float>> [#uses=1]
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) {
; CHECK: vsqrtss {{.*}}encoding: [0x62
@@ -611,3 +576,515 @@ define <8 x i64> @test_vmovntdqa(i8 *%x) {
}
declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*)
+
+define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_valign_q:
+; CHECK: valignq $2, %zmm1, %zmm0, %zmm0
+ %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i8 2, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
+; CHECK-LABEL: test_mask_valign_q:
+; CHECK: valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
+ %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i8 2, <8 x i64> %src, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i8, <8 x i64>, i8)
+
+define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_maskz_valign_d:
+; CHECK: valignd $5, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x03,0xc1,0x05]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i8 5, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i8, <16 x i32>, i16)
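+
+; Note: valignd/valignq conceptually concatenate the two source vectors and
+; shift the pair right by the immediate number of elements, keeping the low
+; half; the trailing passthru and mask operands follow the usual AVX-512
+; merge/zero-masking convention.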
+
+define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
+ ; CHECK-LABEL: test_mask_store_ss
+ ; CHECK: vmovss %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x11,0x07]
+ call void @llvm.x86.avx512.mask.store.ss(i8* %ptr, <4 x float> %data, i8 %mask)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 )
+
+define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
+
+define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
+
+define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d
+; CHECK: vpcmpgtd %zmm1, %zmm0, %k0 ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d
+; CHECK: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
+
+define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q
+; CHECK: vpcmpgtq %zmm1, %zmm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q
+; CHECK: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
+
+define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: test_cmp_d_512
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ##
+ %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltd %zmm1, %zmm0, %k0 ##
+ %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpled %zmm1, %zmm0, %k0 ##
+ %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 ##
+ %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 ##
+ %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 ##
+ %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnled %zmm1, %zmm0, %k0 ##
+ %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordd %zmm1, %zmm0, %k0 ##
+ %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_cmp_d_512
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltd %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpled %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnled %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordd %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
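+
+; Note: the i32 predicate operand of the cmp/ucmp intrinsics selects the
+; comparison (0..7), printed above via the eq/lt/le/unord/neq/nlt/nle/ord
+; alias mnemonics; the ucmp variants below use the unsigned "u"-suffixed
+; forms of the same aliases.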
+
+define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: test_ucmp_d_512
+; CHECK: vpcmpequd %zmm1, %zmm0, %k0 ##
+ %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltud %zmm1, %zmm0, %k0 ##
+ %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleud %zmm1, %zmm0, %k0 ##
+ %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 ##
+ %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 ##
+ %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 ##
+ %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 ##
+ %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordud %zmm1, %zmm0, %k0 ##
+ %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_ucmp_d_512
+; CHECK: vpcmpequd %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordud %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
+
+define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: test_cmp_q_512
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %zmm1, %zmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %zmm1, %zmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %zmm1, %zmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_cmp_q_512
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: test_ucmp_q_512
+; CHECK: vpcmpequq %zmm1, %zmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %zmm1, %zmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_ucmp_q_512
+; CHECK: vpcmpequq %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
+
+define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
+; CHECK-LABEL: test_mask_vextractf32x4:
+; CHECK: vextractf32x4 $2, %zmm1, %xmm0 {%k1}
+ %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i8 2, <4 x float> %b, i8 %mask)
+ ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i8, <4 x float>, i8)
+
+define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: test_mask_vextracti64x4:
+; CHECK: vextracti64x4 $2, %zmm1, %ymm0 {%k1}
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i8 2, <4 x i64> %b, i8 %mask)
+ ret <4 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: test_maskz_vextracti32x4:
+; CHECK: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
+ %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i8 2, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i8, <4 x i32>, i8)
+
+define <4 x double> @test_vextractf64x4(<8 x double> %a) {
+; CHECK-LABEL: test_vextractf64x4:
+; CHECK: vextractf64x4 $2, %zmm0, %ymm0 ##
+ %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i8 2, <4 x double> zeroinitializer, i8 -1)
+ ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i8, <4 x double>, i8)
+
+define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
+ ; CHECK-LABEL: test_x86_avx512_pslli_d
+ ; CHECK: vpslld
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_mask_pslli_d
+ ; CHECK: vpslld $7, %zmm0, %zmm1 {%k1}
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_maskz_pslli_d
+ ; CHECK: vpslld $7, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
+ ; CHECK-LABEL: test_x86_avx512_pslli_q
+ ; CHECK: vpsllq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_mask_pslli_q
+ ; CHECK: vpsllq $7, %zmm0, %zmm1 {%k1}
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_maskz_pslli_q
+ ; CHECK: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
+ ; CHECK-LABEL: test_x86_avx512_psrli_d
+ ; CHECK: vpsrld
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_mask_psrli_d
+ ; CHECK: vpsrld $7, %zmm0, %zmm1 {%k1}
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_maskz_psrli_d
+ ; CHECK: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
+ ; CHECK-LABEL: test_x86_avx512_psrli_q
+ ; CHECK: vpsrlq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_mask_psrli_q
+ ; CHECK: vpsrlq $7, %zmm0, %zmm1 {%k1}
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_maskz_psrli_q
+ ; CHECK: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
+ ; CHECK-LABEL: test_x86_avx512_psrai_d
+ ; CHECK: vpsrad
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_mask_psrai_d
+ ; CHECK: vpsrad $7, %zmm0, %zmm1 {%k1}
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_maskz_psrai_d
+ ; CHECK: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
+ ; CHECK-LABEL: test_x86_avx512_psrai_q
+ ; CHECK: vpsraq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_mask_psrai_q
+ ; CHECK: vpsraq $7, %zmm0, %zmm1 {%k1}
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
+ ; CHECK-LABEL: test_x86_avx512_maskz_psrai_q
+ ; CHECK: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
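+
+; Note: the mask.pslli/psrli/psrai intrinsics above all share one shape:
+; (source, immediate shift amount, passthru vector, lane mask), with the
+; passthru supplying the masked-off lanes and a mask of -1 meaning "all
+; lanes enabled".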
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index dd33ffd..35d3348 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -1,12 +1,14 @@
-; RUN: llc < %s -march=x86-64 -mcpu=knl | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
define i16 @mask16(i16 %x) {
%m0 = bitcast i16 %x to <16 x i1>
%m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <16 x i1> %m1 to i16
ret i16 %ret
-; CHECK: mask16
-; CHECK: knotw
+; CHECK-LABEL: mask16
+; CHECK: kmovw
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw
; CHECK: ret
}
@@ -15,8 +17,38 @@ define i8 @mask8(i8 %x) {
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <8 x i1> %m1 to i8
ret i8 %ret
-; CHECK: mask8
-; CHECK: knotw
+; CHECK-LABEL: mask8
+; CHECK: kmovw
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw
+; CHECK: ret
+}
+
+define void @mask16_mem(i16* %ptr) {
+ %x = load i16* %ptr, align 4
+ %m0 = bitcast i16 %x to <16 x i1>
+ %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <16 x i1> %m1 to i16
+ store i16 %ret, i16* %ptr, align 4
+ ret void
+; CHECK-LABEL: mask16_mem
+; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
+; CHECK: ret
+}
+
+define void @mask8_mem(i8* %ptr) {
+ %x = load i8* %ptr, align 4
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <8 x i1> %m1 to i8
+ store i8 %ret, i8* %ptr, align 4
+ ret void
+; CHECK-LABEL: mask8_mem
+; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
; CHECK: ret
}
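+
+; Note: both mem tests expect word-sized mask instructions (kmovw/knotw) even
+; for the i8 mask, presumably because the knl target only implements the
+; 16-bit forms of the mask-register operations.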
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 009802f..93875e8 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -153,31 +153,295 @@ define void @test18(i8 * %addr, <8 x i64> %data) {
ret void
}
-; CHECK-LABEL: store_i1_1
-; CHECK: movb
-; CHECK: movb
+; CHECK-LABEL: test19
+; CHECK: vmovdqu32
+; CHECK: ret
+define void @test19(i8 * %addr, <16 x i32> %data) {
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ store <16 x i32>%data, <16 x i32>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test20
+; CHECK: vmovdqa32
+; CHECK: ret
+define void @test20(i8 * %addr, <16 x i32> %data) {
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ store <16 x i32>%data, <16 x i32>* %vaddr, align 64
+ ret void
+}
+
+; CHECK-LABEL: test21
+; CHECK: vmovdqa64
; CHECK: ret
-define void @store_i1_1() {
- store i1 true, i1 addrspace(3)* undef, align 128
- store i1 false, i1 addrspace(2)* undef, align 128
+define <8 x i64> @test21(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %res = load <8 x i64>* %vaddr, align 64
+ ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test22
+; CHECK: vmovdqu64
+; CHECK: ret
+define void @test22(i8 * %addr, <8 x i64> %data) {
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ store <8 x i64>%data, <8 x i64>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: store_i1_2
-; CHECK: movb
+; CHECK-LABEL: test23
+; CHECK: vmovdqu64
; CHECK: ret
-define void @store_i1_2(i64 %a, i64 %b) {
- %res = icmp eq i64 %a, %b
- store i1 %res, i1 addrspace(3)* undef, align 128
+define <8 x i64> @test23(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %res = load <8 x i64>* %vaddr, align 1
+ ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test24
+; CHECK: vmovapd
+; CHECK: ret
+define void @test24(i8 * %addr, <8 x double> %data) {
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ store <8 x double>%data, <8 x double>* %vaddr, align 64
ret void
}
-; CHECK-LABEL: store_i1_3
-; CHECK: kmovw
+; CHECK-LABEL: test25
+; CHECK: vmovapd
+; CHECK: ret
+define <8 x double> @test25(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %res = load <8 x double>* %vaddr, align 64
+ ret <8 x double>%res
+}
+
+; CHECK-LABEL: test26
+; CHECK: vmovaps
; CHECK: ret
-define void @store_i1_3(i16 %a) {
- %a_vec = bitcast i16 %a to <16 x i1>
- %res = extractelement <16 x i1> %a_vec, i32 4
- store i1 %res, i1 addrspace(3)* undef, align 128
+define void @test26(i8 * %addr, <16 x float> %data) {
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ store <16 x float>%data, <16 x float>* %vaddr, align 64
ret void
}
+
+; CHECK-LABEL: test27
+; CHECK: vmovaps
+; CHECK: ret
+define <16 x float> @test27(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %res = load <16 x float>* %vaddr, align 64
+ ret <16 x float>%res
+}
+
+; CHECK-LABEL: test28
+; CHECK: vmovupd
+; CHECK: ret
+define void @test28(i8 * %addr, <8 x double> %data) {
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ store <8 x double>%data, <8 x double>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test29
+; CHECK: vmovupd
+; CHECK: ret
+define <8 x double> @test29(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %res = load <8 x double>* %vaddr, align 1
+ ret <8 x double>%res
+}
+
+; CHECK-LABEL: test30
+; CHECK: vmovups
+; CHECK: ret
+define void @test30(i8 * %addr, <16 x float> %data) {
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ store <16 x float>%data, <16 x float>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test31
+; CHECK: vmovups
+; CHECK: ret
+define <16 x float> @test31(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %res = load <16 x float>* %vaddr, align 1
+ ret <16 x float>%res
+}
+
+; CHECK-LABEL: test32
+; CHECK: vmovdqa32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %r = load <16 x i32>* %vaddr, align 64
+ %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+ ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test33
+; CHECK: vmovdqu32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %r = load <16 x i32>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+ ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test34
+; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %r = load <16 x i32>* %vaddr, align 64
+ %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+ ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test35
+; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %r = load <16 x i32>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+ ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test36
+; CHECK: vmovdqa64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %r = load <8 x i64>* %vaddr, align 64
+ %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+ ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test37
+; CHECK: vmovdqu64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %r = load <8 x i64>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+ ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test38
+; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %r = load <8 x i64>* %vaddr, align 64
+ %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+ ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test39
+; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %r = load <8 x i64>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+ ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test40
+; CHECK: vmovaps{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+ %mask = fcmp one <16 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %r = load <16 x float>* %vaddr, align 64
+ %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+ ret <16 x float>%res
+}
+
+; CHECK-LABEL: test41
+; CHECK: vmovups{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+ %mask = fcmp one <16 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %r = load <16 x float>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+ ret <16 x float>%res
+}
+
+; CHECK-LABEL: test42
+; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) {
+ %mask = fcmp one <16 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %r = load <16 x float>* %vaddr, align 64
+ %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+ ret <16 x float>%res
+}
+
+; CHECK-LABEL: test43
+; CHECK: vmovups{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) {
+ %mask = fcmp one <16 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %r = load <16 x float>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+ ret <16 x float>%res
+}
+
+; CHECK-LABEL: test44
+; CHECK: vmovapd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+ %mask = fcmp one <8 x double> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %r = load <8 x double>* %vaddr, align 64
+ %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+ ret <8 x double>%res
+}
+
+; CHECK-LABEL: test45
+; CHECK: vmovupd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+ %mask = fcmp one <8 x double> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %r = load <8 x double>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+ ret <8 x double>%res
+}
+
+; CHECK-LABEL: test46
+; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) {
+ %mask = fcmp one <8 x double> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %r = load <8 x double>* %vaddr, align 64
+ %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+ ret <8 x double>%res
+}
+
+; CHECK-LABEL: test47
+; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x double> @test47(i8 * %addr, <8 x double> %mask1) {
+ %mask = fcmp one <8 x double> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %r = load <8 x double>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+ ret <8 x double>%res
+}
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 83f4698..0dbf286 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -39,3 +39,56 @@ define double @select03(double %a, double %b, double %c, double %eps) {
%cond = select i1 %cmp, double %c, double %b
ret double %cond
}
+
+; CHECK-LABEL: @select04
+; CHECK: vmovaps %zmm3, %zmm1
+; CHECK-NEXT: ret
+; PR20677
+define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
+ %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
+ ret <16 x double> %sel
+}
+
+; CHECK-LABEL: select05
+; CHECK: kmovw %esi, %k0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+define i8 @select05(i8 %a.0, i8 %m) {
+ %mask = bitcast i8 %m to <8 x i1>
+ %a = bitcast i8 %a.0 to <8 x i1>
+ %r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
+ %res = bitcast <8 x i1> %r to i8
+ ret i8 %res;
+}
+
+; CHECK-LABEL: select06
+; CHECK: kmovw %esi, %k0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: kandw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+define i8 @select06(i8 %a.0, i8 %m) {
+ %mask = bitcast i8 %m to <8 x i1>
+ %a = bitcast i8 %a.0 to <8 x i1>
+ %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
+ %res = bitcast <8 x i1> %r to i8
+ ret i8 %res;
+}
+
+; CHECK-LABEL: select07
+; CHECK-DAG: kmovw %edx, %k0
+; CHECK-DAG: kmovw %edi, %k1
+; CHECK-DAG: kmovw %esi, %k2
+; CHECK: kandw %k0, %k1, %k1
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: kandw %k0, %k2, %k0
+; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
+ %mask = bitcast i8 %m to <8 x i1>
+ %a = bitcast i8 %a.0 to <8 x i1>
+ %b = bitcast i8 %b.0 to <8 x i1>
+ %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> %b
+ %res = bitcast <8 x i1> %r to i8
+ ret i8 %res;
+}
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
deleted file mode 100644
index b99e89a..0000000
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ /dev/null
@@ -1,314 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
-; CHECK: LCP
-; CHECK: .long 2
-; CHECK: .long 5
-; CHECK: .long 0
-; CHECK: .long 0
-; CHECK: .long 7
-; CHECK: .long 0
-; CHECK: .long 10
-; CHECK: .long 1
-; CHECK: .long 0
-; CHECK: .long 5
-; CHECK: .long 0
-; CHECK: .long 4
-; CHECK: .long 7
-; CHECK: .long 0
-; CHECK: .long 10
-; CHECK: .long 1
-; CHECK-LABEL: test1:
-; CHECK: vpermps
-; CHECK: ret
-define <16 x float> @test1(<16 x float> %a) nounwind {
- %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
- ret <16 x float> %c
-}
-
-; CHECK-LABEL: test2:
-; CHECK: vpermd
-; CHECK: ret
-define <16 x i32> @test2(<16 x i32> %a) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: test3:
-; CHECK: vpermq
-; CHECK: ret
-define <8 x i64> @test3(<8 x i64> %a) nounwind {
- %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1>
- ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test4:
-; CHECK: vpermpd
-; CHECK: ret
-define <8 x double> @test4(<8 x double> %a) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x double> %c
-}
-
-; CHECK-LABEL: test5:
-; CHECK: vpermt2pd
-; CHECK: ret
-define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
- ret <8 x double> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test5m:
-; CHECK: vpermt2pd {{.* {%k[1-7]} {z}}}
-define <8 x double> @test5m(<8 x double> %a, <8 x double> %b, i8 %mask) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
- %m = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %m, <8 x double> %c, <8 x double> zeroinitializer
- ret <8 x double> %res
-}
-
-; CHECK-LABEL: test6:
-; CHECK: vpermq $30
-; CHECK: ret
-define <8 x i64> @test6(<8 x i64> %a) nounwind {
- %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
- ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test7:
-; CHECK: vpermt2q
-; CHECK: ret
-define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
- %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
- ret <8 x i64> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test7m:
-; CHECK: vpermt2q {{.* {%k[1-7]} {z}}}
-define <8 x i64> @test7m(<8 x i64> %a, <8 x i64> %b, i8 %mask) nounwind {
- %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
- %m = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
- ret <8 x i64> %res
-}
-
-; The mem variant of vpermt2 with a writemask
-; CHECK-LABEL: test7mm:
-; CHECK: vpermt2q {{\(.*\).* {%k[1-7]} {z}}}
-define <8 x i64> @test7mm(<8 x i64> %a, <8 x i64> *%pb, i8 %mask) nounwind {
- %b = load <8 x i64>* %pb
- %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
- %m = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
- ret <8 x i64> %res
-}
-
-; CHECK-LABEL: test8:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x i32> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test8m:
-; CHECK: vpermt2d {{.* {%k[1-7]} {z}}}
-define <16 x i32> @test8m(<16 x i32> %a, <16 x i32> %b, i16 %mask) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- %m = bitcast i16 %mask to <16 x i1>
- %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
- ret <16 x i32> %res
-}
-
-; The mem variant of vpermt2 with a writemask
-; CHECK-LABEL: test8mm:
-; CHECK: vpermt2d {{\(.*\).* {%k[1-7]} {z}}}
-define <16 x i32> @test8mm(<16 x i32> %a, <16 x i32> *%pb, i16 %mask) nounwind {
- %b = load <16 x i32> * %pb
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- %m = bitcast i16 %mask to <16 x i1>
- %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
- ret <16 x i32> %res
-}
-
-; CHECK-LABEL: test9:
-; CHECK: vpermt2ps
-; CHECK: ret
-define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
- %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x float> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test9m:
-; CHECK: vpermt2ps {{.*}} {%k{{.}}} {z}
-define <16 x float> @test9m(<16 x float> %a, <16 x float> %b, i16 %mask) nounwind {
- %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- %m = bitcast i16 %mask to <16 x i1>
- %res = select <16 x i1> %m, <16 x float> %c, <16 x float> zeroinitializer
- ret <16 x float> %res
-}
-
-; CHECK-LABEL: test10:
-; CHECK: vpermt2ps (
-; CHECK: ret
-define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
- %c = load <16 x float>* %b
- %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x float> %d
-}
-
-; CHECK-LABEL: test11:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
- %c = load <16 x i32>* %b
- %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x i32> %d
-}
-
-; CHECK-LABEL: test12
-; CHECK: vmovlhps {{.*}}## encoding: [0x62
-; CHECK: ret
-define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind {
- %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
- ret <4 x i32> %c
-}
-
-; CHECK-LABEL: test13
-; CHECK: vpermilps $-79, %zmm
-; CHECK: ret
-define <16 x float> @test13(<16 x float> %a) {
- %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: test14
-; CHECK: vpermilpd $-53, %zmm
-; CHECK: ret
-define <8 x double> @test14(<8 x double> %a) {
- %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
- ret <8 x double> %b
-}
-
-; CHECK-LABEL: test15
-; CHECK: vpshufd $-79, %zmm
-; CHECK: ret
-define <16 x i32> @test15(<16 x i32> %a) {
- %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x i32> %b
-}
-; CHECK-LABEL: test16
-; CHECK: valignq $2, %zmm0, %zmm1
-; CHECK: ret
-define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
- ret <8 x double> %c
-}
-
-; CHECK-LABEL: test17
-; CHECK: vshufpd $19, %zmm1, %zmm0
-; CHECK: ret
-define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef>
- ret <8 x double> %c
-}
-
-; CHECK-LABEL: test18
-; CHECK: vpunpckhdq %zmm
-; CHECK: ret
-define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15, i32 18, i32 26, i32 19, i32 27, i32 22, i32 30, i32 23, i32 31>
- ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test19
-; CHECK: vpunpckldq %zmm
-; CHECK: ret
-define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
- ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test20
-; CHECK: vpunpckhqdq %zmm
-; CHECK: ret
-define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
- %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15>
- ret <8 x i64> %b
-}
-
-; CHECK-LABEL: test21
-; CHECK: vunpcklps %zmm
-; CHECK: ret
-define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
- %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: test22
-; CHECK: vmovhlps {{.*}}## encoding: [0x62
-; CHECK: ret
-define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind {
- %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
- ret <4 x i32> %c
-}
-
-; CHECK-LABEL: @test23
-; CHECK: vshufps $-112, %zmm
-; CHECK: ret
-define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
- %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: @test24
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test25
-; CHECK: vshufps $52
-; CHECK: ret
-define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 undef, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test26
-; CHECK: vmovshdup
-; CHECK: ret
-define <16 x i32> @test26(<16 x i32> %a) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test27
-; CHECK: ret
-define <16 x i32> @test27(<4 x i32>%a) {
- %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %res
-}
-
-; CHECK-LABEL: @test28
-; CHECK: vinserti64x4 $1
-; CHECK: ret
-define <16 x i32> @test28(<16 x i32>%x, <16 x i32>%y) {
- %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
- i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
- ret <16 x i32> %res
-}
-
-; CHECK-LABEL: @test29
-; CHECK: vinserti64x4 $0
-; CHECK: ret
-define <16 x i32> @test29(<16 x i32>%x, <16 x i32>%y) {
- %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
- i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i32> %res
-}
-
diff --git a/test/CodeGen/X86/avx512-trunc-ext.ll b/test/CodeGen/X86/avx512-trunc-ext.ll
index 5e097be..91ef5d5 100644
--- a/test/CodeGen/X86/avx512-trunc-ext.ll
+++ b/test/CodeGen/X86/avx512-trunc-ext.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s
; CHECK-LABEL: trunc_16x32_to_16x8
; CHECK: vpmovdb
@@ -118,6 +119,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
; CHECK-LABEL: sext_8i1_8i32
; CHECK: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX: vpmovm2d
; CHECK: ret
define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
%x = icmp slt <8 x i32> %a1, %a2
@@ -135,9 +137,8 @@ define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) {
}
; CHECK-LABEL: trunc_i32_to_i1
-; CHECK: andl
-; CHECK: kmov
-; CHECK: kortest
+; CHECK: testb
+; CHECK: setne
 ; CHECK: orl
; CHECK: ret
define i16 @trunc_i32_to_i1(i32 %a) {
@@ -146,3 +147,30 @@ define i16 @trunc_i32_to_i1(i32 %a) {
%res = bitcast <16 x i1> %maskv to i16
ret i16 %res
}
+
+; CHECK-LABEL: sext_8i1_8i16
+; SKX: vpmovm2w
+; CHECK: ret
+define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+ %x = icmp slt <8 x i32> %a1, %a2
+ %y = sext <8 x i1> %x to <8 x i16>
+ ret <8 x i16> %y
+}
+
+; CHECK-LABEL: sext_16i1_16i32
+; SKX: vpmovm2d
+; CHECK: ret
+define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
+ %x = icmp slt <16 x i32> %a1, %a2
+ %y = sext <16 x i1> %x to <16 x i32>
+ ret <16 x i32> %y
+}
+
+; CHECK-LABEL: sext_8i1_8i64
+; SKX: vpmovm2q
+; CHECK: ret
+define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+ %x = icmp slt <8 x i32> %a1, %a2
+ %y = sext <8 x i1> %x to <8 x i64>
+ ret <8 x i64> %y
+}
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 9c6db11..0b0e0fc 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -1,59 +1,72 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-;CHECK-LABEL: _inreg16xi32:
-;CHECK: vpbroadcastd {{.*}}, %zmm
-;CHECK: ret
define <16 x i32> @_inreg16xi32(i32 %a) {
+; CHECK-LABEL: _inreg16xi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0
+; CHECK-NEXT: retq
%b = insertelement <16 x i32> undef, i32 %a, i32 0
%c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
ret <16 x i32> %c
}
-;CHECK-LABEL: _inreg8xi64:
-;CHECK: vpbroadcastq {{.*}}, %zmm
-;CHECK: ret
define <8 x i64> @_inreg8xi64(i64 %a) {
+; CHECK-LABEL: _inreg8xi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0
+; CHECK-NEXT: retq
%b = insertelement <8 x i64> undef, i64 %a, i32 0
%c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
ret <8 x i64> %c
}
-;CHECK-LABEL: _inreg16xfloat:
-;CHECK: vbroadcastss {{.*}}, %zmm
-;CHECK: ret
define <16 x float> @_inreg16xfloat(float %a) {
+; CHECK-LABEL: _inreg16xfloat:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT: retq
%b = insertelement <16 x float> undef, float %a, i32 0
%c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
ret <16 x float> %c
}
-;CHECK-LABEL: _inreg8xdouble:
-;CHECK: vbroadcastsd {{.*}}, %zmm
-;CHECK: ret
define <8 x double> @_inreg8xdouble(double %a) {
+; CHECK-LABEL: _inreg8xdouble:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT: retq
%b = insertelement <8 x double> undef, double %a, i32 0
%c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
ret <8 x double> %c
}
-;CHECK-LABEL: _xmm16xi32
-;CHECK: vpbroadcastd
-;CHECK: ret
define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
+; CHECK-LABEL: _xmm16xi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0
+; CHECK-NEXT: retq
%b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
ret <16 x i32> %b
}
-;CHECK-LABEL: _xmm16xfloat
-;CHECK: vbroadcastss {{.*}}## encoding: [0x62
-;CHECK: ret
define <16 x float> @_xmm16xfloat(<16 x float> %a) {
+; CHECK-LABEL: _xmm16xfloat:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT: retq
%b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
ret <16 x float> %b
}
define <16 x i32> @test_vbroadcast() {
- ; CHECK: vpbroadcastd
+; CHECK-LABEL: test_vbroadcast:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: vcmpunordps %zmm0, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT: knotw %k1, %k1
+; CHECK-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
entry:
%0 = sext <16 x i1> zeroinitializer to <16 x i32>
%1 = fcmp uno <16 x float> undef, zeroinitializer
@@ -62,3 +75,108 @@ entry:
ret <16 x i32> %3
}
+; We implement the set1 intrinsics with vector initializers. Verify that the
+; code generated from them lowers to broadcasts in the end (an illustrative
+; sketch of the splat idiom follows these tests).
+define <8 x double> @test_set1_pd(double %d) #2 {
+; CHECK-LABEL: test_set1_pd:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %vecinit.i = insertelement <8 x double> undef, double %d, i32 0
+ %vecinit1.i = insertelement <8 x double> %vecinit.i, double %d, i32 1
+ %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %d, i32 2
+ %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %d, i32 3
+ %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %d, i32 4
+ %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %d, i32 5
+ %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %d, i32 6
+ %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %d, i32 7
+ ret <8 x double> %vecinit7.i
+}
+
+define <8 x i64> @test_set1_epi64(i64 %d) #2 {
+; CHECK-LABEL: test_set1_epi64:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %vecinit.i = insertelement <8 x i64> undef, i64 %d, i32 0
+ %vecinit1.i = insertelement <8 x i64> %vecinit.i, i64 %d, i32 1
+ %vecinit2.i = insertelement <8 x i64> %vecinit1.i, i64 %d, i32 2
+ %vecinit3.i = insertelement <8 x i64> %vecinit2.i, i64 %d, i32 3
+ %vecinit4.i = insertelement <8 x i64> %vecinit3.i, i64 %d, i32 4
+ %vecinit5.i = insertelement <8 x i64> %vecinit4.i, i64 %d, i32 5
+ %vecinit6.i = insertelement <8 x i64> %vecinit5.i, i64 %d, i32 6
+ %vecinit7.i = insertelement <8 x i64> %vecinit6.i, i64 %d, i32 7
+ ret <8 x i64> %vecinit7.i
+}
+
+define <16 x float> @test_set1_ps(float %f) #2 {
+; CHECK-LABEL: test_set1_ps:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %vecinit.i = insertelement <16 x float> undef, float %f, i32 0
+ %vecinit1.i = insertelement <16 x float> %vecinit.i, float %f, i32 1
+ %vecinit2.i = insertelement <16 x float> %vecinit1.i, float %f, i32 2
+ %vecinit3.i = insertelement <16 x float> %vecinit2.i, float %f, i32 3
+ %vecinit4.i = insertelement <16 x float> %vecinit3.i, float %f, i32 4
+ %vecinit5.i = insertelement <16 x float> %vecinit4.i, float %f, i32 5
+ %vecinit6.i = insertelement <16 x float> %vecinit5.i, float %f, i32 6
+ %vecinit7.i = insertelement <16 x float> %vecinit6.i, float %f, i32 7
+ %vecinit8.i = insertelement <16 x float> %vecinit7.i, float %f, i32 8
+ %vecinit9.i = insertelement <16 x float> %vecinit8.i, float %f, i32 9
+ %vecinit10.i = insertelement <16 x float> %vecinit9.i, float %f, i32 10
+ %vecinit11.i = insertelement <16 x float> %vecinit10.i, float %f, i32 11
+ %vecinit12.i = insertelement <16 x float> %vecinit11.i, float %f, i32 12
+ %vecinit13.i = insertelement <16 x float> %vecinit12.i, float %f, i32 13
+ %vecinit14.i = insertelement <16 x float> %vecinit13.i, float %f, i32 14
+ %vecinit15.i = insertelement <16 x float> %vecinit14.i, float %f, i32 15
+ ret <16 x float> %vecinit15.i
+}
+
+define <16 x i32> @test_set1_epi32(i32 %f) #2 {
+; CHECK-LABEL: test_set1_epi32:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %vecinit.i = insertelement <16 x i32> undef, i32 %f, i32 0
+ %vecinit1.i = insertelement <16 x i32> %vecinit.i, i32 %f, i32 1
+ %vecinit2.i = insertelement <16 x i32> %vecinit1.i, i32 %f, i32 2
+ %vecinit3.i = insertelement <16 x i32> %vecinit2.i, i32 %f, i32 3
+ %vecinit4.i = insertelement <16 x i32> %vecinit3.i, i32 %f, i32 4
+ %vecinit5.i = insertelement <16 x i32> %vecinit4.i, i32 %f, i32 5
+ %vecinit6.i = insertelement <16 x i32> %vecinit5.i, i32 %f, i32 6
+ %vecinit7.i = insertelement <16 x i32> %vecinit6.i, i32 %f, i32 7
+ %vecinit8.i = insertelement <16 x i32> %vecinit7.i, i32 %f, i32 8
+ %vecinit9.i = insertelement <16 x i32> %vecinit8.i, i32 %f, i32 9
+ %vecinit10.i = insertelement <16 x i32> %vecinit9.i, i32 %f, i32 10
+ %vecinit11.i = insertelement <16 x i32> %vecinit10.i, i32 %f, i32 11
+ %vecinit12.i = insertelement <16 x i32> %vecinit11.i, i32 %f, i32 12
+ %vecinit13.i = insertelement <16 x i32> %vecinit12.i, i32 %f, i32 13
+ %vecinit14.i = insertelement <16 x i32> %vecinit13.i, i32 %f, i32 14
+ %vecinit15.i = insertelement <16 x i32> %vecinit14.i, i32 %f, i32 15
+ ret <16 x i32> %vecinit15.i
+}
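+
+; Illustrative sketch, not part of the original change: the insertelement
+; chains above are expected to be recognized as splats and matched to the
+; same single-instruction broadcasts checked above (e.g. vpbroadcastd for
+; the i32 case). The function name @set1_splat_sketch is hypothetical.
+define <16 x i32> @set1_splat_sketch(i32 %v) {
+  ; Put the scalar in lane 0, then splat lane 0 across all 16 lanes.
+  %ins = insertelement <16 x i32> undef, i32 %v, i32 0
+  %splat = shufflevector <16 x i32> %ins, <16 x i32> undef, <16 x i32> zeroinitializer
+  ; Expected lowering (as for @_inreg16xi32 above): vpbroadcastd %edi, %zmm0
+  ret <16 x i32> %splat
+}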
+
+; We implement the scalar broadcast intrinsics with vector initializers.
+; Verify that the code generated from them lowers to the broadcast in the
+; end (an equivalent hand-written splat sketch follows this test).
+define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) {
+; CHECK-LABEL: test_mm512_broadcastsd_pd:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = extractelement <2 x double> %a, i32 0
+ %vecinit.i = insertelement <8 x double> undef, double %0, i32 0
+ %vecinit1.i = insertelement <8 x double> %vecinit.i, double %0, i32 1
+ %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %0, i32 2
+ %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %0, i32 3
+ %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %0, i32 4
+ %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %0, i32 5
+ %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %0, i32 6
+ %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %0, i32 7
+ ret <8 x double> %vecinit7.i
+}
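+
+; Illustrative sketch, not part of the original change: the extractelement
+; plus insertelement chain above is equivalent to a single shufflevector
+; splat of lane 0, which should likewise select vbroadcastsd %xmm0, %zmm0.
+; The name @broadcastsd_splat_sketch is hypothetical.
+define <8 x double> @broadcastsd_splat_sketch(<2 x double> %a) {
+  ; Splat element 0 of %a across all eight result lanes.
+  %splat = shufflevector <2 x double> %a, <2 x double> undef, <8 x i32> zeroinitializer
+  ret <8 x double> %splat
+}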
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index d762f00..c71e60e 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1,145 +1,176 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-; CHECK-LABEL: test1
-; CHECK: vcmpleps
-; CHECK: vmovups
-; CHECK: ret
define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%mask = fcmp ole <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
ret <16 x float> %max
}
-; CHECK-LABEL: test2
-; CHECK: vcmplepd
-; CHECK: vmovupd
-; CHECK: ret
define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%mask = fcmp ole <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
ret <8 x double> %max
}
-; CHECK-LABEL: test3
-; CHECK: vpcmpeqd (%rdi)
-; CHECK: vmovdqu32
-; CHECK: ret
define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%y = load <16 x i32>* %yp, align 4
%mask = icmp eq <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
ret <16 x i32> %max
}
-; CHECK-LABEL: @test4_unsigned
-; CHECK: vpcmpnltud
-; CHECK: vmovdqu32
-; CHECK: ret
define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
+; CHECK-LABEL: test4_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%mask = icmp uge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
ret <16 x i32> %max
}
-; CHECK-LABEL: test5
-; CHECK: vpcmpeqq {{.*}}%k1
-; CHECK: vmovdqu64 {{.*}}%k1
-; CHECK: ret
define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
ret <8 x i64> %max
}
-; CHECK-LABEL: test6_unsigned
-; CHECK: vpcmpnleuq {{.*}}%k1
-; CHECK: vmovdqu64 {{.*}}%k1
-; CHECK: ret
define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y) nounwind {
+; CHECK-LABEL: test6_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%mask = icmp ugt <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
ret <8 x i64> %max
}
-; CHECK-LABEL: test7
-; CHECK: xor
-; CHECK: vcmpltps
-; CHECK: vblendvps
-; CHECK: ret
define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
+; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
%mask = fcmp olt <4 x float> %a, zeroinitializer
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float>%b
ret <4 x float>%c
}
-; CHECK-LABEL: test8
-; CHECK: xor
-; CHECK: vcmpltpd
-; CHECK: vblendvpd
-; CHECK: ret
define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
+; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
%mask = fcmp olt <2 x double> %a, zeroinitializer
%c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
ret <2 x double>%c
}
-; CHECK-LABEL: test9
-; CHECK: vpcmpeqd
-; CHECK: vpblendmd
-; CHECK: ret
define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
+; CHECK-LABEL: test9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: YMM1<def> YMM1<kill> ZMM1<def>
+; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<def>
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<kill>
+; CHECK-NEXT: retq
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
}
-; CHECK-LABEL: test10
-; CHECK: vcmpeqps
-; CHECK: vblendmps
-; CHECK: ret
define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
+; CHECK-LABEL: test10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: YMM1<def> YMM1<kill> ZMM1<def>
+; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<def>
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<kill>
+; CHECK-NEXT: retq
%mask = fcmp oeq <8 x float> %x, %y
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
ret <8 x float> %max
}
-; CHECK-LABEL: test11_unsigned
-; CHECK: vpmaxud
-; CHECK: ret
define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
+; CHECK-LABEL: test11_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%mask = icmp ugt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
}
-; CHECK-LABEL: test12
-; CHECK: vpcmpeqq %zmm2, %zmm0, [[LO:%k[0-7]]]
-; CHECK: vpcmpeqq %zmm3, %zmm1, [[HI:%k[0-7]]]
-; CHECK: kunpckbw [[LO]], [[HI]], {{%k[0-7]}}
define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
+; CHECK-LABEL: test12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: kunpckbw %k0, %k1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: AX<def> AX<kill> EAX<kill>
+; CHECK-NEXT: retq
%res = icmp eq <16 x i64> %a, %b
%res1 = bitcast <16 x i1> %res to i16
ret i16 %res1
}
-; CHECK-LABEL: test13
-; CHECK: vcmpeqps %zmm
-; CHECK: vpbroadcastd
-; CHECK: ret
define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
+; CHECK-LABEL: test13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
{
%cmpvector_i = fcmp oeq <16 x float> %a, %b
%conv = zext <16 x i1> %cmpvector_i to <16 x i32>
ret <16 x i32> %conv
}
-; CHECK-LABEL: test14
-; CHECK: vpcmp
-; CHECK-NOT: vpcmp
-; CHECK: vmovdqu32 {{.*}}{%k1} {z}
-; CHECK: ret
define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
+; CHECK-LABEL: test14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: knotw %k0, %k1
+; CHECK-NEXT: vmovdqu32 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%sub_r = sub <16 x i32> %a, %b
%cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a
%sext.i3.i = sext <16 x i1> %cmp.i2.i to <16 x i32>
@@ -148,12 +179,15 @@ define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
ret <16 x i32>%res
}
-; CHECK-LABEL: test15
-; CHECK: vpcmpgtq
-; CHECK-NOT: vpcmp
-; CHECK: vmovdqu64 {{.*}}{%k1} {z}
-; CHECK: ret
define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
+; CHECK-LABEL: test15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: knotw %k0, %k1
+; CHECK-NEXT: vmovdqu64 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%sub_r = sub <8 x i64> %a, %b
%cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a
%sext.i3.i = sext <8 x i1> %cmp.i2.i to <8 x i64>
@@ -162,3 +196,181 @@ define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
ret <8 x i64>%res
}
+define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y) nounwind {
+; CHECK-LABEL: test16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %mask = icmp sge <16 x i32> %x, %y
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+ ret <16 x i32> %max
+}
+
+define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test17:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %y = load <16 x i32>* %y.ptr, align 4
+ %mask = icmp sgt <16 x i32> %x, %y
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+ ret <16 x i32> %max
+}
+
+define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test18:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %y = load <16 x i32>* %y.ptr, align 4
+ %mask = icmp sle <16 x i32> %x, %y
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+ ret <16 x i32> %max
+}
+
+define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test19:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %y = load <16 x i32>* %y.ptr, align 4
+ %mask = icmp ule <16 x i32> %x, %y
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+ ret <16 x i32> %max
+}
+
+define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) nounwind {
+; CHECK-LABEL: test20:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %mask1 = icmp eq <16 x i32> %x1, %y1
+ %mask0 = icmp eq <16 x i32> %x, %y
+ %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+ ret <16 x i32> %max
+}
+
+define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) nounwind {
+; CHECK-LABEL: test21:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %mask1 = icmp sge <8 x i64> %x1, %y1
+ %mask0 = icmp sle <8 x i64> %x, %y
+ %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+ %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+ ret <8 x i64> %max
+}
+
+define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
+; CHECK-LABEL: test22:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %mask1 = icmp sgt <8 x i64> %x1, %y1
+ %y = load <8 x i64>* %y.ptr, align 4
+ %mask0 = icmp sgt <8 x i64> %x, %y
+ %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+ %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+ ret <8 x i64> %max
+}
+
+define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
+; CHECK-LABEL: test23:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
+; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %mask1 = icmp sge <16 x i32> %x1, %y1
+ %y = load <16 x i32>* %y.ptr, align 4
+ %mask0 = icmp ule <16 x i32> %x, %y
+ %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+ ret <16 x i32> %max
+}
+
+define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
+; CHECK-LABEL: test24:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %yb = load i64* %yb.ptr, align 4
+ %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
+ %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
+ %mask = icmp eq <8 x i64> %x, %y
+ %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+ ret <8 x i64> %max
+}
+
+define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind {
+; CHECK-LABEL: test25:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %yb = load i32* %yb.ptr, align 4
+ %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
+ %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
+ %mask = icmp sle <16 x i32> %x, %y
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+ ret <16 x i32> %max
+}
+
+define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
+; CHECK-LABEL: test26:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
+; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %mask1 = icmp sge <16 x i32> %x1, %y1
+ %yb = load i32* %yb.ptr, align 4
+ %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
+ %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
+ %mask0 = icmp sgt <16 x i32> %x, %y
+ %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+ ret <16 x i32> %max
+}
+
+define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
+; CHECK-LABEL: test27:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1
+; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %mask1 = icmp sge <8 x i64> %x1, %y1
+ %yb = load i64* %yb.ptr, align 4
+ %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
+ %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
+ %mask0 = icmp sle <8 x i64> %x, %y
+ %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+ %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+ ret <8 x i64> %max
+}
diff --git a/test/CodeGen/X86/avx512-zext-load-crash.ll b/test/CodeGen/X86/avx512-zext-load-crash.ll
deleted file mode 100644
index 07ded13..0000000
--- a/test/CodeGen/X86/avx512-zext-load-crash.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-
-define <8 x i16> @test_zext_load() {
- ; CHECK: vmovq
-entry:
- %0 = load <2 x i16> ** undef, align 8
- %1 = getelementptr inbounds <2 x i16>* %0, i64 1
- %2 = load <2 x i16>* %0, align 1
- %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %4 = load <2 x i16>* %1, align 1
- %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x i16> %6
-}
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
new file mode 100644
index 0000000..bbc418c
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -0,0 +1,305 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw --show-mc-encoding | FileCheck %s
+
+define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: test_pcmpeq_b
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
+ %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
+ ret i64 %res
+}
+
+define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_b
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
+ %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+ ret i64 %res
+}
+
+declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)
+
+define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: test_pcmpeq_w
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_w
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)
+
+define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: test_pcmpgt_b
+; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 ##
+ %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
+ ret i64 %res
+}
+
+define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_b
+; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ##
+ %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+ ret i64 %res
+}
+
+declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)
+
+define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: test_pcmpgt_w
+; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 ##
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_w
+; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ##
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
+
+define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
+; CHECK-LABEL: test_cmp_b_512
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
+ %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
+ %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltb %zmm1, %zmm0, %k0 ##
+ %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
+ %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleb %zmm1, %zmm0, %k0 ##
+ %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
+ %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 ##
+ %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
+ %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 ##
+ %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
+ %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 ##
+ %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
+ %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 ##
+ %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
+ %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordb %zmm1, %zmm0, %k0 ##
+ %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
+ %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+ ret <8 x i64> %vec7
+}
+
+define <8 x i64> @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+; CHECK-LABEL: test_mask_cmp_b_512
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
+ %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltb %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
+ %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
+ %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
+ %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
+ %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
+ %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
+ %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordb %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
+ %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+ ret <8 x i64> %vec7
+}
+
+declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
+
+define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
+; CHECK-LABEL: test_ucmp_b_512
+; CHECK: vpcmpequb %zmm1, %zmm0, %k0 ##
+ %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
+ %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltub %zmm1, %zmm0, %k0 ##
+ %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
+ %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleub %zmm1, %zmm0, %k0 ##
+ %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
+ %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 ##
+ %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
+ %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 ##
+ %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
+ %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 ##
+ %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
+ %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 ##
+ %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
+ %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordub %zmm1, %zmm0, %k0 ##
+ %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
+ %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+ ret <8 x i64> %vec7
+}
+
+define <8 x i64> @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+; CHECK-LABEL: test_mask_x86_avx512_ucmp_b_512
+; CHECK: vpcmpequb %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
+ %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltub %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
+ %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
+ %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
+ %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
+ %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
+ %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
+ %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordub %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
+ %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+ ret <8 x i64> %vec7
+}
+
+declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
+
+define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: test_cmp_w_512
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
+ %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
+ %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltw %zmm1, %zmm0, %k0 ##
+ %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
+ %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmplew %zmm1, %zmm0, %k0 ##
+ %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
+ %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 ##
+ %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
+ %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 ##
+ %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
+ %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 ##
+ %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
+ %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 ##
+ %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
+ %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordw %zmm1, %zmm0, %k0 ##
+ %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
+ %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+ ret <8 x i32> %vec7
+}
+
+define <8 x i32> @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+; CHECK-LABEL: test_mask_cmp_w_512
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
+ %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltw %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
+ %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmplew %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
+ %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
+ %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
+ %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
+ %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
+ %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordw %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
+ %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+ ret <8 x i32> %vec7
+}
+
+declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
+
+define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: test_ucmp_w_512
+; CHECK: vpcmpequw %zmm1, %zmm0, %k0 ##
+ %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
+ %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 ##
+ %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
+ %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 ##
+ %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
+ %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 ##
+ %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
+ %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 ##
+ %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
+ %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 ##
+ %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
+ %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 ##
+ %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
+ %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmporduw %zmm1, %zmm0, %k0 ##
+ %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
+ %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+ ret <8 x i32> %vec7
+}
+
+define <8 x i32> @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+; CHECK-LABEL: test_mask_ucmp_w_512
+; CHECK: vpcmpequw %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
+ %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
+ %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
+ %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
+ %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
+ %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
+ %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
+ %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmporduw %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
+ %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+ ret <8 x i32> %vec7
+}
+
+declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/avx512bw-mask-op.ll b/test/CodeGen/X86/avx512bw-mask-op.ll
new file mode 100644
index 0000000..9d7630c
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-mask-op.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+define i32 @mask32(i32 %x) {
+ %m0 = bitcast i32 %x to <32 x i1>
+ %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <32 x i1> %m1 to i32
+ ret i32 %ret
+; CHECK-LABEL: mask32
+; CHECK: kmovd
+; CHECK-NEXT: knotd
+; CHECK-NEXT: kmovd
+; CHECK-NEXT: ret
+}
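+; A note on the pattern above: bitcasting the i32 to <32 x i1> and
+; xor'ing with all-ones is the IR idiom for inverting a mask register,
+; so it is expected to lower to the kmovd/knotd/kmovd sequence checked here.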
+
+define i64 @mask64(i64 %x) {
+ %m0 = bitcast i64 %x to <64 x i1>
+ %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <64 x i1> %m1 to i64
+ ret i64 %ret
+; CHECK-LABEL: mask64
+; CHECK: kmovq
+; CHECK-NEXT: knotq
+; CHECK-NEXT: kmovq
+; CHECK-NEXT: ret
+}
+
+define void @mask32_mem(i32* %ptr) {
+ %x = load i32* %ptr, align 4
+ %m0 = bitcast i32 %x to <32 x i1>
+ %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <32 x i1> %m1 to i32
+ store i32 %ret, i32* %ptr, align 4
+ ret void
+; CHECK-LABEL: mask32_mem
+; CHECK: kmovd ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotd
+; CHECK-NEXT: kmovd %k{{[0-7]}}, ([[ARG1]])
+; CHECK-NEXT: ret
+}
+
+define void @mask64_mem(i64* %ptr) {
+ %x = load i64* %ptr, align 4
+ %m0 = bitcast i64 %x to <64 x i1>
+ %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+ i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <64 x i1> %m1 to i64
+ store i64 %ret, i64* %ptr, align 4
+ ret void
+; CHECK-LABEL: mask64_mem
+; CHECK: kmovq ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotq
+; CHECK-NEXT: kmovq %k{{[0-7]}}, ([[ARG1]])
+; CHECK-NEXT: ret
+}
+
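+; The mand tests below exercise logical operations on mask values:
+; and/xor/or over <32 x i1> (or <64 x i1>) should stay in k-registers
+; as kandd/kxord/kord (respectively kandq/kxorq/korq).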
+define i32 @mand32(i32 %x, i32 %y) {
+ %ma = bitcast i32 %x to <32 x i1>
+ %mb = bitcast i32 %y to <32 x i1>
+ %mc = and <32 x i1> %ma, %mb
+ %md = xor <32 x i1> %ma, %mb
+ %me = or <32 x i1> %mc, %md
+ %ret = bitcast <32 x i1> %me to i32
+; CHECK: kandd
+; CHECK: kxord
+; CHECK: kord
+ ret i32 %ret
+}
+
+define i64 @mand64(i64 %x, i64 %y) {
+ %ma = bitcast i64 %x to <64 x i1>
+ %mb = bitcast i64 %y to <64 x i1>
+ %mc = and <64 x i1> %ma, %mb
+ %md = xor <64 x i1> %ma, %mb
+ %me = or <64 x i1> %mc, %md
+ %ret = bitcast <64 x i1> %me to i64
+; CHECK: kandq
+; CHECK: kxorq
+; CHECK: korq
+ ret i64 %ret
+}
diff --git a/test/CodeGen/X86/avx512bw-mov.ll b/test/CodeGen/X86/avx512bw-mov.ll
new file mode 100644
index 0000000..2ff6d28
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-mov.ll
@@ -0,0 +1,84 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vmovdqu8
+; CHECK: ret
+define <64 x i8> @test1(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <64 x i8>*
+ %res = load <64 x i8>* %vaddr, align 1
+ ret <64 x i8>%res
+}
+
+; CHECK-LABEL: test2
+; CHECK: vmovdqu8
+; CHECK: ret
+define void @test2(i8 * %addr, <64 x i8> %data) {
+ %vaddr = bitcast i8* %addr to <64 x i8>*
+ store <64 x i8>%data, <64 x i8>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test3
+; CHECK: vmovdqu8{{.*{%k[1-7]}}}
+; CHECK: ret
+define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
+ %mask = icmp ne <64 x i8> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <64 x i8>*
+ %r = load <64 x i8>* %vaddr, align 1
+ %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> %old
+ ret <64 x i8>%res
+}
+
+; CHECK-LABEL: test4
+; CHECK: vmovdqu8{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
+ %mask = icmp ne <64 x i8> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <64 x i8>*
+ %r = load <64 x i8>* %vaddr, align 1
+ %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> zeroinitializer
+ ret <64 x i8>%res
+}
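+; test3 and test4 above cover the two masking flavours: selecting the
+; loaded value against %old should give the merge-masked vmovdqu8 {%k},
+; while selecting against zeroinitializer should give the zeroing {%k} {z} form.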
+
+; CHECK-LABEL: test5
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test5(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <32 x i16>*
+ %res = load <32 x i16>* %vaddr, align 1
+ ret <32 x i16>%res
+}
+
+; CHECK-LABEL: test6
+; CHECK: vmovdqu16
+; CHECK: ret
+define void @test6(i8 * %addr, <32 x i16> %data) {
+ %vaddr = bitcast i8* %addr to <32 x i16>*
+ store <32 x i16>%data, <32 x i16>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test7
+; CHECK: vmovdqu16{{.*{%k[1-7]}}}
+; CHECK: ret
+define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
+ %mask = icmp ne <32 x i16> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <32 x i16>*
+ %r = load <32 x i16>* %vaddr, align 1
+ %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> %old
+ ret <32 x i16>%res
+}
+
+; CHECK-LABEL: test8
+; CHECK: vmovdqu16{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) {
+ %mask = icmp ne <32 x i16> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <32 x i16>*
+ %r = load <32 x i16>* %vaddr, align 1
+ %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> zeroinitializer
+ ret <32 x i16>%res
+}
diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll
new file mode 100644
index 0000000..d2b1724
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll
@@ -0,0 +1,138 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vpcmpeqb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind {
+ %mask = icmp eq <64 x i8> %x, %y
+ %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
+ ret <64 x i8> %max
+}
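+; Each test below pairs an icmp with a select so the compare result is
+; consumed as a mask: the icmp should become a vpcmp* into a k-register
+; and the select a masked vmovdqu8/vmovdqu16 blend, per the CHECK lines.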
+
+; CHECK-LABEL: test2
+; CHECK: vpcmpgtb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y) nounwind {
+ %mask = icmp sgt <64 x i8> %x, %y
+ %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
+ ret <64 x i8> %max
+}
+
+; CHECK-LABEL: @test3
+; CHECK: vpcmplew {{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind {
+ %mask = icmp sge <32 x i16> %x, %y
+ %max = select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y
+ ret <32 x i16> %max
+}
+
+; CHECK-LABEL: test4
+; CHECK: vpcmpnleub {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y) nounwind {
+ %mask = icmp ugt <64 x i8> %x, %y
+ %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
+ ret <64 x i8> %max
+}
+
+; CHECK-LABEL: test5
+; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwind {
+ %y = load <32 x i16>* %yp, align 4
+ %mask = icmp eq <32 x i16> %x, %y
+ %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
+ ret <32 x i16> %max
+}
+
+; CHECK-LABEL: @test6
+; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+ %y = load <32 x i16>* %y.ptr, align 4
+ %mask = icmp sgt <32 x i16> %x, %y
+ %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
+ ret <32 x i16> %max
+}
+
+; CHECK-LABEL: @test7
+; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+ %y = load <32 x i16>* %y.ptr, align 4
+ %mask = icmp sle <32 x i16> %x, %y
+ %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
+ ret <32 x i16> %max
+}
+
+; CHECK-LABEL: @test8
+; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+ %y = load <32 x i16>* %y.ptr, align 4
+ %mask = icmp ule <32 x i16> %x, %y
+ %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
+ ret <32 x i16> %max
+}
+
+; CHECK-LABEL: @test9
+; CHECK: vpcmpeqw %zmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16> %y1) nounwind {
+ %mask1 = icmp eq <32 x i16> %x1, %y1
+ %mask0 = icmp eq <32 x i16> %x, %y
+ %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+ %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %y
+ ret <32 x i16> %max
+}
+
+; CHECK-LABEL: @test10
+; CHECK: vpcmpleb %zmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y1) nounwind {
+ %mask1 = icmp sge <64 x i8> %x1, %y1
+ %mask0 = icmp sle <64 x i8> %x, %y
+ %mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer
+ %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %x1
+ ret <64 x i8> %max
+}
+
+; CHECK-LABEL: @test11
+; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i8> %y1) nounwind {
+ %mask1 = icmp sgt <64 x i8> %x1, %y1
+ %y = load <64 x i8>* %y.ptr, align 4
+ %mask0 = icmp sgt <64 x i8> %x, %y
+ %mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer
+ %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %x1
+ ret <64 x i8> %max
+}
+
+; CHECK-LABEL: @test12
+; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 x i16> %y1) nounwind {
+ %mask1 = icmp sge <32 x i16> %x1, %y1
+ %y = load <32 x i16>* %y.ptr, align 4
+ %mask0 = icmp ule <32 x i16> %x, %y
+ %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+ %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
+ ret <32 x i16> %max
+}
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
new file mode 100644
index 0000000..45f8d6d
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -0,0 +1,618 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding | FileCheck %s
+
+; 256-bit
+
+define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_pcmpeq_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_pcmpeq_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
+
+define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_pcmpgt_b_256
+; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ##
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_b_256
+; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ##
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_pcmpgt_w_256
+; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_w_256
+; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
+
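+; In the cmp/ucmp intrinsic tests below, the i32 immediate selects the
+; predicate; for 0-7 this revision prints eq, lt, le, unord, neq, nlt,
+; nle and ord (the Intel docs name predicates 3 and 7 false and true).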
+define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: test_cmp_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
+ %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
+ %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
+ %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
+ %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
+ %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
+ %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
+ %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
+ %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
+ %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
+ %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
+ %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
+ %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
+ %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
+ %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
+ %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
+ %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+ ret <8 x i32> %vec7
+}
+
+define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+; CHECK-LABEL: test_mask_cmp_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
+ %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
+ %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
+ %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
+ %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
+ %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
+ %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
+ %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
+ %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
+ %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
+ %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
+ %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
+ %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
+ %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
+ %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
+ %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
+ %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+ ret <8 x i32> %vec7
+}
+
+declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
+
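+; The ucmp forms take the same predicate immediates but compare unsigned,
+; so the printed mnemonics gain a 'u' (equb, ltub, and so on).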
+define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: test_ucmp_b_256
+; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
+ %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
+ %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
+ %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
+ %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
+ %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
+ %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
+ %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
+ %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
+ %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
+ %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
+ %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
+ %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
+ %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
+ %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
+ %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
+ %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+ ret <8 x i32> %vec7
+}
+
+define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+; CHECK-LABEL: test_mask_ucmp_b_256
+; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
+ %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
+ %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
+ %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
+ %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
+ %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
+ %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
+ %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
+ %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
+ %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
+ %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
+ %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
+ %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
+ %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
+ %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
+ %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
+ %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+ ret <8 x i32> %vec7
+}
+
+declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
+
+define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: test_cmp_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
+ %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
+ %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
+ %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
+ %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
+ %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
+ %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
+ %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
+ %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_cmp_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
+ %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
+ %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
+ %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
+ %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
+ %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
+ %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
+ %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
+ %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
+
+define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: test_ucmp_w_256
+; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
+ %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
+ %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
+ %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
+ %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
+ %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
+ %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
+ %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
+ %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_ucmp_w_256
+; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
+ %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
+ %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
+ %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
+ %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
+ %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
+ %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
+ %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
+ %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
+
+; 128-bit
+
+define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_pcmpeq_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_pcmpeq_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)
+
+define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_pcmpgt_b_128
+; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_b_128
+; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ##
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_pcmpgt_w_128
+; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_w_128
+; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
+
+define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_cmp_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
+ %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
+ %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
+ %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
+ %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
+ %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
+ %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
+ %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
+ %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_cmp_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
+
+define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_ucmp_b_128
+; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
+ %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
+ %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
+ %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
+ %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
+ %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
+ %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
+ %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
+ %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_ucmp_b_128
+; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
+
+define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_cmp_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_cmp_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_ucmp_w_128
+; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_ucmp_w_128
+; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx512bwvl-mov.ll b/test/CodeGen/X86/avx512bwvl-mov.ll
new file mode 100644
index 0000000..835844f
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-mov.ll
@@ -0,0 +1,165 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding | FileCheck %s
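+; 0x62 is the EVEX prefix byte, so matching '## encoding: [0x62' verifies
+; that the EVEX-encoded VL instruction was selected rather than a
+; VEX-encoded AVX form.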
+
+; CHECK-LABEL: test_256_1
+; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_1(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <32 x i8>*
+ %res = load <32 x i8>* %vaddr, align 1
+ ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_2
+; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_2(i8 * %addr, <32 x i8> %data) {
+ %vaddr = bitcast i8* %addr to <32 x i8>*
+ store <32 x i8>%data, <32 x i8>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_256_3
+; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
+ %mask = icmp ne <32 x i8> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <32 x i8>*
+ %r = load <32 x i8>* %vaddr, align 1
+ %res = select <32 x i1> %mask, <32 x i8> %r, <32 x i8> %old
+ ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_4
+; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
+ %mask = icmp ne <32 x i8> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <32 x i8>*
+ %r = load <32 x i8>* %vaddr, align 1
+ %res = select <32 x i1> %mask, <32 x i8> %r, <32 x i8> zeroinitializer
+ ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_5
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_5(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <16 x i16>*
+ %res = load <16 x i16>* %vaddr, align 1
+ ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_256_6
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_6(i8 * %addr, <16 x i16> %data) {
+ %vaddr = bitcast i8* %addr to <16 x i16>*
+ store <16 x i16>%data, <16 x i16>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_256_7
+; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
+ %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i16>*
+ %r = load <16 x i16>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i16> %r, <16 x i16> %old
+ ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_256_8
+; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
+ %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i16>*
+ %r = load <16 x i16>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i16> %r, <16 x i16> zeroinitializer
+ ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_128_1
+; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_1(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <16 x i8>*
+ %res = load <16 x i8>* %vaddr, align 1
+ ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_2
+; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_2(i8 * %addr, <16 x i8> %data) {
+ %vaddr = bitcast i8* %addr to <16 x i8>*
+ store <16 x i8>%data, <16 x i8>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_128_3
+; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
+ %mask = icmp ne <16 x i8> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i8>*
+ %r = load <16 x i8>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i8> %r, <16 x i8> %old
+ ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_4
+; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
+ %mask = icmp ne <16 x i8> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i8>*
+ %r = load <16 x i8>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i8> %r, <16 x i8> zeroinitializer
+ ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_5
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_5(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <8 x i16>*
+ %res = load <8 x i16>* %vaddr, align 1
+ ret <8 x i16>%res
+}
+
+; CHECK-LABEL: test_128_6
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_6(i8 * %addr, <8 x i16> %data) {
+ %vaddr = bitcast i8* %addr to <8 x i16>*
+ store <8 x i16>%data, <8 x i16>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_128_7
+; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
+ %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i16>*
+ %r = load <8 x i16>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i16> %r, <8 x i16> %old
+ ret <8 x i16>%res
+}
+
+; CHECK-LABEL: test_128_8
+; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_8(i8 * %addr, <8 x i16> %mask1) {
+ %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i16>*
+ %r = load <8 x i16>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i16> %r, <8 x i16> zeroinitializer
+ ret <8 x i16>%res
+}
+
diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
new file mode 100644
index 0000000..2d13a16
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
@@ -0,0 +1,269 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
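+; Each test builds an icmp feeding a select; on skx the comparison should
+; lower to a vpcmp*b/vpcmp*w writing a mask register, and the select to a
+; masked vmovdqu8/vmovdqu16 blend. Tests taking a pointer operand also check
+; that the load folds into the compare's memory operand, e.g. (%rdi).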
+; CHECK-LABEL: test256_1
+; CHECK: vpcmpeqb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind {
+ %mask = icmp eq <32 x i8> %x, %y
+ %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y
+ ret <32 x i8> %max
+}
+
+; CHECK-LABEL: test256_2
+; CHECK: vpcmpgtb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+ %mask = icmp sgt <32 x i8> %x, %y
+ %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+ ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_3
+; CHECK: vpcmplew {{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounwind {
+ %mask = icmp sge <16 x i16> %x, %y
+ %max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: test256_4
+; CHECK: vpcmpnleub {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+ %mask = icmp ugt <32 x i8> %x, %y
+ %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+ ret <32 x i8> %max
+}
+
+; CHECK-LABEL: test256_5
+; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nounwind {
+ %y = load <16 x i16>* %yp, align 4
+ %mask = icmp eq <16 x i16> %x, %y
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_6
+; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+ %y = load <16 x i16>* %y.ptr, align 4
+ %mask = icmp sgt <16 x i16> %x, %y
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_7
+; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+ %y = load <16 x i16>* %y.ptr, align 4
+ %mask = icmp sle <16 x i16> %x, %y
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_8
+; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+ %y = load <16 x i16>* %y.ptr, align 4
+ %mask = icmp ule <16 x i16> %x, %y
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_9
+; CHECK: vpcmpeqw %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+ %mask1 = icmp eq <16 x i16> %x1, %y1
+ %mask0 = icmp eq <16 x i16> %x, %y
+ %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_10
+; CHECK: vpcmpleb %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+ %mask1 = icmp sge <32 x i8> %x1, %y1
+ %mask0 = icmp sle <32 x i8> %x, %y
+ %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+ %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+ ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_11
+; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+ %mask1 = icmp sgt <32 x i8> %x1, %y1
+ %y = load <32 x i8>* %y.ptr, align 4
+ %mask0 = icmp sgt <32 x i8> %x, %y
+ %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+ %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+ ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_12
+; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+ %mask1 = icmp sge <16 x i16> %x1, %y1
+ %y = load <16 x i16>* %y.ptr, align 4
+ %mask0 = icmp ule <16 x i16> %x, %y
+ %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: test128_1
+; CHECK: vpcmpeqb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind {
+ %mask = icmp eq <16 x i8> %x, %y
+ %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y
+ ret <16 x i8> %max
+}
+
+; CHECK-LABEL: test128_2
+; CHECK: vpcmpgtb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+ %mask = icmp sgt <16 x i8> %x, %y
+ %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+ ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_3
+; CHECK: vpcmplew {{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind {
+ %mask = icmp sge <8 x i16> %x, %y
+ %max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: test128_4
+; CHECK: vpcmpnleub {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+ %mask = icmp ugt <16 x i8> %x, %y
+ %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+ ret <16 x i8> %max
+}
+
+; CHECK-LABEL: test128_5
+; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwind {
+ %y = load <8 x i16>* %yp, align 4
+ %mask = icmp eq <8 x i16> %x, %y
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_6
+; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+ %y = load <8 x i16>* %y.ptr, align 4
+ %mask = icmp sgt <8 x i16> %x, %y
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_7
+; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+ %y = load <8 x i16>* %y.ptr, align 4
+ %mask = icmp sle <8 x i16> %x, %y
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_8
+; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+ %y = load <8 x i16>* %y.ptr, align 4
+ %mask = icmp ule <8 x i16> %x, %y
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_9
+; CHECK: vpcmpeqw %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+ %mask1 = icmp eq <8 x i16> %x1, %y1
+ %mask0 = icmp eq <8 x i16> %x, %y
+ %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_10
+; CHECK: vpcmpleb %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+ %mask1 = icmp sge <16 x i8> %x1, %y1
+ %mask0 = icmp sle <16 x i8> %x, %y
+ %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+ %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+ ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_11
+; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+ %mask1 = icmp sgt <16 x i8> %x1, %y1
+ %y = load <16 x i8>* %y.ptr, align 4
+ %mask0 = icmp sgt <16 x i8> %x, %y
+ %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+ %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+ ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_12
+; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+ %mask1 = icmp sge <8 x i16> %x1, %y1
+ %y = load <8 x i16>* %y.ptr, align 4
+ %mask0 = icmp ule <8 x i16> %x, %y
+ %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+ ret <8 x i16> %max
+}
diff --git a/test/CodeGen/X86/avx512dq-mask-op.ll b/test/CodeGen/X86/avx512dq-mask-op.ll
new file mode 100644
index 0000000..32a2633
--- /dev/null
+++ b/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
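+; With AVX512DQ, an i8 bitcast to <8 x i1> should be kept in a mask register
+; and logic on it selected as the byte-granularity mask instructions
+; knotb/kandb/kxorb/korb, with kmovb moving masks to and from memory.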
+define i8 @mask8(i8 %x) {
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <8 x i1> %m1 to i8
+ ret i8 %ret
+; CHECK: mask8
+; CHECK: knotb
+; CHECK: ret
+}
+
+define void @mask8_mem(i8* %ptr) {
+ %x = load i8* %ptr, align 4
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <8 x i1> %m1 to i8
+ store i8 %ret, i8* %ptr, align 4
+ ret void
+; CHECK-LABEL: mask8_mem
+; CHECK: kmovb ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotb
+; CHECK-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]])
+; CHECK: ret
+}
+
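+; Note that (x & y) | (x ^ y) simplifies to x | y bitwise; the test only
+; checks that kandb/kxorb/korb are selected, not that the expression is
+; folded away.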
+define i8 @mand8(i8 %x, i8 %y) {
+ %ma = bitcast i8 %x to <8 x i1>
+ %mb = bitcast i8 %y to <8 x i1>
+ %mc = and <8 x i1> %ma, %mb
+ %md = xor <8 x i1> %ma, %mb
+ %me = or <8 x i1> %mc, %md
+ %ret = bitcast <8 x i1> %me to i8
+; CHECK: kandb
+; CHECK: kxorb
+; CHECK: korb
+ ret i8 %ret
+}
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
new file mode 100644
index 0000000..0000ece
--- /dev/null
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=knl --show-mc-encoding | FileCheck %s
+
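+; For these AVX512ER intrinsics the trailing i32 rounding argument selects
+; the suppress-all-exceptions form: the checks show 8 producing the {sae}
+; suffix and 4 the plain form. An all-ones mask (-1) gives the unmasked
+; encoding; a zeroinitializer or undef passthru gives the zero-masking {z}
+; form, while a live passthru value gives the merge-masking {%k1} form.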
+define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
+ ; CHECK: vrsqrt28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
+ ; CHECK: kmovw
+ ; CHECK: vrsqrt28ps %zmm0, %zmm1 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) {
+ ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
+ ; CHECK: kmovw
+ ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 6, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
+ ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8)
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
+ ; CHECK: vrcp28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
+ ; CHECK: vrcp28pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
+ %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <16 x float> @test_exp2_ps_512(<16 x float> %a0) {
+ ; CHECK: vexp2ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
+ ; CHECK: vexp2pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
+ %res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
+ ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
+ ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
+ %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
new file mode 100644
index 0000000..fa19084
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -0,0 +1,613 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding | FileCheck %s
+
+; 256-bit
+
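+; The pcmpeq/pcmpgt intrinsics return the comparison result as an i8 mask;
+; the trailing i8 argument is a write-mask, with -1 selecting the unmasked
+; form and a variable mask the {%k1} form. For the generic cmp/ucmp
+; intrinsics the i32 immediate 0-7 selects the predicate, printed below as
+; the eq/lt/le/unord/neq/nlt/nle/ord (or unsigned, u-suffixed) aliases.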
+define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)
+
+define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d_256
+; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d_256
+; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q_256
+; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q_256
+; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
+
+define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: test_cmp_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %ymm1, %ymm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_cmp_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: test_ucmp_d_256
+; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_ucmp_d_256
+; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: test_cmp_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_cmp_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: test_ucmp_q_256
+; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_ucmp_q_256
+; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
+
+; 128-bit
+
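+; The 128-bit tests mirror the 256-bit ones above with xmm instead of ymm
+; operands; only the vector width of the intrinsic operands changes.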
+define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)
+
+define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)
+
+define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d_128
+; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d_128
+; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8)
+
+define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q_128
+; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q_128
+; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)
+
+define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_cmp_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %xmm1, %xmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_cmp_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_ucmp_d_128
+; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_ucmp_d_128
+; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_cmp_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_cmp_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_ucmp_q_128
+; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_ucmp_q_128
+; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx512vl-mov.ll b/test/CodeGen/X86/avx512vl-mov.ll
new file mode 100644
index 0000000..3224656
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-mov.ll
@@ -0,0 +1,642 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding | FileCheck %s
+
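+; These tests check that aligned loads and stores select the aligned moves
+; (vmovdqa32/64, vmovaps, vmovapd) and unaligned ones the unaligned moves
+; (vmovdqu32/64, vmovups, vmovupd), and that an icmp- or fcmp-driven select
+; becomes the merge-masked {%k1} or zero-masked {%k1} {z} variant.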
+; CHECK-LABEL: test_256_1
+; CHECK: vmovdqu32
+; CHECK: ret
+define <8 x i32> @test_256_1(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ %res = load <8 x i32>* %vaddr, align 1
+ ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_2
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test_256_2(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ %res = load <8 x i32>* %vaddr, align 32
+ ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_3
+; CHECK: vmovdqa64
+; CHECK: ret
+define void @test_256_3(i8 * %addr, <4 x i64> %data) {
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ store <4 x i64>%data, <4 x i64>* %vaddr, align 32
+ ret void
+}
+
+; CHECK-LABEL: test_256_4
+; CHECK: vmovdqu32
+; CHECK: ret
+define void @test_256_4(i8 * %addr, <8 x i32> %data) {
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ store <8 x i32>%data, <8 x i32>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_256_5
+; CHECK: vmovdqa32
+; CHECK: ret
+define void @test_256_5(i8 * %addr, <8 x i32> %data) {
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ store <8 x i32>%data, <8 x i32>* %vaddr, align 32
+ ret void
+}
+
+; CHECK-LABEL: test_256_6
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test_256_6(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ %res = load <4 x i64>* %vaddr, align 32
+ ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_7
+; CHECK: vmovdqu64
+; CHECK: ret
+define void @test_256_7(i8 * %addr, <4 x i64> %data) {
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ store <4 x i64>%data, <4 x i64>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_256_8
+; CHECK: vmovdqu64
+; CHECK: ret
+define <4 x i64> @test_256_8(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ %res = load <4 x i64>* %vaddr, align 1
+ ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_9
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_9(i8 * %addr, <4 x double> %data) {
+ %vaddr = bitcast i8* %addr to <4 x double>*
+ store <4 x double>%data, <4 x double>* %vaddr, align 32
+ ret void
+}
+
+; CHECK-LABEL: test_256_10
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x double> @test_256_10(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <4 x double>*
+ %res = load <4 x double>* %vaddr, align 32
+ ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_11
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_11(i8 * %addr, <8 x float> %data) {
+ %vaddr = bitcast i8* %addr to <8 x float>*
+ store <8 x float>%data, <8 x float>* %vaddr, align 32
+ ret void
+}
+
+; CHECK-LABEL: test_256_12
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <8 x float> @test_256_12(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <8 x float>*
+ %res = load <8 x float>* %vaddr, align 32
+ ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_13
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_13(i8 * %addr, <4 x double> %data) {
+ %vaddr = bitcast i8* %addr to <4 x double>*
+ store <4 x double>%data, <4 x double>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_256_14
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x double> @test_256_14(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <4 x double>*
+ %res = load <4 x double>* %vaddr, align 1
+ ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_15
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_15(i8 * %addr, <8 x float> %data) {
+ %vaddr = bitcast i8* %addr to <8 x float>*
+ store <8 x float>%data, <8 x float>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_256_16
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <8 x float> @test_256_16(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <8 x float>*
+ %res = load <8 x float>* %vaddr, align 1
+ ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_17
+; CHECK: vmovdqa32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ %r = load <8 x i32>* %vaddr, align 32
+ %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old
+ ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_18
+; CHECK: vmovdqu32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ %r = load <8 x i32>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old
+ ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_19
+; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ %r = load <8 x i32>* %vaddr, align 32
+ %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer
+ ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_20
+; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i32>*
+ %r = load <8 x i32>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer
+ ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_21
+; CHECK: vmovdqa64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ %r = load <4 x i64>* %vaddr, align 32
+ %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old
+ ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_22
+; CHECK: vmovdqu64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ %r = load <4 x i64>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old
+ ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_23
+; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ %r = load <4 x i64>* %vaddr, align 32
+ %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer
+ ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_24
+; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i64>*
+ %r = load <4 x i64>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer
+ ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_25
+; CHECK: vmovaps{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
+ %mask = fcmp one <8 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x float>*
+ %r = load <8 x float>* %vaddr, align 32
+ %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> %old
+ ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_26
+; CHECK: vmovups{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
+ %mask = fcmp one <8 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x float>*
+ %r = load <8 x float>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> %old
+ ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_27
+; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x float> @test_256_27(i8 * %addr, <8 x float> %mask1) {
+ %mask = fcmp one <8 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x float>*
+ %r = load <8 x float>* %vaddr, align 32
+ %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> zeroinitializer
+ ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_28
+; CHECK: vmovups{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) {
+ %mask = fcmp one <8 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x float>*
+ %r = load <8 x float>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> zeroinitializer
+ ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_29
+; CHECK: vmovapd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x double>*
+ %r = load <4 x double>* %vaddr, align 32
+ %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> %old
+ ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_30
+; CHECK: vmovupd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x double>*
+ %r = load <4 x double>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> %old
+ ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_31
+; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x double>*
+ %r = load <4 x double>* %vaddr, align 32
+ %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> zeroinitializer
+ ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_32
+; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) {
+ %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x double>*
+ %r = load <4 x double>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> zeroinitializer
+ ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_128_1
+; CHECK: vmovdqu32
+; CHECK: ret
+define <4 x i32> @test_128_1(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ %res = load <4 x i32>* %vaddr, align 1
+ ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_2
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test_128_2(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ %res = load <4 x i32>* %vaddr, align 16
+ ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_3
+; CHECK: vmovdqa64
+; CHECK: ret
+define void @test_128_3(i8 * %addr, <2 x i64> %data) {
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ store <2 x i64>%data, <2 x i64>* %vaddr, align 16
+ ret void
+}
+
+; CHECK-LABEL: test_128_4
+; CHECK: vmovdqu32
+; CHECK: ret
+define void @test_128_4(i8 * %addr, <4 x i32> %data) {
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ store <4 x i32>%data, <4 x i32>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_128_5
+; CHECK: vmovdqa32
+; CHECK: ret
+define void @test_128_5(i8 * %addr, <4 x i32> %data) {
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ store <4 x i32>%data, <4 x i32>* %vaddr, align 16
+ ret void
+}
+
+; CHECK-LABEL: test_128_6
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test_128_6(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ %res = load <2 x i64>* %vaddr, align 16
+ ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_7
+; CHECK: vmovdqu64
+; CHECK: ret
+define void @test_128_7(i8 * %addr, <2 x i64> %data) {
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ store <2 x i64>%data, <2 x i64>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_128_8
+; CHECK: vmovdqu64
+; CHECK: ret
+define <2 x i64> @test_128_8(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ %res = load <2 x i64>* %vaddr, align 1
+ ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_9
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_9(i8 * %addr, <2 x double> %data) {
+ %vaddr = bitcast i8* %addr to <2 x double>*
+ store <2 x double>%data, <2 x double>* %vaddr, align 16
+ ret void
+}
+
+; CHECK-LABEL: test_128_10
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <2 x double> @test_128_10(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <2 x double>*
+ %res = load <2 x double>* %vaddr, align 16
+ ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_11
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_11(i8 * %addr, <4 x float> %data) {
+ %vaddr = bitcast i8* %addr to <4 x float>*
+ store <4 x float>%data, <4 x float>* %vaddr, align 16
+ ret void
+}
+
+; CHECK-LABEL: test_128_12
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x float> @test_128_12(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <4 x float>*
+ %res = load <4 x float>* %vaddr, align 16
+ ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_13
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_13(i8 * %addr, <2 x double> %data) {
+ %vaddr = bitcast i8* %addr to <2 x double>*
+ store <2 x double>%data, <2 x double>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_128_14
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <2 x double> @test_128_14(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <2 x double>*
+ %res = load <2 x double>* %vaddr, align 1
+ ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_15
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_15(i8 * %addr, <4 x float> %data) {
+ %vaddr = bitcast i8* %addr to <4 x float>*
+ store <4 x float>%data, <4 x float>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_128_16
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x float> @test_128_16(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <4 x float>*
+ %res = load <4 x float>* %vaddr, align 1
+ ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_17
+; CHECK: vmovdqa32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ %r = load <4 x i32>* %vaddr, align 16
+ %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old
+ ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_18
+; CHECK: vmovdqu32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ %r = load <4 x i32>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old
+ ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_19
+; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ %r = load <4 x i32>* %vaddr, align 16
+ %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer
+ ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_20
+; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x i32>*
+ %r = load <4 x i32>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer
+ ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_21
+; CHECK: vmovdqa64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ %r = load <2 x i64>* %vaddr, align 16
+ %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old
+ ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_22
+; CHECK: vmovdqu64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ %r = load <2 x i64>* %vaddr, align 1
+ %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old
+ ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_23
+; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ %r = load <2 x i64>* %vaddr, align 16
+ %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer
+ ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_24
+; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x i64>*
+ %r = load <2 x i64>* %vaddr, align 1
+ %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer
+ ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_25
+; CHECK: vmovaps{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x float>*
+ %r = load <4 x float>* %vaddr, align 16
+ %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> %old
+ ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_26
+; CHECK: vmovups{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x float>*
+ %r = load <4 x float>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> %old
+ ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_27
+; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x float>*
+ %r = load <4 x float>* %vaddr, align 16
+ %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> zeroinitializer
+ ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_28
+; CHECK: vmovups{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <4 x float>*
+ %r = load <4 x float>* %vaddr, align 1
+ %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> zeroinitializer
+ ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_29
+; CHECK: vmovapd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x double>*
+ %r = load <2 x double>* %vaddr, align 16
+ %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> %old
+ ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_30
+; CHECK: vmovupd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x double>*
+ %r = load <2 x double>* %vaddr, align 1
+ %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> %old
+ ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_31
+; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x double>*
+ %r = load <2 x double>* %vaddr, align 16
+ %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> zeroinitializer
+ ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_32
+; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x double> @test_128_32(i8 * %addr, <2 x i64> %mask1) {
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <2 x double>*
+ %r = load <2 x double>* %vaddr, align 1
+ %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> zeroinitializer
+ ret <2 x double>%res
+}
+
diff --git a/test/CodeGen/X86/avx512vl-nontemporal.ll b/test/CodeGen/X86/avx512vl-nontemporal.ll
new file mode 100644
index 0000000..2ad9768
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-nontemporal.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s
+
+define void @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE) {
+; CHECK: vmovntps %ymm{{.*}} ## encoding: [0x62
+ %cast = bitcast i8* %B to <8 x float>*
+ %A2 = fadd <8 x float> %A, %AA
+ store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0
+; CHECK: vmovntdq %ymm{{.*}} ## encoding: [0x62
+ %cast1 = bitcast i8* %B to <4 x i64>*
+ %E2 = add <4 x i64> %E, %EE
+ store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0
+; CHECK: vmovntpd %ymm{{.*}} ## encoding: [0x62
+ %cast2 = bitcast i8* %B to <4 x double>*
+ %C2 = fadd <4 x double> %C, %CC
+ store <4 x double> %C2, <4 x double>* %cast2, align 64, !nontemporal !0
+ ret void
+}
+
+define void @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE) {
+; CHECK: vmovntps %xmm{{.*}} ## encoding: [0x62
+ %cast = bitcast i8* %B to <4 x float>*
+ %A2 = fadd <4 x float> %A, %AA
+ store <4 x float> %A2, <4 x float>* %cast, align 64, !nontemporal !0
+; CHECK: vmovntdq %xmm{{.*}} ## encoding: [0x62
+ %cast1 = bitcast i8* %B to <2 x i64>*
+ %E2 = add <2 x i64> %E, %EE
+ store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0
+; CHECK: vmovntpd %xmm{{.*}} ## encoding: [0x62
+ %cast2 = bitcast i8* %B to <2 x double>*
+ %C2 = fadd <2 x double> %C, %CC
+ store <2 x double> %C2, <2 x double>* %cast2, align 64, !nontemporal !0
+ ret void
+}
+!0 = metadata !{i32 1}
diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll
new file mode 100644
index 0000000..9c64c03
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -0,0 +1,381 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+; CHECK-LABEL: test256_1
+; CHECK: vpcmpeqq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
+ %mask = icmp eq <4 x i64> %x, %y
+ %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
+ ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test256_2
+; CHECK: vpcmpgtq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y) nounwind {
+ %mask = icmp sgt <4 x i64> %x, %y
+ %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
+ ret <4 x i64> %max
+}
+
+; CHECK-LABEL: @test256_3
+; CHECK: vpcmpled {{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind {
+ %mask = icmp sge <8 x i32> %x, %y
+ %max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
+ ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_4
+; CHECK: vpcmpnleuq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y) nounwind {
+ %mask = icmp ugt <4 x i64> %x, %y
+ %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
+ ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test256_5
+; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
+ %y = load <8 x i32>* %yp, align 4
+ %mask = icmp eq <8 x i32> %x, %y
+ %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+ ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_6
+; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+ %y = load <8 x i32>* %y.ptr, align 4
+ %mask = icmp sgt <8 x i32> %x, %y
+ %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+ ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_7
+; CHECK: vpcmpled (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+ %y = load <8 x i32>* %y.ptr, align 4
+ %mask = icmp sle <8 x i32> %x, %y
+ %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+ ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_8
+; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+ %y = load <8 x i32>* %y.ptr, align 4
+ %mask = icmp ule <8 x i32> %x, %y
+ %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+ ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_9
+; CHECK: vpcmpeqd %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+ %mask1 = icmp eq <8 x i32> %x1, %y1
+ %mask0 = icmp eq <8 x i32> %x, %y
+ %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+ %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+ ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_10
+; CHECK: vpcmpleq %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+ %mask1 = icmp sge <4 x i64> %x1, %y1
+ %mask0 = icmp sle <4 x i64> %x, %y
+ %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+ %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+ ret <4 x i64> %max
+}
+
+; CHECK-LABEL: @test256_11
+; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+ %mask1 = icmp sgt <4 x i64> %x1, %y1
+ %y = load <4 x i64>* %y.ptr, align 4
+ %mask0 = icmp sgt <4 x i64> %x, %y
+ %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+ %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+ ret <4 x i64> %max
+}
+
+; CHECK-LABEL: @test256_12
+; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+ %mask1 = icmp sge <8 x i32> %x1, %y1
+ %y = load <8 x i32>* %y.ptr, align 4
+ %mask0 = icmp ule <8 x i32> %x, %y
+ %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+ %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+ ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_13
+; CHECK: vpcmpeqq (%rdi){1to4}, %ymm
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind {
+ %yb = load i64* %yb.ptr, align 4
+ %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
+ %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer
+ %mask = icmp eq <4 x i64> %x, %y
+ %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+ ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test256_14
+; CHECK: vpcmpled (%rdi){1to8}, %ymm
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind {
+ %yb = load i32* %yb.ptr, align 4
+ %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
+ %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %mask = icmp sle <8 x i32> %x, %y
+ %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+ ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_15
+; CHECK: vpcmpgtd (%rdi){1to8}, %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+ %mask1 = icmp sge <8 x i32> %x1, %y1
+ %yb = load i32* %yb.ptr, align 4
+ %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
+ %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %mask0 = icmp sgt <8 x i32> %x, %y
+ %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+ %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+ ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_16
+; CHECK: vpcmpgtq (%rdi){1to4}, %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+ %mask1 = icmp sge <4 x i64> %x1, %y1
+ %yb = load i64* %yb.ptr, align 4
+ %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
+ %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer
+ %mask0 = icmp sgt <4 x i64> %x, %y
+ %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+ %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+ ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test128_1
+; CHECK: vpcmpeqq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
+ %mask = icmp eq <2 x i64> %x, %y
+ %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
+ ret <2 x i64> %max
+}
+
+; CHECK-LABEL: test128_2
+; CHECK: vpcmpgtq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y) nounwind {
+ %mask = icmp sgt <2 x i64> %x, %y
+ %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
+ ret <2 x i64> %max
+}
+
+; CHECK-LABEL: @test128_3
+; CHECK: vpcmpled {{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind {
+ %mask = icmp sge <4 x i32> %x, %y
+ %max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y
+ ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_4
+; CHECK: vpcmpnleuq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y) nounwind {
+ %mask = icmp ugt <2 x i64> %x, %y
+ %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
+ ret <2 x i64> %max
+}
+
+; CHECK-LABEL: test128_5
+; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind {
+ %y = load <4 x i32>* %yp, align 4
+ %mask = icmp eq <4 x i32> %x, %y
+ %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+ ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_6
+; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+ %y = load <4 x i32>* %y.ptr, align 4
+ %mask = icmp sgt <4 x i32> %x, %y
+ %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+ ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_7
+; CHECK: vpcmpled (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+ %y = load <4 x i32>* %y.ptr, align 4
+ %mask = icmp sle <4 x i32> %x, %y
+ %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+ ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_8
+; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+ %y = load <4 x i32>* %y.ptr, align 4
+ %mask = icmp ule <4 x i32> %x, %y
+ %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+ ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_9
+; CHECK: vpcmpeqd %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+ %mask1 = icmp eq <4 x i32> %x1, %y1
+ %mask0 = icmp eq <4 x i32> %x, %y
+ %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+ %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
+ ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_10
+; CHECK: vpcmpleq %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+ %mask1 = icmp sge <2 x i64> %x1, %y1
+ %mask0 = icmp sle <2 x i64> %x, %y
+ %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
+ %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+ ret <2 x i64> %max
+}
+
+; CHECK-LABEL: @test128_11
+; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+ %mask1 = icmp sgt <2 x i64> %x1, %y1
+ %y = load <2 x i64>* %y.ptr, align 4
+ %mask0 = icmp sgt <2 x i64> %x, %y
+ %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
+ %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+ ret <2 x i64> %max
+}
+
+; CHECK-LABEL: @test128_12
+; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+ %mask1 = icmp sge <4 x i32> %x1, %y1
+ %y = load <4 x i32>* %y.ptr, align 4
+ %mask0 = icmp ule <4 x i32> %x, %y
+ %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+ %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+ ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_13
+; CHECK: vpcmpeqq (%rdi){1to2}, %xmm
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind {
+ %yb = load i64* %yb.ptr, align 4
+ %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
+ %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1
+ %mask = icmp eq <2 x i64> %x, %y
+ %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+ ret <2 x i64> %max
+}
+
+; CHECK-LABEL: test128_14
+; CHECK: vpcmpled (%rdi){1to4}, %xmm
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind {
+ %yb = load i32* %yb.ptr, align 4
+ %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
+ %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer
+ %mask = icmp sle <4 x i32> %x, %y
+ %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+ ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_15
+; CHECK: vpcmpgtd (%rdi){1to4}, %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+ %mask1 = icmp sge <4 x i32> %x1, %y1
+ %yb = load i32* %yb.ptr, align 4
+ %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
+ %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer
+ %mask0 = icmp sgt <4 x i32> %x, %y
+ %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+ %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+ ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_16
+; CHECK: vpcmpgtq (%rdi){1to2}, %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+ %mask1 = icmp sge <2 x i64> %x1, %y1
+ %yb = load i64* %yb.ptr, align 4
+ %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
+ %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1
+ %mask0 = icmp sgt <2 x i64> %x, %y
+ %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
+ %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+ ret <2 x i64> %max
+}
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
deleted file mode 100644
index 34aaf2c..0000000
--- a/test/CodeGen/X86/blend-msb.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
-
-
-; Verify that we produce movss instead of blendvps when possible.
-
-;CHECK-LABEL: vsel_float:
-;CHECK-NOT: blend
-;CHECK: movss
-;CHECK: ret
-define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
- ret <4 x float> %vsel
-}
-
-;CHECK-LABEL: vsel_4xi8:
-;CHECK-NOT: blend
-;CHECK: movss
-;CHECK: ret
-define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
- ret <4 x i8> %vsel
-}
-
-;CHECK-LABEL: vsel_8xi16:
-; The select mask is
-; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
-; which translates into the boolean mask (big endian representation):
-; 00010001 = 17.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; the inverted mask: 11101110 = 238.
-; According to the ABI:
-; v1 is in xmm0 => first argument is xmm0.
-; v2 is in xmm1 => second argument is xmm1.
-;CHECK: pblendw $238, %xmm1, %xmm0
-;CHECK: ret
-define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
- %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
- ret <8 x i16> %vsel
-}
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index 2681c10..cc40bcf 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -237,44 +237,6 @@ exit:
ret i32 %base
}
-define void @test_loop_rotate_reversed_blocks() {
-; This test case (greatly reduced from an Olden bencmark) ensures that the loop
-; rotate implementation doesn't assume that loops are laid out in a particular
-; order. The first loop will get split into two basic blocks, with the loop
-; header coming after the loop latch.
-;
-; CHECK: test_loop_rotate_reversed_blocks
-; CHECK: %entry
-; Look for a jump into the middle of the loop, and no branches mid-way.
-; CHECK: jmp
-; CHECK: %loop1
-; CHECK-NOT: j{{\w*}} .LBB{{.*}}
-; CHECK: %loop1
-; CHECK: je
-
-entry:
- %cond1 = load volatile i1* undef
- br i1 %cond1, label %loop2.preheader, label %loop1
-
-loop1:
- call i32 @f()
- %cond2 = load volatile i1* undef
- br i1 %cond2, label %loop2.preheader, label %loop1
-
-loop2.preheader:
- call i32 @f()
- %cond3 = load volatile i1* undef
- br i1 %cond3, label %exit, label %loop2
-
-loop2:
- call i32 @f()
- %cond4 = load volatile i1* undef
- br i1 %cond4, label %exit, label %loop2
-
-exit:
- ret void
-}
-
define i32 @test_loop_align(i32 %i, i32* %a) {
; Check that we provide basic loop body alignment with the block placement
; pass.
diff --git a/test/CodeGen/X86/byval-callee-cleanup.ll b/test/CodeGen/X86/byval-callee-cleanup.ll
new file mode 100644
index 0000000..8e059d4
--- /dev/null
+++ b/test/CodeGen/X86/byval-callee-cleanup.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s
+
+; Previously we would forget to align to stack slot alignment after placing a
+; byval argument. Subsequent arguments would align themselves, but if it was
+; the last argument, the argument size would not be a multiple of stack slot
+; size. This resulted in retl $6 in callee-cleanup functions, as well as subtle
+; varargs bugs.
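+;
+; A rough sketch of the arithmetic, using the first test below: %struct.Six is
+; 6 bytes, but i686 stack slots are 4 bytes wide, so the byval argument must be
+; rounded up to two slots, i.e. 8 bytes; a callee-cleanup convention therefore
+; has to pop 8 bytes (retl $8) rather than the raw struct size (retl $6).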
+
+%struct.Six = type { [6 x i8] }
+
+define x86_stdcallcc void @f(%struct.Six* byval %a) {
+ ret void
+}
+; CHECK-LABEL: _f@8:
+; CHECK: retl $8
+
+define x86_thiscallcc void @g(i8* %this, %struct.Six* byval %a) {
+ ret void
+}
+; CHECK-LABEL: _g:
+; CHECK: retl $8
+
+define x86_fastcallcc void @h(i32 inreg %x, i32 inreg %y, %struct.Six* byval %a) {
+ ret void
+}
+; CHECK-LABEL: @h@16:
+; CHECK: retl $8
diff --git a/test/CodeGen/X86/cfi_enforcing.ll b/test/CodeGen/X86/cfi_enforcing.ll
new file mode 100644
index 0000000..bcad8c1
--- /dev/null
+++ b/test/CodeGen/X86/cfi_enforcing.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=i386-unknown-linux-gnu -fcfi -cfi-enforcing <%s | FileCheck --check-prefix=X86 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -fcfi -cfi-enforcing <%s | FileCheck --check-prefix=X86-64 %s
+
+define void @indirect_fun() unnamed_addr jumptable {
+ ret void
+}
+
+define i32 @m(void ()* %fun) {
+ call void ()* %fun()
+; CHECK: subl
+; X86-64: andq $8,
+; X86-64: leaq __llvm_jump_instr_table_0_1({{%[a-z0-9]+}}), [[REG:%[a-z0-9]+]]
+; X86-64-NOT: callq __llvm_cfi_pointer_warning
+; X86-64: callq *[[REG]]
+; X86: andl $8,
+; X86: leal __llvm_jump_instr_table_0_1({{%[a-z0-9]+}}), [[REG:%[a-z0-9]+]]
+; X86-NOT: calll __llvm_cfi_pointer_warning
+; X86: calll *[[REG]]
+ ret i32 0
+}
+
+define void ()* @get_fun() {
+ ret void ()* @indirect_fun
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+ %f = call void ()* ()* @get_fun()
+ %a = call i32 @m(void ()* %f)
+ ret i32 %a
+}
+
+; CHECK: .align 8
+; CHECK: __llvm_jump_instr_table_0_1:
+; CHECK: jmp indirect_fun@PLT
diff --git a/test/CodeGen/X86/cfi_invoke.ll b/test/CodeGen/X86/cfi_invoke.ll
new file mode 100644
index 0000000..dd0d42a
--- /dev/null
+++ b/test/CodeGen/X86/cfi_invoke.ll
@@ -0,0 +1,35 @@
+; RUN: llc <%s -fcfi -cfi-type=sub | FileCheck %s
+; ModuleID = 'test.cc'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @__gxx_personality_v0(...)
+
+@_ZTIPKc = external constant i8*
+@_ZTIi = external constant i8*
+
+define void @f() unnamed_addr jumptable {
+ ret void
+}
+
+@a = global void ()* @f
+
+; Make sure invokes get targeted as well as regular calls.
+define void @_Z3foov(void ()* %f) uwtable ssp {
+; CHECK-LABEL: _Z3foov:
+ entry:
+ invoke void %f()
+ to label %try.cont unwind label %lpad
+; CHECK: callq __llvm_cfi_pointer_warning
+; CHECK: callq *%rbx
+
+ lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @_ZTIi to i8*)
+ filter [1 x i8*] [i8* bitcast (i8** @_ZTIPKc to i8*)]
+ ret void
+
+ try.cont:
+ ret void
+}
+
diff --git a/test/CodeGen/X86/cfi_non_default_function.ll b/test/CodeGen/X86/cfi_non_default_function.ll
new file mode 100644
index 0000000..29774a1
--- /dev/null
+++ b/test/CodeGen/X86/cfi_non_default_function.ll
@@ -0,0 +1,27 @@
+; RUN: llc -fcfi -cfi-func-name=cfi_new_failure <%s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+define void @indirect_fun() unnamed_addr jumptable {
+ ret void
+}
+
+define i32 @m(void ()* %fun) {
+; CHECK-LABEL: @m
+ call void ()* %fun()
+; CHECK: callq cfi_new_failure
+ ret i32 0
+}
+
+define void ()* @get_fun() {
+ ret void ()* @indirect_fun
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+ %f = call void ()* ()* @get_fun()
+ %a = call i32 @m(void ()* %f)
+ ret i32 %a
+}
+
+; CHECK: .align 8
+; CHECK: __llvm_jump_instr_table_0_1:
+; CHECK: jmp indirect_fun@PLT
diff --git a/test/CodeGen/X86/cfi_simple_indirect_call.ll b/test/CodeGen/X86/cfi_simple_indirect_call.ll
new file mode 100644
index 0000000..0ee118d
--- /dev/null
+++ b/test/CodeGen/X86/cfi_simple_indirect_call.ll
@@ -0,0 +1,43 @@
+; RUN: llc -fcfi -cfi-type=sub <%s | FileCheck --check-prefix=SUB %s
+; RUN: llc -fcfi -cfi-type=add <%s | FileCheck --check-prefix=ADD %s
+; RUN: llc -fcfi -cfi-type=ror <%s | FileCheck --check-prefix=ROR %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @indirect_fun() unnamed_addr jumptable {
+ ret void
+}
+
+define i32 @m(void ()* %fun) {
+ call void ()* %fun()
+; SUB: subl
+; SUB: andq $8
+; SUB-LABEL: leaq __llvm_jump_instr_table_0_1
+; SUB-LABEL: callq __llvm_cfi_pointer_warning
+
+; ROR: subq
+; ROR: rolq $61
+; ROR: testq
+; ROR-LABEL: callq __llvm_cfi_pointer_warning
+
+; ADD: andq $8
+; ADD-LABEL: leaq __llvm_jump_instr_table_0_1
+; ADD: cmpq
+; ADD-LABEL: callq __llvm_cfi_pointer_warning
+ ret i32 0
+}
+
+define void ()* @get_fun() {
+ ret void ()* @indirect_fun
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+ %f = call void ()* ()* @get_fun()
+ %a = call i32 @m(void ()* %f)
+ ret i32 %a
+}
+; SUB: .text
+; SUB: .align 8
+; SUB-LABEL: .type __llvm_jump_instr_table_0_1,@function
+; SUB-LABEL:__llvm_jump_instr_table_0_1:
+; SUB-LABEL: jmp indirect_fun@PLT
diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
new file mode 100644
index 0000000..3cb8b97
--- /dev/null
+++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
@@ -0,0 +1,86 @@
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s
+
+declare i32 @bar()
+
+define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
+; CHECK-LABEL: test_intervening_call:
+; CHECK: cmpxchg
+; CHECK: pushfq
+; CHECK: popq [[FLAGS:%.*]]
+
+; CHECK: callq bar
+
+; CHECK: pushq [[FLAGS]]
+; CHECK: popfq
+; CHECK: jne
+ %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
+ %p = extractvalue { i64, i1 } %cx, 1
+ call i32 @bar()
+ br i1 %p, label %t, label %f
+
+t:
+ ret i64 42
+
+f:
+ ret i64 0
+}
+
+; Interesting because it produces a clobber without any function calls.
+define i32 @test_control_flow(i32* %p, i32 %i, i32 %j) {
+; CHECK-LABEL: test_control_flow:
+
+; CHECK: cmpxchg
+; CHECK-NEXT: jne
+entry:
+ %cmp = icmp sgt i32 %i, %j
+ br i1 %cmp, label %loop_start, label %cond.end
+
+loop_start:
+ br label %while.condthread-pre-split.i
+
+while.condthread-pre-split.i:
+ %.pr.i = load i32* %p, align 4
+ br label %while.cond.i
+
+while.cond.i:
+ %0 = phi i32 [ %.pr.i, %while.condthread-pre-split.i ], [ 0, %while.cond.i ]
+ %tobool.i = icmp eq i32 %0, 0
+ br i1 %tobool.i, label %while.cond.i, label %while.body.i
+
+while.body.i:
+ %.lcssa = phi i32 [ %0, %while.cond.i ]
+ %1 = cmpxchg i32* %p, i32 %.lcssa, i32 %.lcssa seq_cst seq_cst
+ %2 = extractvalue { i32, i1 } %1, 1
+ br i1 %2, label %cond.end.loopexit, label %while.condthread-pre-split.i
+
+cond.end.loopexit:
+ br label %cond.end
+
+cond.end:
+ %cond = phi i32 [ %i, %entry ], [ 0, %cond.end.loopexit ]
+ ret i32 %cond
+}
+
+; This one is an interesting case because CMOV doesn't have a chain
+; operand. Naive attempts to limit cmpxchg EFLAGS use are likely to fail here.
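+;
+; Roughly what is being exercised: the select below lowers to a CMOV that reads
+; the EFLAGS produced by the cmpxchg, and the intervening call to @bar clobbers
+; EFLAGS, so the flags presumably have to be saved and restored around the call
+; (the pushfq/popfq pair in the CHECK lines below).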
+define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: test_feed_cmov:
+
+; CHECK: cmpxchg
+; CHECK: pushfq
+; CHECK: popq [[FLAGS:%.*]]
+
+; CHECK: callq bar
+
+; CHECK: pushq [[FLAGS]]
+; CHECK: popfq
+
+ %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+ %success = extractvalue { i32, i1 } %res, 1
+
+ %rhs = call i32 @bar()
+
+ %ret = select i1 %success, i32 %new, i32 %rhs
+ ret i32 %ret
+}
diff --git a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
index 78e1dd2..85bfff2 100644
--- a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
@@ -1,7 +1,7 @@
; RUN: opt -S -codegenprepare %s -o - | FileCheck %s
; RUN: opt -S -codegenprepare -addr-sink-using-gep=1 %s -o - | FileCheck -check-prefix=CHECK-GEP %s
; This file tests the different cases that are involved when codegen prepare
-; tries to get sign extension out of the way of addressing mode.
+; tries to get sign/zero extension out of the way of the addressing mode.
; These tests require an actual target, as addressing mode decisions depend
; on the target.
@@ -67,6 +67,43 @@ define i8 @oneArgPromotion(i32 %arg1, i8* %base) {
ret i8 %res
}
+; Check that we are able to merge a sign extension with a zero extension.
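+; (A sketch of why the merge is sound: %zext is at most 255 and the add is nsw,
+; so the i32 sum stays non-negative; sign-extending a non-negative value equals
+; zero-extending it, which lets both extensions fold into the single
+; "zext i8 %arg1 to i64" checked below.)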
+; CHECK-LABEL: @oneArgPromotionZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionZExt(i8 %arg1, i8* %base) {
+ %zext = zext i8 %arg1 to i32
+ %add = add nsw i32 %zext, 1
+ %sextadd = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; When promoting a constant zext, the IR builder returns a constant,
+; not an instruction. Make sure this is properly handled. This used
+; to crash.
+; Note: The constant zext is promoted, but it does not help match anything
+; more in the addressing mode. Therefore the modification is
+; rolled back.
+; Still, this test case exercises the desired code path.
+; CHECK-LABEL: @oneArgPromotionCstZExt
+; CHECK: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i16 undef to i32
+; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXT]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionCstZExt(i8* %base) {
+ %cst = zext i16 undef to i32
+ %add = add nsw i32 %cst, 1
+ %sextadd = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
; Check that we do not promote truncate when we cannot determine the
; bits that are dropped.
; CHECK-LABEL: @oneArgPromotionBlockTrunc1
@@ -321,3 +358,177 @@ end:
%final = load i32* %addr
ret i32 %final
}
+
+%struct.dns_packet = type { i32, i32, %union.anon }
+%union.anon = type { i32 }
+
+@a = common global i32 0, align 4
+@b = common global i16 0, align 2
+
+; We used to crash on this function because we did not return the right
+; promoted instruction for %conv.i.
+; Make sure we generate the right code now.
+; CHECK-LABEL: @fn3
+; %conv.i is used twice and only one of its uses is being promoted.
+; Use it as the starting point for the matching.
+; CHECK: %conv.i = zext i16 [[PLAIN_OPND:%[.a-zA-Z_0-9-]+]] to i32
+; CHECK-NEXT: [[PROMOTED_CONV:%[.a-zA-Z_0-9-]+]] = zext i16 [[PLAIN_OPND]] to i64
+; CHECK-NEXT: [[BASE:%[a-zA-Z_0-9-]+]] = ptrtoint %struct.dns_packet* %P to i64
+; CHECK-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add i64 [[BASE]], [[PROMOTED_CONV]]
+; CHECK-NEXT: [[ADDR:%[a-zA-Z_0-9-]+]] = add i64 [[ADD]], 7
+; CHECK-NEXT: [[CAST:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[ADDR]] to i8*
+; CHECK-NEXT: load i8* [[CAST]], align 1
+define signext i16 @fn3(%struct.dns_packet* nocapture readonly %P) {
+entry:
+ %tmp = getelementptr inbounds %struct.dns_packet* %P, i64 0, i32 2
+ %data.i.i = bitcast %union.anon* %tmp to [0 x i8]*
+ br label %while.body.i.i
+
+while.body.i.i: ; preds = %while.body.i.i, %entry
+ %src.addr.0.i.i = phi i16 [ 0, %entry ], [ %inc.i.i, %while.body.i.i ]
+ %inc.i.i = add i16 %src.addr.0.i.i, 1
+ %idxprom.i.i = sext i16 %src.addr.0.i.i to i64
+ %arrayidx.i.i = getelementptr inbounds [0 x i8]* %data.i.i, i64 0, i64 %idxprom.i.i
+ %tmp1 = load i8* %arrayidx.i.i, align 1
+ %conv2.i.i = zext i8 %tmp1 to i32
+ %and.i.i = and i32 %conv2.i.i, 15
+ store i32 %and.i.i, i32* @a, align 4
+ %tobool.i.i = icmp eq i32 %and.i.i, 0
+ br i1 %tobool.i.i, label %while.body.i.i, label %fn1.exit.i
+
+fn1.exit.i: ; preds = %while.body.i.i
+ %inc.i.i.lcssa = phi i16 [ %inc.i.i, %while.body.i.i ]
+ %conv.i = zext i16 %inc.i.i.lcssa to i32
+ %sub.i = add nsw i32 %conv.i, -1
+ %idxprom.i = sext i32 %sub.i to i64
+ %arrayidx.i = getelementptr inbounds [0 x i8]* %data.i.i, i64 0, i64 %idxprom.i
+ %tmp2 = load i8* %arrayidx.i, align 1
+ %conv2.i = sext i8 %tmp2 to i16
+ store i16 %conv2.i, i16* @b, align 2
+ %sub4.i = sub nsw i32 0, %conv.i
+ %conv5.i = zext i16 %conv2.i to i32
+ %cmp.i = icmp sgt i32 %conv5.i, %sub4.i
+ br i1 %cmp.i, label %if.then.i, label %fn2.exit
+
+if.then.i: ; preds = %fn1.exit.i
+ %end.i = getelementptr inbounds %struct.dns_packet* %P, i64 0, i32 1
+ %tmp3 = load i32* %end.i, align 4
+ %sub7.i = add i32 %tmp3, 65535
+ %conv8.i = trunc i32 %sub7.i to i16
+ br label %fn2.exit
+
+fn2.exit: ; preds = %if.then.i, %fn1.exit.i
+ %retval.0.i = phi i16 [ %conv8.i, %if.then.i ], [ undef, %fn1.exit.i ]
+ ret i16 %retval.0.i
+}
+
+; Check that we do not promote an extension if the non-wrapping flag does not
+; match the kind of the extension.
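+; (A sketch of why, with hypothetical values: for x = y = -1, "add nsw i32"
+; yields -2, and zext i32 -2 to i64 is 4294967294, whereas zext x + zext y is
+; 8589934590; promoting through the zext would therefore need nuw, not nsw.)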
+; CHECK-LABEL: @noPromotionFlag
+; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = zext i32 [[ADD]] to i64
+; CHECK: inttoptr i64 [[PROMOTED]] to i8*
+; CHECK: ret
+define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) {
+ %add = add nsw i32 %arg1, %arg2
+ %zextadd = zext i32 %add to i64
+ %base = inttoptr i64 %zextadd to i8*
+ %res = load i8* %base
+ ret i8 %res
+}
+
+; Check that we correctly promote both operands of the promotable add with zext.
+; CHECK-LABEL: @twoArgsPromotionZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg1 to i64
+; CHECK: [[ARG2ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg2 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], [[ARG2ZEXT]]
+; CHECK: inttoptr i64 [[PROMOTED]] to i8*
+; CHECK: ret
+define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) {
+ %add = add nuw i32 %arg1, %arg2
+ %zextadd = zext i32 %add to i64
+ %base = inttoptr i64 %zextadd to i8*
+ %res = load i8* %base
+ ret i8 %res
+}
+
+; Check that we correctly promote constant arguments.
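+; (Illustrative arithmetic: the i8 constant -1 has bit pattern 0xff, so under
+; zext promotion it has to reappear as the unsigned i64 value 255, as the
+; PROMOTED line below checks.)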
+; CHECK-LABEL: @oneArgPromotionNegativeCstZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 255
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, i8* %base) {
+ %add = add nuw i8 %arg1, -1
+ %zextadd = zext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we are able to merge two zero extensions.
+; CHECK-LABEL: @oneArgPromotionZExtZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionZExtZExt(i8 %arg1, i8* %base) {
+ %zext = zext i8 %arg1 to i32
+ %add = add nuw i32 %zext, 1
+ %zextadd = zext i32 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we do not promote truncate when the dropped bits
+; are of a different kind.
+; CHECK-LABEL: @oneArgPromotionBlockTruncZExt
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32
+; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1TRUNC]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, i8* %base) {
+ %sextarg1 = sext i1 %arg1 to i32
+ %trunc = trunc i32 %sextarg1 to i8
+ %add = add nuw i8 %trunc, 1
+ %zextadd = zext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we are able to promote truncate when we know all the bits
+; that are dropped.
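+; (Sketch: the i32 value is zero-extended from i1, so it is 0 or 1 and every
+; bit the trunc drops is known zero; the trunc can thus be bypassed and the
+; whole chain collapses to the single "zext i1 %arg1 to i64" checked below.)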
+; CHECK-LABEL: @oneArgPromotionPassTruncZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i1 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, i8* %base) {
+ %sextarg1 = zext i1 %arg1 to i32
+ %trunc = trunc i32 %sextarg1 to i8
+ %add = add nuw i8 %trunc, 1
+ %zextadd = zext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we do not promote sext with zext.
+; CHECK-LABEL: @oneArgPromotionBlockSExtZExt
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i8
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1SEXT]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionBlockSExtZExt(i1 %arg1, i8* %base) {
+ %sextarg1 = sext i1 %arg1 to i8
+ %add = add nuw i8 %sextarg1, 1
+ %zextadd = zext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
diff --git a/test/CodeGen/X86/coff-comdat.ll b/test/CodeGen/X86/coff-comdat.ll
index bf27b2f..ac4546d 100644
--- a/test/CodeGen/X86/coff-comdat.ll
+++ b/test/CodeGen/X86/coff-comdat.ll
@@ -73,19 +73,19 @@ $vftable = comdat largest
; CHECK: .globl @v8@0
; CHECK: .section .text,"xr",discard,@f8@0
; CHECK: .globl @f8@0
-; CHECK: .section .bss,"bw",associative,_f1
+; CHECK: .section .bss,"wb",associative,_f1
; CHECK: .globl _v1
-; CHECK: .section .bss,"bw",associative,_f2
+; CHECK: .section .bss,"wb",associative,_f2
; CHECK: .globl _v2
-; CHECK: .section .bss,"bw",associative,_f3
+; CHECK: .section .bss,"wb",associative,_f3
; CHECK: .globl _v3
-; CHECK: .section .bss,"bw",associative,_f4
+; CHECK: .section .bss,"wb",associative,_f4
; CHECK: .globl _v4
-; CHECK: .section .bss,"bw",associative,_f5
+; CHECK: .section .bss,"wb",associative,_f5
; CHECK: .globl _v5
-; CHECK: .section .bss,"bw",associative,_f6
+; CHECK: .section .bss,"wb",associative,_f6
; CHECK: .globl _v6
-; CHECK: .section .bss,"bw",same_size,_f6
+; CHECK: .section .bss,"wb",same_size,_f6
; CHECK: .globl _f6
; CHECK: .section .rdata,"rd",largest,_vftable
; CHECK: .globl _vftable
diff --git a/test/CodeGen/X86/coff-comdat2.ll b/test/CodeGen/X86/coff-comdat2.ll
index 6744b5b..58bc04e 100644
--- a/test/CodeGen/X86/coff-comdat2.ll
+++ b/test/CodeGen/X86/coff-comdat2.ll
@@ -6,4 +6,4 @@ target triple = "i686-pc-windows-msvc"
$foo = comdat largest
@foo = global i32 0
@bar = global i32 0, comdat $foo
-; CHECK: Associative COMDAT symbol 'foo' is not a key for it's COMDAT.
+; CHECK: Associative COMDAT symbol 'foo' is not a key for its COMDAT.
diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll
new file mode 100644
index 0000000..59a7a19
--- /dev/null
+++ b/test/CodeGen/X86/combine-and.ll
@@ -0,0 +1,164 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s
+;
+; Verify that the DAGCombiner is able to fold a vector AND into a blend
+; if one of the operands to the AND is a vector of all constants, and each
+; constant element is either zero or all-ones.
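+;
+; For example (assuming the compiler materializes a zero vector in %xmm1, as
+; the patterns below suggest): ANDing %A with <i32 -1, i32 0, i32 0, i32 0>
+; keeps element 0 and zeroes the rest, which is exactly a blend taking element
+; 0 from %A and elements 1-3 from the zero vector.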
+
+
+define <4 x i32> @test1(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 0>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test1
+; CHECK: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test2(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 0>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test2
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test3(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 0>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test3
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test4(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 0, i32 0, i32 0, i32 -1>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test4
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test5(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test5
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test6(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test6
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test7(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 -1>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test7
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test8(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 -1>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test8
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test9(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 0>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test9
+; CHECK: movq %xmm0, %xmm0
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test10(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 0>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test10
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test11(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test11
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test12(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 -1, i32 0>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test12
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test13(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 -1>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test13
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test14(<4 x i32> %A) {
+ %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1>
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test14
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) {
+ %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1>
+ %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 0>
+ %3 = or <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+; CHECK-LABEL: test15
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) {
+ %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0>
+ %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 -1>
+ %3 = or <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+; CHECK-LABEL: test16
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) {
+ %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1>
+ %2 = and <4 x i32> %B, <i32 -1, i32 0, i32 -1, i32 0>
+ %3 = or <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+; CHECK-LABEL: test17
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index ff807b9..9539eae 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@@ -5,265 +5,293 @@
; instruction which performs a blend operation.
define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT: retq
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
-; CHECK-LABEL: test1
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NOT: orps
-; CHECK: ret
define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
-; CHECK-LABEL: test2
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
-; CHECK-LABEL: test3
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NEXT: ret
define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
-; CHECK-LABEL: test4
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
-; CHECK-LABEL: test5
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NEXT: ret
define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test6:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
-; CHECK-LABEL: test6
-; CHECK-NOT: xorps
-; CHECK: blendps $12
-; CHECK-NEXT: ret
define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test7:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT: retq
%and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
%and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
%or = or <4 x i32> %and1, %and2
ret <4 x i32> %or
}
-; CHECK-LABEL: test7
-; CHECK-NOT: xorps
-; CHECK: blendps $12
-; CHECK-NEXT: ret
define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test8:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT: retq
%and1 = and <2 x i64> %a, <i64 -1, i64 0>
%and2 = and <2 x i64> %b, <i64 0, i64 -1>
%or = or <2 x i64> %and1, %and2
ret <2 x i64> %or
}
-; CHECK-LABEL: test8
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NOT: orps
-; CHECK: ret
define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test9:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
%and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
%and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
%or = or <4 x i32> %and1, %and2
ret <4 x i32> %or
}
-; CHECK-LABEL: test9
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test10:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
%and1 = and <2 x i64> %a, <i64 0, i64 -1>
%and2 = and <2 x i64> %b, <i64 -1, i64 0>
%or = or <2 x i64> %and1, %and2
ret <2 x i64> %or
}
-; CHECK-LABEL: test10
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NEXT: ret
define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test11:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; CHECK-NEXT: retq
%and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
%and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
%or = or <4 x i32> %and1, %and2
ret <4 x i32> %or
}
-; CHECK-LABEL: test11
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test12:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-NEXT: retq
%and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
%and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
%or = or <4 x i32> %and1, %and2
ret <4 x i32> %or
}
-; CHECK-LABEL: test12
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NEXT: ret
; Verify that the following test cases are folded into single shuffles.
define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test13:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
+; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
-; CHECK-LABEL: test13
-; CHECK-NOT: xorps
-; CHECK: shufps
-; CHECK-NEXT: ret
define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test14:
+; CHECK: # BB#0:
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: retq
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
-; CHECK-LABEL: test14
-; CHECK-NOT: pslldq
-; CHECK-NOT: por
-; CHECK: punpcklqdq
-; CHECK-NEXT: ret
define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test15:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
-; CHECK-LABEL: test15
-; CHECK-NOT: xorps
-; CHECK: shufps
-; CHECK-NOT: shufps
-; CHECK-NOT: orps
-; CHECK: ret
define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test16:
+; CHECK: # BB#0:
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
-; CHECK-LABEL: test16
-; CHECK-NOT: pslldq
-; CHECK-NOT: por
-; CHECK: punpcklqdq
-; CHECK: ret
; Verify that the dag-combiner does not fold an OR of two shuffles into a single
; shuffle instruction when the shuffle indexes are not compatible.
define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test17:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm2, %xmm2
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0]
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2]
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; CHECK-NEXT: orps %xmm1, %xmm2
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
-; CHECK-LABEL: test17
-; CHECK: por
-; CHECK-NEXT: ret
define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test18:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm2, %xmm2
+; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
-; CHECK-LABEL: test18
-; CHECK: orps
-; CHECK: ret
define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test19:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm2, %xmm2
+; CHECK-NEXT: xorps %xmm3, %xmm3
+; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,2]
+; CHECK-NEXT: orps %xmm3, %xmm2
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
-; CHECK-LABEL: test19
-; CHECK: por
-; CHECK-NEXT: ret
define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test20:
+; CHECK: # BB#0:
+; CHECK-NEXT: orps %xmm1, %xmm0
+; CHECK-NEXT: movq %xmm0, %xmm0
+; CHECK-NEXT: retq
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
-; CHECK-LABEL: test20
-; CHECK-NOT: xorps
-; CHECK: orps
-; CHECK-NEXT: movq
-; CHECK-NEXT: ret
define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test21:
+; CHECK: # BB#0:
+; CHECK-NEXT: orps %xmm1, %xmm0
+; CHECK-NEXT: movq %xmm0, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: retq
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
%or = or <2 x i64> %shuf1, %shuf2
ret <2 x i64> %or
}
-; CHECK-LABEL: test21
-; CHECK: por
-; CHECK-NEXT: pslldq
-; CHECK-NEXT: ret
+; Verify that the DAGCombiner doesn't crash when checking whether a shuffle
+; with an illegal type has a legal mask. Method 'isShuffleMaskLegal' only
+; knows how to handle legal vector value types.
+define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
+; CHECK-LABEL: test_crash:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i8> %shuf1, %shuf2
+ ret <4 x i8> %or
+}
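+; (The illegal <4 x i8> operands are presumably promoted to <4 x i32> during
+; type legalization, after which the or-of-shuffles is matched as the blend
+; checked above.)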
diff --git a/test/CodeGen/X86/combine-vec-shuffle-2.ll b/test/CodeGen/X86/combine-vec-shuffle-2.ll
deleted file mode 100644
index 7ab7f80..0000000
--- a/test/CodeGen/X86/combine-vec-shuffle-2.ll
+++ /dev/null
@@ -1,164 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
-
-; Check that DAGCombiner correctly folds the following pairs of shuffles
-; using the following rules:
-; 1. shuffle(shuffle(x, y), undef) -> x
-; 2. shuffle(shuffle(x, y), undef) -> y
-; 3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef)
-; 4. shuffle(shuffle(x, y), undef) -> shuffle(undef, y)
-;
-; Rules 3. and 4. are used only if the resulting shuffle mask is legal.
-
-define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test1
-; Mask: [3,0,0,1]
-; CHECK: pshufd $67
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test2
-; Mask: [2,0,0,3]
-; CHECK: pshufd $-62
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test3
-; Mask: [2,0,0,3]
-; CHECK: pshufd $-62
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test4
-; Mask: [0,0,0,1]
-; CHECK: pshufd $64
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test5
-; Mask: [1,1]
-; CHECK: movhlps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test6(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test6
-; Mask: [2,0,0,0]
-; CHECK: pshufd $2
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test7(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test7
-; Mask: [0,2,0,2]
-; CHECK: pshufd $-120
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test8
-; Mask: [1,0,3,0]
-; CHECK: pshufd $49
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test9(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test9
-; Mask: [1,3,0,2]
-; CHECK: pshufd $-115
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test10(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test10
-; Mask: [1,0,1,0]
-; CHECK: pshufd $17
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test11(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test11
-; Mask: [1,0,2,1]
-; CHECK: pshufd $97
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test12(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test12
-; Mask: [0,0,0,0]
-; CHECK: pshufd $0
-; CHECK-NEXT: ret
-
-
-; The following pair of shuffles is folded into vector %A.
-define <4 x i32> @test13(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test13
-; CHECK-NOT: pshufd
-; CHECK: ret
-
-
-; The following pair of shuffles is folded into vector %B.
-define <4 x i32> @test14(<4 x i32> %A, <4 x i32> %B) {
- %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
- %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
- ret <4 x i32> %2
-}
-; CHECK-LABEL: test14
-; CHECK-NOT: pshufd
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/combine-vec-shuffle.ll b/test/CodeGen/X86/combine-vec-shuffle.ll
deleted file mode 100644
index 9e6ab89..0000000
--- a/test/CodeGen/X86/combine-vec-shuffle.ll
+++ /dev/null
@@ -1,253 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
-
-; Verify that the DAGCombiner correctly folds according to the following rules:
-
-; fold (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C)
-; fold (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C)
-; fold (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0)
-
-; fold (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B))
-; fold (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B))
-; fold (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B))
-
-
-
-define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
- %and = and <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %and
-}
-; CHECK-LABEL: test1
-; CHECK-NOT: pshufd
-; CHECK: pand
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-; CHECK-LABEL: test2
-; CHECK-NOT: pshufd
-; CHECK: por
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
- %xor = xor <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %xor
-}
-; CHECK-LABEL: test3
-; CHECK-NOT: pshufd
-; CHECK: pxor
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
- %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
- %and = and <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %and
-}
-; CHECK-LABEL: test4
-; CHECK-NOT: pshufd
-; CHECK: pand
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
- %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-; CHECK-LABEL: test5
-; CHECK-NOT: pshufd
-; CHECK: por
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
- %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
- %xor = xor <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %xor
-}
-; CHECK-LABEL: test6
-; CHECK-NOT: pshufd
-; CHECK: pxor
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
-; are not performing a swizzle operations.
-
-define <4 x i32> @test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %and = and <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %and
-}
-; CHECK-LABEL: test1b
-; CHECK-NOT: blendps
-; CHECK: andps
-; CHECK-NEXT: blendps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-; CHECK-LABEL: test2b
-; CHECK-NOT: blendps
-; CHECK: orps
-; CHECK-NEXT: blendps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %xor = xor <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %xor
-}
-; CHECK-LABEL: test3b
-; CHECK-NOT: blendps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: blendps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %and = and <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %and
-}
-; CHECK-LABEL: test4b
-; CHECK-NOT: blendps
-; CHECK: andps
-; CHECK-NEXT: blendps
-; CHECK: ret
-
-
-define <4 x i32> @test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-; CHECK-LABEL: test5b
-; CHECK-NOT: blendps
-; CHECK: orps
-; CHECK-NEXT: blendps
-; CHECK: ret
-
-
-define <4 x i32> @test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
- %xor = xor <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %xor
-}
-; CHECK-LABEL: test6b
-; CHECK-NOT: blendps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: blendps
-; CHECK: ret
-
-define <4 x i32> @test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %and = and <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %and
-}
-; CHECK-LABEL: test1c
-; CHECK-NOT: shufps
-; CHECK: andps
-; CHECK-NEXT: shufps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-; CHECK-LABEL: test2c
-; CHECK-NOT: shufps
-; CHECK: orps
-; CHECK-NEXT: shufps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %xor = xor <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %xor
-}
-; CHECK-LABEL: test3c
-; CHECK-NOT: shufps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: shufps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %and = and <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %and
-}
-; CHECK-LABEL: test4c
-; CHECK-NOT: shufps
-; CHECK: andps
-; CHECK-NEXT: shufps
-; CHECK: ret
-
-
-define <4 x i32> @test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-; CHECK-LABEL: test5c
-; CHECK-NOT: shufps
-; CHECK: orps
-; CHECK-NEXT: shufps
-; CHECK: ret
-
-
-define <4 x i32> @test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
- %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
- %xor = xor <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %xor
-}
-; CHECK-LABEL: test6c
-; CHECK-NOT: shufps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: shufps
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/commute-blend-avx2.ll b/test/CodeGen/X86/commute-blend-avx2.ll
new file mode 100644
index 0000000..d06c6da
--- /dev/null
+++ b/test/CodeGen/X86/commute-blend-avx2.ll
@@ -0,0 +1,89 @@
+; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=avx2 < %s | FileCheck %s
+
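+; These tests fold the load into the blend. Only the second (r/m) source of a
+; blend can come from memory, so the backend has to commute each blend and
+; invert its mask; e.g. in the first test, mask 17 (0b00010001: words 0 and 4
+; from %a) becomes 238 (0b11101110: the remaining words from memory), which is
+; what the shuffle comment in the CHECK line encodes.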
+define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 {
+ %1 = load <8 x i16>* %b
+ %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
+ ret <8 x i16> %2
+
+ ;CHECK-LABEL: commute_fold_vpblendw_128
+ ;CHECK: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+ ;CHECK-NEXT: retq
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 {
+ %1 = load <16 x i16>* %b
+ %2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %1, <16 x i16> %a, i8 17)
+ ret <16 x i16> %2
+
+ ;CHECK-LABEL: commute_fold_vpblendw_256
+ ;CHECK: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
+ ;CHECK-NEXT: retq
+}
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone
+
+define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 {
+ %1 = load <4 x i32>* %b
+ %2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1)
+ ret <4 x i32> %2
+
+ ;CHECK-LABEL: commute_fold_vpblendd_128
+ ;CHECK: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+ ;CHECK-NEXT: retq
+}
+declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 {
+ %1 = load <8 x i32>* %b
+ %2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129)
+ ret <8 x i32> %2
+
+ ;CHECK-LABEL: commute_fold_vpblendd_256
+ ;CHECK: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
+ ;CHECK-NEXT: retq
+}
+declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
+
+define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 {
+ %1 = load <4 x float>* %b
+ %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3)
+ ret <4 x float> %2
+
+ ;CHECK-LABEL: commute_fold_vblendps_128
+ ;CHECK: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+ ;CHECK-NEXT: retq
+}
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 {
+ %1 = load <8 x float>* %b
+ %2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %1, <8 x float> %a, i8 7)
+ ret <8 x float> %2
+
+ ;CHECK-LABEL: commute_fold_vblendps_256
+ ;CHECK: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7]
+ ;CHECK-NEXT: retq
+}
+declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 {
+ %1 = load <2 x double>* %b
+ %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
+ ret <2 x double> %2
+
+ ;CHECK-LABEL: commute_fold_vblendpd_128
+ ;CHECK: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+ ;CHECK-NEXT: retq
+}
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 {
+ %1 = load <4 x double>* %b
+ %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7)
+ ret <4 x double> %2
+
+ ;CHECK-LABEL: commute_fold_vblendpd_256
+ ;CHECK: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
+ ;CHECK-NEXT: retq
+}
+declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/commute-blend-sse41.ll b/test/CodeGen/X86/commute-blend-sse41.ll
new file mode 100644
index 0000000..59fef8c
--- /dev/null
+++ b/test/CodeGen/X86/commute-blend-sse41.ll
@@ -0,0 +1,34 @@
+; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=corei7 < %s | FileCheck %s
+
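+; Same idea as commute-blend-avx2.ll, for the SSE4.1 forms: folding the load
+; requires commuting the blend and inverting its immediate mask.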
+define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 {
+ %1 = load <8 x i16>* %b
+ %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
+ ret <8 x i16> %2
+
+ ;CHECK-LABEL: commute_fold_pblendw
+ ;CHECK: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+ ;CHECK-NEXT: retq
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 {
+ %1 = load <4 x float>* %b
+ %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3)
+ ret <4 x float> %2
+
+ ;CHECK-LABEL: commute_fold_blendps
+ ;CHECK: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+ ;CHECK-NEXT: retq
+}
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 {
+ %1 = load <2 x double>* %b
+ %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
+ ret <2 x double> %2
+
+ ;CHECK-LABEL: commute_fold_blendpd
+ ;CHECK: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+ ;CHECK-NEXT: retq
+}
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/commuted-blend-mask.ll b/test/CodeGen/X86/commuted-blend-mask.ll
new file mode 100644
index 0000000..e6322cb
--- /dev/null
+++ b/test/CodeGen/X86/commuted-blend-mask.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s
+
+; When commuting the operands of an SSE blend, make sure that the resulting
+; blend mask can still be encoded as an imm8.
+; Previously, when commuting the operands to the shuffle in function @test,
+; the backend produced the following assembly:
+; pblendw $4294967103, %xmm1, %xmm0
+
+define <4 x i32> @test(<4 x i32> %a, <4 x i32> %b) {
+ ;CHECK: pblendw $63, %xmm1, %xmm0
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+ ret <4 x i32> %shuffle
+}
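+
+; A sketch of the mask arithmetic, assuming set pblendw bits select the second
+; source: the shuffle takes words 0-5 from %xmm1 and words 6-7 from %xmm0,
+; i.e. $63 = 0b00111111. The old $4294967103 is 0xFFFFFF3F = ~0xC0 computed in
+; 32 bits: the commuted mask was inverted without being truncated to an imm8.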
diff --git a/test/CodeGen/X86/constant-pool-remat-0.ll b/test/CodeGen/X86/constant-pool-remat-0.ll
index 4a01108..e42a87c 100644
--- a/test/CodeGen/X86/constant-pool-remat-0.ll
+++ b/test/CodeGen/X86/constant-pool-remat-0.ll
@@ -1,7 +1,7 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-linux -regalloc=greedy | FileCheck %s
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-linux -mattr=+sse2 | FileCheck %s
; CHECK: LCPI
; CHECK: LCPI
; CHECK: LCPI
diff --git a/test/CodeGen/X86/constant-pool-sharing.ll b/test/CodeGen/X86/constant-pool-sharing.ll
index 26318dd..3682165 100644
--- a/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/test/CodeGen/X86/constant-pool-sharing.ll
@@ -1,12 +1,13 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s --check-prefix=COMMON --check-prefix=LINUX
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s --check-prefix=COMMON --check-prefix=MSVC
; llc should share constant pool entries between this integer vector
; and this floating-point vector since they have the same encoding.
-; CHECK: LCPI0_0(%rip), %xmm0
-; CHECK: movaps %xmm0, ({{%rdi|%rcx}})
-; CHECK: movaps %xmm0, ({{%rsi|%rdx}})
+; LINUX: LCPI0_0(%rip), %xmm0
+; MSVC: __xmm@40000000400000004000000040000000(%rip), %xmm0
+; COMMON: movaps %xmm0, ({{%rdi|%rcx}})
+; COMMON: movaps %xmm0, ({{%rsi|%rdx}})
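+; (The shared entry is the 16-byte pattern 0x40000000 x 4, where each lane is
+; the IEEE-754 encoding of float 2.0, as the MSVC symbol name above shows.)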
define void @foo(<4 x i32>* %p, <4 x float>* %q, i1 %t) nounwind {
entry:
diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll
index b578896..7160dcc 100644
--- a/test/CodeGen/X86/constructor.ll
+++ b/test/CodeGen/X86/constructor.ll
@@ -1,6 +1,8 @@
-; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=CTOR %s
-; RUN: llc -mtriple x86_64-pc-linux -use-init-array < %s | FileCheck --check-prefix=INIT-ARRAY %s
-@llvm.global_ctors = appending global [2 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @f }, { i32, void ()* } { i32 15, void ()* @g }]
+; RUN: llc -mtriple x86_64-pc-linux -use-ctors < %s | FileCheck --check-prefix=CTOR %s
+; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=INIT-ARRAY %s
+@llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}, { i32, void ()*, i8* } { i32 15, void ()* @g, i8* @v }]
+
+@v = weak_odr global i8 0
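+; The third field of each @llvm.global_ctors entry is an associated global:
+; when it is non-null (@v for @g), the ctor goes in a comdat keyed on that
+; symbol, which is what the "G" section flag checked below verifies.
+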
define void @f() {
entry:
@@ -12,14 +14,14 @@ entry:
ret void
}
-; CTOR: .section .ctors.65520,"aw",@progbits
+; CTOR: .section .ctors.65520,"aGw",@progbits,v,comdat
; CTOR-NEXT: .align 8
; CTOR-NEXT: .quad g
; CTOR-NEXT: .section .ctors,"aw",@progbits
; CTOR-NEXT: .align 8
; CTOR-NEXT: .quad f
-; INIT-ARRAY: .section .init_array.15,"aw",@init_array
+; INIT-ARRAY: .section .init_array.15,"aGw",@init_array,v,comdat
; INIT-ARRAY-NEXT: .align 8
; INIT-ARRAY-NEXT: .quad g
; INIT-ARRAY-NEXT: .section .init_array,"aw",@init_array
diff --git a/test/CodeGen/X86/cvt16.ll b/test/CodeGen/X86/cvt16.ll
index 951b5c3..4d920e2 100644
--- a/test/CodeGen/X86/cvt16.ll
+++ b/test/CodeGen/X86/cvt16.ll
@@ -21,7 +21,7 @@
define void @test1(float %src, i16* %dest) {
- %1 = tail call i16 @llvm.convert.to.fp16(float %src)
+ %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src)
store i16 %1, i16* %dest, align 2
ret void
}
@@ -34,7 +34,7 @@ define void @test1(float %src, i16* %dest) {
define float @test2(i16* nocapture %src) {
%1 = load i16* %src, align 2
- %2 = tail call float @llvm.convert.from.fp16(i16 %1)
+ %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1)
ret float %2
}
; CHECK-LABEL: test2:
@@ -45,8 +45,8 @@ define float @test2(i16* nocapture %src) {
define float @test3(float %src) nounwind uwtable readnone {
- %1 = tail call i16 @llvm.convert.to.fp16(float %src)
- %2 = tail call float @llvm.convert.from.fp16(i16 %1)
+ %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src)
+ %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1)
ret float %2
}
@@ -59,6 +59,31 @@ define float @test3(float %src) nounwind uwtable readnone {
; F16C-NEXT: vcvtph2ps
; F16C: ret
-declare float @llvm.convert.from.fp16(i16) nounwind readnone
-declare i16 @llvm.convert.to.fp16(float) nounwind readnone
+define double @test4(i16* nocapture %src) {
+ %1 = load i16* %src, align 2
+ %2 = tail call double @llvm.convert.from.fp16.f64(i16 %1)
+ ret double %2
+}
+; CHECK-LABEL: test4:
+; LIBCALL: callq __gnu_h2f_ieee
+; LIBCALL: cvtss2sd
+; SOFTFLOAT: callq __gnu_h2f_ieee
+; SOFTFLOAT: callq __extendsfdf2
+; F16C: vcvtph2ps
+; F16C: vcvtss2sd
+; F16C: ret
+
+
+define i16 @test5(double %src) {
+ %val = tail call i16 @llvm.convert.to.fp16.f64(double %src)
+ ret i16 %val
+}
+; CHECK-LABEL: test5:
+; LIBCALL: jmp __truncdfhf2
+; SOFTFLOAT: callq __truncdfhf2
+; F16C: jmp __truncdfhf2
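+
+; Note that F16C only provides half<->float conversions, so half->double goes
+; via float (vcvtph2ps + cvtss2sd above) and double->half still ends in the
+; __truncdfhf2 libcall even when F16C is available.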
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index 4912213..d0791dc 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -52,48 +52,48 @@ define void @_Z3barii(i32 %param1, i32 %param2) #0 {
entry:
%var1 = alloca %struct.AAA3, align 1
%var2 = alloca %struct.AAA3, align 1
- tail call void @llvm.dbg.value(metadata !{i32 %param1}, i64 0, metadata !30), !dbg !47
- tail call void @llvm.dbg.value(metadata !{i32 %param2}, i64 0, metadata !31), !dbg !47
- tail call void @llvm.dbg.value(metadata !48, i64 0, metadata !32), !dbg !49
+ tail call void @llvm.dbg.value(metadata !{i32 %param1}, i64 0, metadata !30, metadata !{metadata !"0x102"}), !dbg !47
+ tail call void @llvm.dbg.value(metadata !{i32 %param2}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !47
+ tail call void @llvm.dbg.value(metadata !48, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !49
%tobool = icmp eq i32 %param2, 0, !dbg !50
br i1 %tobool, label %if.end, label %if.then, !dbg !50
if.then: ; preds = %entry
%call = tail call i8* @_Z5i2stri(i32 %param2), !dbg !52
- tail call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !32), !dbg !49
+ tail call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !49
br label %if.end, !dbg !54
if.end: ; preds = %entry, %if.then
- tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33), !dbg !55
- tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !56), !dbg !57
- tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59), !dbg !60
+ tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33, metadata !{metadata !"0x102"}), !dbg !55
+ tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !56, metadata !{metadata !"0x102"}), !dbg !57
+ tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59, metadata !{metadata !"0x102"}), !dbg !60
%arraydecay.i = getelementptr inbounds %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !61
call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !61
- call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34), !dbg !63
- call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !64), !dbg !65
- call void @llvm.dbg.value(metadata !58, i64 0, metadata !66), !dbg !67
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !63
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !64, metadata !{metadata !"0x102"}), !dbg !65
+ call void @llvm.dbg.value(metadata !58, i64 0, metadata !66, metadata !{metadata !"0x102"}), !dbg !67
%arraydecay.i5 = getelementptr inbounds %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !68
call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !68
%tobool1 = icmp eq i32 %param1, 0, !dbg !69
- call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34), !dbg !63
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !63
br i1 %tobool1, label %if.else, label %if.then2, !dbg !69
if.then2: ; preds = %if.end
- call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !71), !dbg !73
- call void @llvm.dbg.value(metadata !74, i64 0, metadata !75), !dbg !76
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !71, metadata !{metadata !"0x102"}), !dbg !73
+ call void @llvm.dbg.value(metadata !74, i64 0, metadata !75, metadata !{metadata !"0x102"}), !dbg !76
call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)), !dbg !76
br label %if.end3, !dbg !72
if.else: ; preds = %if.end
- call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !77), !dbg !79
- call void @llvm.dbg.value(metadata !80, i64 0, metadata !81), !dbg !82
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !77, metadata !{metadata !"0x102"}), !dbg !79
+ call void @llvm.dbg.value(metadata !80, i64 0, metadata !81, metadata !{metadata !"0x102"}), !dbg !82
call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)), !dbg !82
br label %if.end3
if.end3: ; preds = %if.else, %if.then2
- call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33), !dbg !55
- call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !83), !dbg !85
- call void @llvm.dbg.value(metadata !58, i64 0, metadata !86), !dbg !87
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33, metadata !{metadata !"0x102"}), !dbg !55
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !83, metadata !{metadata !"0x102"}), !dbg !85
+ call void @llvm.dbg.value(metadata !58, i64 0, metadata !86, metadata !{metadata !"0x102"}), !dbg !87
call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !87
ret void, !dbg !88
}
@@ -103,7 +103,7 @@ declare i8* @_Z5i2stri(i32) #1
declare void @_Z3fooPcjPKc(i8*, i32, i8*) #1
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
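+; The new fourth operand is the variable's complex-expression metadata; the
+; !{metadata !"0x102"} values threaded through the calls above are presumably
+; the empty expression in the updated debug-info encoding.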
attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -113,92 +113,92 @@ attributes #2 = { nounwind readnone }
!llvm.module.flags = !{!44, !45}
!llvm.ident = !{!46}
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !3, metadata !23, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !23, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus]
!1 = metadata !{metadata !"dbg-changes-codegen-branch-folding.cpp", metadata !"/tmp/dbginfo"}
!2 = metadata !{}
!3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"AAA3", i32 4, i64 32, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00AAA3\004\0032\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ]
!5 = metadata !{metadata !6, metadata !11, metadata !17, metadata !18}
-!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS4AAA3", metadata !"text", i32 8, i64 32, i64 8, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ]
-!7 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 32, i64 8, i32 0, i32 0, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
-!8 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!6 = metadata !{metadata !"0xd\00text\008\0032\008\000\000", metadata !1, metadata !"_ZTS4AAA3", metadata !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ]
+!7 = metadata !{metadata !"0x1\00\000\0032\008\000\000", null, null, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
+!8 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
!9 = metadata !{metadata !10}
-!10 = metadata !{i32 786465, i64 0, i64 4} ; [ DW_TAG_subrange_type ] [0, 3]
-!11 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"AAA3", metadata !"AAA3", metadata !"", i32 5, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 5} ; [ DW_TAG_subprogram ] [line 5] [AAA3]
-!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x21\000\004"} ; [ DW_TAG_subrange_type ] [0, 3]
+!11 = metadata !{metadata !"0x2e\00AAA3\00AAA3\00\005\000\000\000\006\00256\001\005", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [AAA3]
+!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!13 = metadata !{null, metadata !14, metadata !15}
-!14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3]
-!15 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!16 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
-!17 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator=", metadata !"operator=", metadata !"_ZN4AAA3aSEPKc", i32 6, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 6} ; [ DW_TAG_subprogram ] [line 6] [operator=]
-!18 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator const char *", metadata !"operator const char *", metadata !"_ZNK4AAA3cvPKcEv", i32 7, metadata !19, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 7} ; [ DW_TAG_subprogram ] [line 7] [operator const char *]
-!19 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3]
+!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!16 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
+!17 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\000\000\006\00256\001\006", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 6] [operator=]
+!18 = metadata !{metadata !"0x2e\00operator const char *\00operator const char *\00_ZNK4AAA3cvPKcEv\007\000\000\000\006\00256\001\007", metadata !1, metadata !"_ZTS4AAA3", metadata !19, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [operator const char *]
+!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!20 = metadata !{metadata !15, metadata !21}
-!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
-!22 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3]
+!21 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
+!22 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3]
!23 = metadata !{metadata !24, metadata !35, metadata !40}
-!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"bar", metadata !"bar", metadata !"_Z3barii", i32 11, metadata !26, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32, i32)* @_Z3barii, null, null, metadata !29, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
-!25 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
-!26 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barii\0011\000\001\000\006\00256\001\0011", metadata !1, metadata !25, metadata !26, null, void (i32, i32)* @_Z3barii, null, null, metadata !29} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
+!25 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!26 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!27 = metadata !{null, metadata !28, metadata !28}
-!28 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!28 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
!29 = metadata !{metadata !30, metadata !31, metadata !32, metadata !33, metadata !34}
-!30 = metadata !{i32 786689, metadata !24, metadata !"param1", metadata !25, i32 16777227, metadata !28, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [param1] [line 11]
-!31 = metadata !{i32 786689, metadata !24, metadata !"param2", metadata !25, i32 33554443, metadata !28, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [param2] [line 11]
-!32 = metadata !{i32 786688, metadata !24, metadata !"temp", metadata !25, i32 12, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [temp] [line 12]
-!33 = metadata !{i32 786688, metadata !24, metadata !"var1", metadata !25, i32 17, metadata !"_ZTS4AAA3", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var1] [line 17]
-!34 = metadata !{i32 786688, metadata !24, metadata !"var2", metadata !25, i32 18, metadata !"_ZTS4AAA3", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var2] [line 18]
-!35 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator=", metadata !"operator=", metadata !"_ZN4AAA3aSEPKc", i32 6, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !17, metadata !36, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=]
+!30 = metadata !{metadata !"0x101\00param1\0016777227\000", metadata !24, metadata !25, metadata !28} ; [ DW_TAG_arg_variable ] [param1] [line 11]
+!31 = metadata !{metadata !"0x101\00param2\0033554443\000", metadata !24, metadata !25, metadata !28} ; [ DW_TAG_arg_variable ] [param2] [line 11]
+!32 = metadata !{metadata !"0x100\00temp\0012\000", metadata !24, metadata !25, metadata !15} ; [ DW_TAG_auto_variable ] [temp] [line 12]
+!33 = metadata !{metadata !"0x100\00var1\0017\000", metadata !24, metadata !25, metadata !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var1] [line 17]
+!34 = metadata !{metadata !"0x100\00var2\0018\000", metadata !24, metadata !25, metadata !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var2] [line 18]
+!35 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\001\000\006\00256\001\006", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, metadata !17, metadata !36} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=]
!36 = metadata !{metadata !37, metadata !39}
-!37 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!38 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3]
-!39 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [value] [line 6]
-!40 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"AAA3", metadata !"AAA3", metadata !"_ZN4AAA3C2EPKc", i32 5, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !11, metadata !41, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3]
+!37 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!38 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3]
+!39 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!40 = metadata !{metadata !"0x2e\00AAA3\00AAA3\00_ZN4AAA3C2EPKc\005\000\001\000\006\00256\001\005", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, metadata !11, metadata !41} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3]
!41 = metadata !{metadata !42, metadata !43}
-!42 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!43 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!42 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!43 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15} ; [ DW_TAG_arg_variable ] [value] [line 5]
!44 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!45 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!45 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
!46 = metadata !{metadata !"clang version 3.5.0 "}
!47 = metadata !{i32 11, i32 0, metadata !24, null}
!48 = metadata !{i8* null}
!49 = metadata !{i32 12, i32 0, metadata !24, null}
!50 = metadata !{i32 14, i32 0, metadata !51, null}
-!51 = metadata !{i32 786443, metadata !1, metadata !24, i32 14, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!51 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
!52 = metadata !{i32 15, i32 0, metadata !53, null}
-!53 = metadata !{i32 786443, metadata !1, metadata !51, i32 14, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!53 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !51} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
!54 = metadata !{i32 16, i32 0, metadata !53, null}
!55 = metadata !{i32 17, i32 0, metadata !24, null}
-!56 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !55} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!56 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38, metadata !55} ; [ DW_TAG_arg_variable ] [this] [line 0]
!57 = metadata !{i32 0, i32 0, metadata !40, metadata !55}
!58 = metadata !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)}
-!59 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, metadata !55} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!59 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15, metadata !55} ; [ DW_TAG_arg_variable ] [value] [line 5]
!60 = metadata !{i32 5, i32 0, metadata !40, metadata !55}
!61 = metadata !{i32 5, i32 0, metadata !62, metadata !55}
-!62 = metadata !{i32 786443, metadata !1, metadata !40, i32 5, i32 0, i32 0, i32 3} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!62 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !40} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
!63 = metadata !{i32 18, i32 0, metadata !24, null}
-!64 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !63} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!64 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38, metadata !63} ; [ DW_TAG_arg_variable ] [this] [line 0]
!65 = metadata !{i32 0, i32 0, metadata !40, metadata !63}
-!66 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, metadata !63} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!66 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15, metadata !63} ; [ DW_TAG_arg_variable ] [value] [line 5]
!67 = metadata !{i32 5, i32 0, metadata !40, metadata !63}
!68 = metadata !{i32 5, i32 0, metadata !62, metadata !63}
!69 = metadata !{i32 20, i32 0, metadata !70, null}
-!70 = metadata !{i32 786443, metadata !1, metadata !24, i32 20, i32 0, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
-!71 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !72} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!70 = metadata !{metadata !"0xb\0020\000\000", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!71 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !72} ; [ DW_TAG_arg_variable ] [this] [line 0]
!72 = metadata !{i32 21, i32 0, metadata !70, null}
!73 = metadata !{i32 0, i32 0, metadata !35, metadata !72}
!74 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)}
-!75 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !72} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!75 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !72} ; [ DW_TAG_arg_variable ] [value] [line 6]
!76 = metadata !{i32 6, i32 0, metadata !35, metadata !72}
-!77 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !78} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!77 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !78} ; [ DW_TAG_arg_variable ] [this] [line 0]
!78 = metadata !{i32 23, i32 0, metadata !70, null}
!79 = metadata !{i32 0, i32 0, metadata !35, metadata !78}
!80 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)}
-!81 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !78} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!81 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !78} ; [ DW_TAG_arg_variable ] [value] [line 6]
!82 = metadata !{i32 6, i32 0, metadata !35, metadata !78}
-!83 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !84} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!83 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !84} ; [ DW_TAG_arg_variable ] [this] [line 0]
!84 = metadata !{i32 24, i32 0, metadata !24, null}
!85 = metadata !{i32 0, i32 0, metadata !35, metadata !84}
-!86 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !84} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!86 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !84} ; [ DW_TAG_arg_variable ] [value] [line 6]
!87 = metadata !{i32 6, i32 0, metadata !35, metadata !84}
!88 = metadata !{i32 25, i32 0, metadata !24, null}
diff --git a/test/CodeGen/X86/dbg-changes-codegen.ll b/test/CodeGen/X86/dbg-changes-codegen.ll
index 0b17c45..aae95e8 100644
--- a/test/CodeGen/X86/dbg-changes-codegen.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen.ll
@@ -44,7 +44,7 @@
define zeroext i1 @_ZN3Foo3batEv(%struct.Foo* %this) #0 align 2 {
entry:
%0 = load %struct.Foo** @pfoo, align 8
- tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62)
+ tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62, metadata !{metadata !"0x102"})
%cmp.i = icmp eq %struct.Foo* %0, %this
ret i1 %cmp.i
}
@@ -53,7 +53,7 @@ entry:
define void @_Z3bazv() #1 {
entry:
%0 = load %struct.Wibble** @wibble1, align 8
- tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65)
+ tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65, metadata !{metadata !"0x102"})
%1 = load %struct.Wibble** @wibble2, align 8
%cmp.i = icmp ugt %struct.Wibble* %1, %0
br i1 %cmp.i, label %if.then.i, label %_ZN7Flibble3barEP6Wibble.exit
@@ -69,15 +69,15 @@ _ZN7Flibble3barEP6Wibble.exit: ; preds = %entry, %if.then.i
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
-!17 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo]
-!45 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble]
-!62 = metadata !{i32 786689, null, metadata !"arg", null, i32 33554436, metadata !17, i32 0, null} ; [ DW_TAG_arg_variable ] [arg] [line 4]
+!17 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo]
+!45 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble]
+!62 = metadata !{metadata !"0x101\00arg\0033554436\000", null, null, metadata !17} ; [ DW_TAG_arg_variable ] [arg] [line 4]
!64 = metadata !{%struct.Flibble* undef}
-!65 = metadata !{i32 786689, null, metadata !"this", null, i32 16777229, metadata !45, i32 1088, null} ; [ DW_TAG_arg_variable ] [this] [line 13]
+!65 = metadata !{metadata !"0x101\00this\0016777229\001088", null, null, metadata !45} ; [ DW_TAG_arg_variable ] [this] [line 13]
diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll
index 21225e3..fd07a3f 100644
--- a/test/CodeGen/X86/divide-by-constant.ll
+++ b/test/CodeGen/X86/divide-by-constant.ll
@@ -31,6 +31,7 @@ entry:
; CHECK-LABEL: test3:
; CHECK: movzbl 8(%esp), %eax
; CHECK-NEXT: imull $171, %eax
+; CHECK-NEXT: andl $65024, %eax
; CHECK-NEXT: shrl $9, %eax
; CHECK-NEXT: ret
}
@@ -56,9 +57,10 @@ entry:
%div = sdiv i16 %x, 10
ret i16 %div
; CHECK-LABEL: test6:
-; CHECK: imull $26215, %eax, %ecx
-; CHECK: sarl $18, %ecx
-; CHECK: shrl $15, %eax
+; CHECK: imull $26215, %eax
+; CHECK: movl %eax, %ecx
+; CHECK: shrl $31, %ecx
+; CHECK: sarl $18, %eax
}
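; The checked sequence is the standard "magic number" strength reduction: for
; test6, 26215 = ceil(2^18 / 10), so for any i16 x the quotient is
; (x * 26215) >> 18, corrected toward zero by adding the product's sign bit.
; A minimal IR sketch of that expansion (the function name is illustrative,
; not part of this test):
;   define i16 @sdiv16_by_10(i16 %x) {
;     %sx  = sext i16 %x to i32    ; widen so the product cannot overflow
;     %mul = mul i32 %sx, 26215    ; 26215 = ceil(2^18 / 10)
;     %q   = ashr i32 %mul, 18     ; floor division by 2^18
;     %sgn = lshr i32 %mul, 31     ; 1 iff the product (and hence x) is negative
;     %fix = add i32 %q, %sgn      ; round the quotient toward zero
;     %r   = trunc i32 %fix to i16
;     ret i16 %r
;   }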
define i32 @test7(i32 %x) nounwind {
diff --git a/test/CodeGen/X86/divrem8_ext.ll b/test/CodeGen/X86/divrem8_ext.ll
new file mode 100644
index 0000000..ec367c8
--- /dev/null
+++ b/test/CodeGen/X86/divrem8_ext.ll
@@ -0,0 +1,100 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-64
+; RUN: llc -march=x86 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-32
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define zeroext i8 @test_udivrem_zext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_udivrem_zext_ah
+; CHECK: divb
+; CHECK: movzbl %ah, [[REG_REM:%[a-z0-9]+]]
+; CHECK: movb %al, ([[REG_ZPTR:%[a-z0-9]+]])
+; CHECK: movl [[REG_REM]], %eax
+; CHECK: ret
+ %div = udiv i8 %x, %y
+ store i8 %div, i8* @z
+ %1 = urem i8 %x, %y
+ ret i8 %1
+}
+
+define zeroext i8 @test_urem_zext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_zext_ah
+; CHECK: divb
+; CHECK: movzbl %ah, %eax
+; CHECK: ret
+ %1 = urem i8 %x, %y
+ ret i8 %1
+}
+
+define i8 @test_urem_noext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_noext_ah
+; CHECK: divb [[REG_X:%[a-z0-9]+]]
+; CHECK: movzbl %ah, %eax
+; CHECK: addb [[REG_X]], %al
+; CHECK: ret
+ %1 = urem i8 %x, %y
+ %2 = add i8 %1, %y
+ ret i8 %2
+}
+
+define i64 @test_urem_zext64_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_zext64_ah
+; CHECK: divb
+; CHECK: movzbl %ah, %eax
+; CHECK-32: xorl %edx, %edx
+; CHECK: ret
+ %1 = urem i8 %x, %y
+ %2 = zext i8 %1 to i64
+ ret i64 %2
+}
+
+define signext i8 @test_sdivrem_sext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_sdivrem_sext_ah
+; CHECK: cbtw
+; CHECK: idivb
+; CHECK: movsbl %ah, [[REG_REM:%[a-z0-9]+]]
+; CHECK: movb %al, ([[REG_ZPTR]])
+; CHECK: movl [[REG_REM]], %eax
+; CHECK: ret
+ %div = sdiv i8 %x, %y
+ store i8 %div, i8* @z
+ %1 = srem i8 %x, %y
+ ret i8 %1
+}
+
+define signext i8 @test_srem_sext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_sext_ah
+; CHECK: cbtw
+; CHECK: idivb
+; CHECK: movsbl %ah, %eax
+; CHECK: ret
+ %1 = srem i8 %x, %y
+ ret i8 %1
+}
+
+define i8 @test_srem_noext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_noext_ah
+; CHECK: cbtw
+; CHECK: idivb [[REG_X:%[a-z0-9]+]]
+; CHECK: movsbl %ah, %eax
+; CHECK: addb [[REG_X]], %al
+; CHECK: ret
+ %1 = srem i8 %x, %y
+ %2 = add i8 %1, %y
+ ret i8 %2
+}
+
+define i64 @test_srem_sext64_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_sext64_ah
+; CHECK: cbtw
+; CHECK: idivb
+; CHECK: movsbl %ah, %eax
+; CHECK-32: movl %eax, %edx
+; CHECK-32: sarl $31, %edx
+; CHECK-64: movsbq %al, %rax
+; CHECK: ret
+ %1 = srem i8 %x, %y
+ %2 = sext i8 %1 to i64
+ ret i64 %2
+}
+
+@z = external global i8
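+
+; Note: divb/idivb leave the quotient in %al and the remainder in %ah, which
+; is why the checks above recover the remainder with a single movzbl/movsbl
+; of %ah instead of issuing a second division.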
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
index 0d5afa1..c673f5d 100644
--- a/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -70,7 +70,7 @@ define weak_odr dllexport void @weak1() {
; CHECK: .weak weak_alias
; CHECK: weak_alias = f1
-@weak_alias = dllexport alias weak_odr void()* @f1
+@weak_alias = weak_odr dllexport alias void()* @f1
@blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16
@blob_alias = dllexport alias bitcast ([6 x i8]* @blob to i32 ()*)
diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index e2c3f13..5035aa1 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll
@@ -89,7 +89,7 @@ define weak_odr dllexport void @weak1() {
; CHECK: .weak _weak_alias
; CHECK: _weak_alias = _f1
-@weak_alias = dllexport alias weak_odr void()* @f1
+@weak_alias = weak_odr dllexport alias void()* @f1
; CHECK: .section .drectve
diff --git a/test/CodeGen/X86/dllimport-x86_64.ll b/test/CodeGen/X86/dllimport-x86_64.ll
index 666409f..839bca4 100644
--- a/test/CodeGen/X86/dllimport-x86_64.ll
+++ b/test/CodeGen/X86/dllimport-x86_64.ll
@@ -4,7 +4,7 @@
; RUN: llc -mtriple x86_64-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
; PR6275
;
-; RUN: opt -mtriple x86_64-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT
+; RUN: opt -mtriple x86_64-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT
@Var1 = external dllimport global i32
@Var2 = available_externally dllimport unnamed_addr constant i32 1
diff --git a/test/CodeGen/X86/dllimport.ll b/test/CodeGen/X86/dllimport.ll
index 695bfce..231ad65 100644
--- a/test/CodeGen/X86/dllimport.ll
+++ b/test/CodeGen/X86/dllimport.ll
@@ -4,7 +4,7 @@
; RUN: llc -mtriple i386-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
; PR6275
;
-; RUN: opt -mtriple i386-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT
+; RUN: opt -mtriple i386-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT
@Var1 = external dllimport global i32
@Var2 = available_externally dllimport unnamed_addr constant i32 1
diff --git a/test/CodeGen/X86/dont-trunc-store-double-to-float.ll b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll
new file mode 100644
index 0000000..24d9533
--- /dev/null
+++ b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=x86 < %s | FileCheck %s
+
+; CHECK-LABEL: @bar
+; CHECK: movl $1074339512,
+; CHECK: movl $1374389535,
+; CHECK: movl $1078523331,
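+; Two of these immediates are the 32-bit halves of the IEEE-754 bit pattern
+; of the double 3.14 (0x40091EB851EB851F) and the third is the bit pattern of
+; the float 3.14 (0x4048F5C3): the double must be stored at full width rather
+; than being truncated to the float's value.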
+define void @bar() unnamed_addr {
+entry-block:
+ %a = alloca double
+ %b = alloca float
+
+ store double 3.140000e+00, double* %a
+ %0 = load double* %a
+
+ %1 = fptrunc double %0 to float
+
+ store float %1, float* %b
+
+ ret void
+}
diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll
index c8d7527..872f7fa 100644
--- a/test/CodeGen/X86/dwarf-comp-dir.ll
+++ b/test/CodeGen/X86/dwarf-comp-dir.ll
@@ -7,15 +7,15 @@ target triple = "x86_64-unknown-linux-gnu"
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!5}
-!0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 143523)\001\00\000\00\000", metadata !4, metadata !2, metadata !7, metadata !2, metadata !2, null} ; [ DW_TAG_compile_unit ]
!2 = metadata !{}
-!3 = metadata !{i32 786473, metadata !4} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x29", metadata !4} ; [ DW_TAG_file_type ]
!4 = metadata !{metadata !"empty.c", metadata !"/home/nlewycky"}
-!6 = metadata !{i32 786451, metadata !4, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !4, null, null, metadata !2, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
!7 = metadata !{metadata !6}
; The important part of the following check is that dir = #0.
; Dir Mod Time File Len File Name
; ---- ---------- ---------- ---------------------------
; CHECK: file_names[ 1] 0 0x00000000 0x00000000 empty.c
-!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/dynamic-alloca-lifetime.ll b/test/CodeGen/X86/dynamic-alloca-lifetime.ll
new file mode 100644
index 0000000..f019bed
--- /dev/null
+++ b/test/CodeGen/X86/dynamic-alloca-lifetime.ll
@@ -0,0 +1,44 @@
+; RUN: llc -no-stack-coloring=false < %s | FileCheck %s
+
+; This test crashed in PEI because the stack protector was dead. It was dead
+; because its slot had been colored, which in turn was caused by incorrect
+; lifetimes being applied to the stack protector's frame index.
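+; (Stack coloring merges stack slots whose lifetime ranges do not overlap;
+; the stack protector's slot must be exempt from this so that it stays live
+; for PEI.)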
+
+; CHECK: stack_chk_guard
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.10.0"
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: ssp
+define void @foo(i1 %cond1, i1 %cond2) #1 {
+entry:
+ %bitmapBuffer = alloca [8192 x i8], align 1
+ br i1 %cond1, label %end1, label %bb1
+
+bb1:
+ %bitmapBuffer229 = alloca [8192 x i8], align 1
+ br i1 %cond2, label %end1, label %if.else130
+
+end1:
+ ret void
+
+if.else130: ; preds = %bb1
+ %tmp = getelementptr inbounds [8192 x i8]* %bitmapBuffer, i32 0, i32 0
+ call void @llvm.lifetime.start(i64 8192, i8* %tmp) #0
+ call void @llvm.lifetime.end(i64 8192, i8* %tmp) #0
+ %tmp25 = getelementptr inbounds [8192 x i8]* %bitmapBuffer229, i32 0, i32 0
+ call void @llvm.lifetime.start(i64 8192, i8* %tmp25) #0
+ call void @llvm.lifetime.end(i64 8192, i8* %tmp25) #0
+ br label %end1
+}
+
+declare void @bar()
+
+attributes #0 = { nounwind }
+attributes #1 = { ssp }
\ No newline at end of file
diff --git a/test/CodeGen/X86/empty-functions.ll b/test/CodeGen/X86/empty-functions.ll
index ac5174d..4234968 100644
--- a/test/CodeGen/X86/empty-functions.ll
+++ b/test/CodeGen/X86/empty-functions.ll
@@ -1,10 +1,14 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=CHECK-NO-FP %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=LINUX-NO-FP %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -disable-fp-elim | FileCheck -check-prefix=LINUX-FP %s
define void @func() {
entry:
unreachable
}
+
+; MachO cannot handle an empty function.
; CHECK-NO-FP: _func:
; CHECK-NO-FP-NEXT: .cfi_startproc
; CHECK-NO-FP: nop
@@ -21,5 +25,30 @@ entry:
; CHECK-FP-NEXT: movq %rsp, %rbp
; CHECK-FP-NEXT: :
; CHECK-FP-NEXT: .cfi_def_cfa_register %rbp
-; CHECK-FP-NEXT: nop
; CHECK-FP-NEXT: .cfi_endproc
+
+; An empty function is perfectly fine on ELF.
+; LINUX-NO-FP: func:
+; LINUX-NO-FP-NEXT: .cfi_startproc
+; LINUX-NO-FP-NEXT: {{^}}#
+; LINUX-NO-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-NO-FP-NEXT: .size func, .L{{.*}}-func
+; LINUX-NO-FP-NEXT: .cfi_endproc
+
+; A cfi directive can point to the end of a function. It (and in fact the
+; entire body) could be optimized out because of the unreachable, but we
+; don't do it right now.
+; LINUX-FP: func:
+; LINUX-FP-NEXT: .cfi_startproc
+; LINUX-FP-NEXT: {{^}}#
+; LINUX-FP-NEXT: pushq %rbp
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_def_cfa_offset 16
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_offset %rbp, -16
+; LINUX-FP-NEXT: movq %rsp, %rbp
+; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_def_cfa_register %rbp
+; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .size func, .Ltmp3-func
+; LINUX-FP-NEXT: .cfi_endproc
diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll
index a18f751..ab92fe0 100644
--- a/test/CodeGen/X86/exedepsfix-broadcast.ll
+++ b/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -93,10 +93,11 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %
; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
-; ExeDepsFix works top down, thus it coalesces vmovlhps domain with
-; vandps and there is nothing more you can do to match vmaxpd.
-; CHECK: vmovlhps
-; CHECK: vandps
+; ExeDepsFix works top-down, so it coalesces the vpunpcklqdq domain with
+; vpand, and there is nothing more we can do to match vmaxpd.
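+; (ExeDepsFix reassigns the execution domain of domain-ambiguous vector
+; instructions so that chained operations stay in one domain and avoid
+; domain-crossing penalties.)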
+; CHECK: vmovq
+; CHECK: vpbroadcastq
+; CHECK: vpand
; CHECK: vmaxpd
; CHECK: ret
define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll
index cadc0fb..8647599 100644
--- a/test/CodeGen/X86/extractelement-load.ll
+++ b/test/CodeGen/X86/extractelement-load.ll
@@ -1,6 +1,8 @@
; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=yonah | FileCheck %s
; RUN: llc < %s -march=x86-64 -mattr=+sse2 -mcpu=core2 | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
define i32 @t(<2 x i64>* %val) nounwind {
; CHECK-LABEL: t:
; CHECK-NOT: movd
@@ -23,3 +25,40 @@ undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
%y = extractelement <8 x i32> %Shuff68, i32 0
ret i32 %y
}
+
+; This case could easily end up inf-looping in the DAG combiner due to a
+; low-alignment load of the vector, which prevents us from reliably forming a
+; narrow load.
+; FIXME: It would be nice to detect whether the target has fast and legal
+; unaligned loads and use them here.
+define void @t3() {
+; CHECK-LABEL: t3:
+;
+; This moves the entire vector, shuffling the high double down. If we fixed
+; the FIXME above, it would just move the high double directly.
+; CHECK: movupd
+; CHECK: shufpd
+; CHECK: movlpd
+
+bb:
+ %tmp13 = load <2 x double>* undef, align 1
+ %.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1
+ store double %.sroa.3.24.vec.extract, double* undef, align 8
+ unreachable
+}
+
+; Case where a load is unary shuffled, then bitcast (to a type with the same
+; number of elements) before extractelement.
+; This tests for an assertion: the extraction assumed that the undef second
+; shuffle operand had the post-bitcast type instead of the pre-bitcast type.
+define i64 @t4(<2 x double>* %a) {
+; CHECK-LABEL: t4:
+; CHECK: mov
+; CHECK: ret
+ %b = load <2 x double>* %a, align 16
+ %c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0>
+ %d = bitcast <2 x double> %c to <2 x i64>
+ %e = extractelement <2 x i64> %d, i32 1
+ ret i64 %e
+}
+
diff --git a/test/CodeGen/X86/fast-isel-args-fail.ll b/test/CodeGen/X86/fast-isel-args-fail.ll
index 7467edd..7e783d2 100644
--- a/test/CodeGen/X86/fast-isel-args-fail.ll
+++ b/test/CodeGen/X86/fast-isel-args-fail.ll
@@ -1,7 +1,6 @@
; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-apple-darwin10
; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN32
; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win64 | FileCheck %s -check-prefix=WIN64
-; REQUIRES: asserts
; Previously, this would cause an assert.
define i31 @t1(i31 %a, i31 %b, i31 %c) {
diff --git a/test/CodeGen/X86/fast-isel-cmp-branch3.ll b/test/CodeGen/X86/fast-isel-cmp-branch3.ll
index a3f6851..0df782d 100644
--- a/test/CodeGen/X86/fast-isel-cmp-branch3.ll
+++ b/test/CodeGen/X86/fast-isel-cmp-branch3.ll
@@ -351,7 +351,7 @@ bb1:
define i32 @icmp_eq(i32 %x) {
; CHECK-LABEL: icmp_eq
; CHECK-NOT: cmpl
-; CHECK: movl $0, %eax
+; CHECK: xorl %eax, %eax
%1 = icmp eq i32 %x, %x
br i1 %1, label %bb1, label %bb2
bb2:
@@ -387,7 +387,7 @@ bb1:
define i32 @icmp_uge(i32 %x) {
; CHECK-LABEL: icmp_uge
; CHECK-NOT: cmpl
-; CHECK: movl $0, %eax
+; CHECK: xorl %eax, %eax
%1 = icmp uge i32 %x, %x
br i1 %1, label %bb1, label %bb2
bb2:
@@ -411,7 +411,7 @@ bb1:
define i32 @icmp_ule(i32 %x) {
; CHECK-LABEL: icmp_ule
; CHECK-NOT: cmpl
-; CHECK: movl $0, %eax
+; CHECK: xorl %eax, %eax
%1 = icmp ule i32 %x, %x
br i1 %1, label %bb1, label %bb2
bb2:
@@ -435,7 +435,7 @@ bb1:
define i32 @icmp_sge(i32 %x) {
; CHECK-LABEL: icmp_sge
; CHECK-NOT: cmpl
-; CHECK: movl $0, %eax
+; CHECK: xorl %eax, %eax
%1 = icmp sge i32 %x, %x
br i1 %1, label %bb1, label %bb2
bb2:
@@ -459,7 +459,7 @@ bb1:
define i32 @icmp_sle(i32 %x) {
; CHECK-LABEL: icmp_sle
; CHECK-NOT: cmpl
-; CHECK: movl $0, %eax
+; CHECK: xorl %eax, %eax
%1 = icmp sle i32 %x, %x
br i1 %1, label %bb1, label %bb2
bb2:
diff --git a/test/CodeGen/X86/fast-isel-constpool.ll b/test/CodeGen/X86/fast-isel-constpool.ll
index bbbaeb2..4e6f7c0 100644
--- a/test/CodeGen/X86/fast-isel-constpool.ll
+++ b/test/CodeGen/X86/fast-isel-constpool.ll
@@ -1,19 +1,23 @@
-; RUN: llc < %s -fast-isel | FileCheck %s
-; CHECK: LCPI0_0(%rip)
+; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=small < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=large < %s | FileCheck %s --check-prefix=LARGE
-; Make sure fast isel uses rip-relative addressing when required.
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-target triple = "x86_64-apple-darwin9.0"
+; Make sure fast isel uses rip-relative addressing for the small code model.
+define float @constpool_float(float %x) {
+; CHECK-LABEL: constpool_float
+; CHECK: LCPI0_0(%rip)
-define i32 @f0(double %x) nounwind {
-entry:
- %retval = alloca i32 ; <i32*> [#uses=2]
- %x.addr = alloca double ; <double*> [#uses=2]
- store double %x, double* %x.addr
- %tmp = load double* %x.addr ; <double> [#uses=1]
- %cmp = fcmp olt double %tmp, 8.500000e-01 ; <i1> [#uses=1]
- %conv = zext i1 %cmp to i32 ; <i32> [#uses=1]
- store i32 %conv, i32* %retval
- %0 = load i32* %retval ; <i32> [#uses=1]
- ret i32 %0
+; LARGE-LABEL: constpool_float
+; LARGE: movabsq $LCPI0_0, %rax
+ %1 = fadd float %x, 16.50e+01
+ ret float %1
+}
+
+define double @constpool_double(double %x) nounwind {
+; CHECK-LABEL: constpool_double
+; CHECK: LCPI1_0(%rip)
+
+; LARGE-LABEL: constpool_double
+; LARGE: movabsq $LCPI1_0, %rax
+ %1 = fadd double %x, 8.500000e-01
+ ret double %1
}
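
; Note: the small code model assumes all code and data live within 2GB of
; each other, so the constant pool is reachable with RIP-relative addressing;
; the large code model drops that assumption, so the pool address is first
; materialized into a register with movabsq.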
diff --git a/test/CodeGen/X86/fast-isel-mem.ll b/test/CodeGen/X86/fast-isel-mem.ll
index cd2dc1d..eca1ae9 100644
--- a/test/CodeGen/X86/fast-isel-mem.ll
+++ b/test/CodeGen/X86/fast-isel-mem.ll
@@ -36,11 +36,11 @@ entry:
store i32 (...)** getelementptr ([4 x i32 (...)*]* @LotsStuff, i32 0, i32 2), i32 (...)*** null, align 4
ret void
; CHECK: _t:
-; CHECK: movl $0, %eax
+; CHECK: xorl %eax, %eax
; CHECK: movl L_LotsStuff$non_lazy_ptr, %ecx
; ATOM: _t:
; ATOM: movl L_LotsStuff$non_lazy_ptr, %e{{..}}
-; ATOM: movl $0, %e{{..}}
+; ATOM: xorl %e{{..}}, %e{{..}}
}
diff --git a/test/CodeGen/X86/fast-isel-tls.ll b/test/CodeGen/X86/fast-isel-tls.ll
index f71abd2..686df43 100644
--- a/test/CodeGen/X86/fast-isel-tls.ll
+++ b/test/CodeGen/X86/fast-isel-tls.ll
@@ -13,7 +13,7 @@ entry:
; CHECK: leal v@TLSGD
; CHECK: __tls_get_addr
-@alias = alias internal i32* @v
+@alias = internal alias i32* @v
define i32 @f_alias() nounwind {
entry:
%t = load i32* @v
diff --git a/test/CodeGen/X86/fast-isel-x32.ll b/test/CodeGen/X86/fast-isel-x32.ll
new file mode 100644
index 0000000..d49a108
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-x32.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl -fast-isel -fast-isel-abort | FileCheck %s
+
+; Test that alloca addresses are materialized with an instruction of the
+; right size.
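+; On x32 (and NaCl), pointers are 32 bits wide even though the target is
+; x86-64, so the address must be formed with a 32-bit leal into %edi rather
+; than a 64-bit leaq into %rdi.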
+
+declare void @bar(i32* %arg)
+
+; CHECK-LABEL: @foo
+define void @foo() {
+ %a = alloca i32
+; CHECK: leal {{.*}}, %edi
+ call void @bar(i32* %a)
+ ret void
+}
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index f7d2750..3747d04 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -144,7 +144,7 @@ if.end: ; preds = %if.then, %entry
; CHECK-LABEL: test12:
; CHECK: testb $1,
; CHECK-NEXT: je L
-; CHECK-NEXT: movl $0, %edi
+; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: callq
}
@@ -154,7 +154,7 @@ define void @test13() nounwind {
call void @test13f(i1 0)
ret void
; CHECK-LABEL: test13:
-; CHECK: movl $0, %edi
+; CHECK: xorl %edi, %edi
; CHECK-NEXT: callq
}
@@ -194,12 +194,10 @@ define void @test16() nounwind {
br label %block2
block2:
-; CHECK: movabsq $1
-; CHECK: cvtsi2sdq {{.*}} %xmm0
+; CHECK: movsd LCP{{.*}}_{{.*}}(%rip), %xmm0
; CHECK: movb $1, %al
; CHECK: callq _test16callee
-; AVX: movabsq $1
; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
; AVX: movb $1, %al
; AVX: callq _test16callee
@@ -280,7 +278,7 @@ entry:
call void @foo22(i32 3)
ret void
; CHECK-LABEL: test22:
-; CHECK: movl $0, %edi
+; CHECK: xorl %edi, %edi
; CHECK: callq _foo22
; CHECK: movl $1, %edi
; CHECK: callq _foo22
@@ -304,3 +302,13 @@ define void @test23(i8* noalias sret %result) {
}
declare i8* @foo23()
+
+declare void @takesi32ptr(i32* %arg)
+
+; CHECK-LABEL: allocamaterialize
+define void @allocamaterialize() {
+ %a = alloca i32
+; CHECK: leaq {{.*}}, %rdi
+ call void @takesi32ptr(i32* %a)
+ ret void
+}
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index a212a7c..61e9b98 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -60,3 +60,21 @@ entry:
; CHECK: addl $28
}
declare fastcc void @test4fastccsret(%struct.a* sret)
+
+
+; Check that fast-isel cleans up when it fails to lower a call instruction.
+define void @test5() {
+entry:
+ %call = call i32 @test5dllimport(i32 42)
+ ret void
+; CHECK-LABEL: test5:
+; Local value area is still there:
+; CHECK: movl $42, {{%[a-z]+}}
+; Fast-ISel's arg push is not here:
+; CHECK-NOT: movl $42, (%esp)
+; SDag-ISel's arg push:
+; CHECK: movl %esp, [[REGISTER:%[a-z]+]]
+; CHECK: movl $42, ([[REGISTER]])
+; CHECK: movl __imp__test5dllimport
+}
+declare dllimport i32 @test5dllimport(i32)
diff --git a/test/CodeGen/X86/fastmath-optnone.ll b/test/CodeGen/X86/fastmath-optnone.ll
new file mode 100644
index 0000000..0caadff
--- /dev/null
+++ b/test/CodeGen/X86/fastmath-optnone.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mcpu=corei7 -march=x86-64 -mattr=+sse2 | FileCheck %s
+; Verify that floating-point operations inside 'optnone' functions
+; are not optimized even if unsafe-fp-math is set.
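+; With unsafe-fp-math enabled, the two fast fadds in @foo reassociate into a
+; single multiply by 3.0; @fooWithOptnone must keep both adds, as the CHECK
+; lines below verify.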
+
+define float @foo(float %x) #0 {
+entry:
+ %add = fadd fast float %x, %x
+ %add1 = fadd fast float %add, %x
+ ret float %add1
+}
+
+; CHECK-LABEL: @foo
+; CHECK-NOT: add
+; CHECK: mul
+; CHECK-NOT: add
+; CHECK: ret
+
+define float @fooWithOptnone(float %x) #1 {
+entry:
+ %add = fadd fast float %x, %x
+ %add1 = fadd fast float %add, %x
+ ret float %add1
+}
+
+; CHECK-LABEL: @fooWithOptnone
+; CHECK-NOT: mul
+; CHECK: add
+; CHECK-NOT: mul
+; CHECK: add
+; CHECK-NOT: mul
+; CHECK: ret
+
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { noinline optnone "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma-intrinsics-x86_64.ll
index 494cb28..aadd731 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma-intrinsics-x86_64.ll
@@ -1,316 +1,278 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK
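+
+; (FMA3 mnemonics encode the operand roles in their digits: in vfmadd213, the
+; destination is multiplied by the second source and the third source is
+; added, whereas FMA4 instructions carry a separate destination operand.)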
; VFMADD
define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK: vfmaddss
- %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
- ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
- ; CHECK: vfmaddss (%{{.*}})
- %x = load float *%a2
- %y = insertelement <4 x float> undef, float %x, i32 0
- %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) ; <i64> [#uses=1]
- ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
- ; CHECK: vfmaddss %{{.*}}, (%{{.*}})
- %x = load float *%a1
- %y = insertelement <4 x float> undef, float %x, i32 0
- %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmaddss
+ ; CHECK-FMA: vfmadd213ss
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
ret < 4 x float > %res
}
declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
define < 2 x double > @test_x86_fma_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK: vfmaddsd
- %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
- ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
- ; CHECK: vfmaddsd (%{{.*}})
- %x = load double *%a2
- %y = insertelement <2 x double> undef, double %x, i32 0
- %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) ; <i64> [#uses=1]
- ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
- ; CHECK: vfmaddsd %{{.*}}, (%{{.*}})
- %x = load double *%a1
- %y = insertelement <2 x double> undef, double %x, i32 0
- %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmaddsd
+ ; CHECK-FMA: vfmadd213sd
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
ret < 2 x double > %res
}
declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
define < 4 x float > @test_x86_fma_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK: vfmaddps
- %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
- ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
- ; CHECK: vfmaddps (%{{.*}})
- %x = load <4 x float>* %a2
- %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) ; <i64> [#uses=1]
- ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
- ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
- %x = load <4 x float>* %a1
- %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmaddps
+ ; CHECK-FMA: vfmadd213ps
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
ret < 4 x float > %res
}
declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-; To test execution dependency
-define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) {
- ; CHECK: vmovaps
- ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
- %x = load <4 x float>* %a0
- %y = load <4 x float>* %a1
- %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
- ret < 4 x float > %res
-}
-
define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK: vfmaddpd
- %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
- ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
- ; CHECK: vfmaddpd (%{{.*}})
- %x = load <2 x double>* %a2
- %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) ; <i64> [#uses=1]
- ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
- ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
- %x = load <2 x double>* %a1
- %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmaddpd
+ ; CHECK-FMA: vfmadd213pd
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
ret < 2 x double > %res
}
declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-; To test execution dependency
-define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) {
- ; CHECK: vmovapd
- ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
- %x = load <2 x double>* %a0
- %y = load <2 x double>* %a1
- %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
- ret < 2 x double > %res
-}
-
define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK: vfmaddps
+ ; CHECK-FMA4: vfmaddps
+ ; CHECK-FMA: vfmadd213ps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
ret < 8 x float > %res
}
declare < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
define < 4 x double > @test_x86_fma_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK: vfmaddpd
+ ; CHECK-FMA4: vfmaddpd
+ ; CHECK-FMA: vfmadd213pd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
ret < 4 x double > %res
}
declare < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMSUB
define < 4 x float > @test_x86_fma_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK: vfmsubss
- %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmsubss
+ ; CHECK-FMA: vfmsub213ss
+ %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
ret < 4 x float > %res
}
declare < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
define < 2 x double > @test_x86_fma_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK: vfmsubsd
- %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmsubsd
+ ; CHECK-FMA: vfmsub213sd
+ %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
ret < 2 x double > %res
}
declare < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
define < 4 x float > @test_x86_fma_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK: vfmsubps
- %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmsubps
+ ; CHECK-FMA: vfmsub213ps
+ %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
ret < 4 x float > %res
}
declare < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
define < 2 x double > @test_x86_fma_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK: vfmsubpd
- %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmsubpd
+ ; CHECK-FMA: vfmsub213pd
+ %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
ret < 2 x double > %res
}
declare < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
define < 8 x float > @test_x86_fma_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK: vfmsubps
+ ; CHECK-FMA4: vfmsubps
+ ; CHECK-FMA: vfmsub213ps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
ret < 8 x float > %res
}
declare < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
define < 4 x double > @test_x86_fma_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK: vfmsubpd
+ ; CHECK-FMA4: vfmsubpd
+ ; CHECK-FMA: vfmsub213pd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
ret < 4 x double > %res
}
declare < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFNMADD
define < 4 x float > @test_x86_fma_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK: vfnmaddss
- %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfnmaddss
+ ; CHECK-FMA: vfnmadd213ss
+ %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
ret < 4 x float > %res
}
declare < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
define < 2 x double > @test_x86_fma_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK: vfnmaddsd
- %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfnmaddsd
+ ; CHECK-FMA: vfnmadd213sd
+ %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
ret < 2 x double > %res
}
declare < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
define < 4 x float > @test_x86_fma_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK: vfnmaddps
- %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfnmaddps
+ ; CHECK-FMA: vfnmadd213ps
+ %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
ret < 4 x float > %res
}
declare < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
define < 2 x double > @test_x86_fma_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK: vfnmaddpd
- %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfnmaddpd
+ ; CHECK-FMA: vfnmadd213pd
+ %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
ret < 2 x double > %res
}
declare < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
define < 8 x float > @test_x86_fma_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK: vfnmaddps
+ ; CHECK-FMA4: vfnmaddps
+ ; CHECK-FMA: vfnmadd213ps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
ret < 8 x float > %res
}
declare < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
define < 4 x double > @test_x86_fma_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK: vfnmaddpd
+ ; CHECK-FMA4: vfnmaddpd
+ ; CHECK-FMA: vfnmadd213pd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
ret < 4 x double > %res
}
declare < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFNMSUB
define < 4 x float > @test_x86_fma_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK: vfnmsubss
- %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfnmsubss
+ ; CHECK-FMA: vfnmsub213ss
+ %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
ret < 4 x float > %res
}
declare < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
define < 2 x double > @test_x86_fma_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK: vfnmsubsd
- %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfnmsubsd
+ ; CHECK-FMA: vfnmsub213sd
+ %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
ret < 2 x double > %res
}
declare < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
define < 4 x float > @test_x86_fma_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK: vfnmsubps
- %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfnmsubps
+ ; CHECK-FMA: vfnmsub213ps
+ %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
ret < 4 x float > %res
}
declare < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
define < 2 x double > @test_x86_fma_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK: vfnmsubpd
- %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfnmsubpd
+ ; CHECK-FMA: vfnmsub213pd
+ %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
ret < 2 x double > %res
}
declare < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
define < 8 x float > @test_x86_fma_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK: vfnmsubps
+ ; CHECK-FMA4: vfnmsubps
+ ; CHECK-FMA: vfnmsub213ps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
ret < 8 x float > %res
}
declare < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
define < 4 x double > @test_x86_fma_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK: vfnmsubpd
+ ; CHECK-FMA4: vfnmsubpd
+ ; CHECK-FMA: vfnmsub213pd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
ret < 4 x double > %res
}
declare < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMADDSUB
define < 4 x float > @test_x86_fma_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK: vfmaddsubps
- %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmaddsubps
+ ; CHECK-FMA: vfmaddsub213ps
+ %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
ret < 4 x float > %res
}
declare < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
define < 2 x double > @test_x86_fma_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK: vfmaddsubpd
- %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmaddsubpd
+ ; CHECK-FMA: vfmaddsub213pd
+ %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
ret < 2 x double > %res
}
declare < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
define < 8 x float > @test_x86_fma_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK: vfmaddsubps
+ ; CHECK-FMA4: vfmaddsubps
+ ; CHECK-FMA: vfmaddsub213ps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
ret < 8 x float > %res
}
declare < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
define < 4 x double > @test_x86_fma_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK: vfmaddsubpd
+ ; CHECK-FMA4: vfmaddsubpd
+ ; CHECK-FMA: vfmaddsub213pd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
ret < 4 x double > %res
}
declare < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMSUBADD
define < 4 x float > @test_x86_fma_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK: vfmsubaddps
- %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmsubaddps
+ ; CHECK-FMA: vfmsubadd213ps
+ %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
ret < 4 x float > %res
}
declare < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
define < 2 x double > @test_x86_fma_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK: vfmsubaddpd
- %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ; CHECK-FMA4: vfmsubaddpd
+ ; CHECK-FMA: vfmsubadd213pd
+ %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
ret < 2 x double > %res
}
declare < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
define < 8 x float > @test_x86_fma_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK: vfmsubaddps
+ ; CHECK-FMA4: vfmsubaddps
+ ; CHECK-FMA: vfmsubadd213ps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
ret < 8 x float > %res
}
declare < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
define < 4 x double > @test_x86_fma_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK: vfmsubaddpd
+ ; CHECK-FMA4: vfmsubaddpd
+ ; CHECK-FMA: vfmsubadd213pd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
ret < 4 x double > %res
}
declare < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
diff --git a/test/CodeGen/X86/fma-phi-213-to-231.ll b/test/CodeGen/X86/fma-phi-213-to-231.ll
new file mode 100644
index 0000000..9715bc7
--- /dev/null
+++ b/test/CodeGen/X86/fma-phi-213-to-231.ll
@@ -0,0 +1,246 @@
+; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
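+; In the 213 form the destination register is one of the multiplicands and
+; the addend is read-only, so a loop-carried accumulator would need a register
+; copy every iteration; in the 231 form the destination is the addend, so
+; selecting it for the PHI updates the accumulator in place across the loop.
+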
+; CHECK-LABEL: fmaddsubpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK: vfmaddsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
+; CHECK: cmpl {{%.+}}, [[INDREG]]
+; CHECK: jl [[BODYLBL]]
+define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubaddpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK: vfmsubadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
+; CHECK: cmpl {{%.+}}, [[INDREG]]
+; CHECK: jl [[BODYLBL]]
+define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmaddpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK: vfmadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
+; CHECK: cmpl {{%.+}}, [[INDREG]]
+; CHECK: jl [[BODYLBL]]
+define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK: vfmsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
+; CHECK: cmpl {{%.+}}, [[INDREG]]
+; CHECK: jl [[BODYLBL]]
+define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x double> %c.addr.0
+}
+
+declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+
+; CHECK-LABEL: fmaddsubps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK: vfmaddsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
+; CHECK: cmpl {{%.+}}, [[INDREG]]
+; CHECK: jl [[BODYLBL]]
+define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubaddps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK: vfmsubadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
+; CHECK: cmpl {{%.+}}, [[INDREG]]
+; CHECK: jl [[BODYLBL]]
+define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmaddps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK: vfmadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
+; CHECK: cmpl {{%.+}}, [[INDREG]]
+; CHECK: jl [[BODYLBL]]
+define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK: vfmsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
+; CHECK: cmpl {{%.+}}, [[INDREG]]
+; CHECK: jl [[BODYLBL]]
+define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <8 x float> %c.addr.0
+}
+
+declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 47252ec..2eb152b 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -43,8 +43,8 @@ entry:
}
; Test FMA3 variant selection
-; CHECK: fma3_select231ssX:
-; CHECK: vfmadd231ss xmm
+; CHECK-FMA-INST: fma3_select231ssX:
+; CHECK-FMA-INST: vfmadd231ss %xmm
define float @fma3_select231ssX(float %x, float %y) #0 {
entry:
br label %while.body
@@ -58,8 +58,8 @@ while.end: ; preds = %while.body, %entry
}
; Test FMA3 variant selection
-; CHECK: fma3_select231pdY:
-; CHECK: vfmadd231pd ymm
+; CHECK-FMA-INST: fma3_select231pdY:
+; CHECK-FMA-INST: vfmadd231pd %ymm
define <4 x double> @fma3_select231pdY(<4 x double> %x, <4 x double> %y) #0 {
entry:
br label %while.body
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll
new file mode 100644
index 0000000..64a2068
--- /dev/null
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll
@@ -0,0 +1,84 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
+
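+; Check that a load of one source operand is folded into the memory form of
+; the FMA4 instruction instead of being emitted as a separate load.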
+; VFMADD
+define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
+ ; CHECK: vfmaddss (%{{.*}})
+ %x = load float* %a2
+ %y = insertelement <4 x float> undef, float %x, i32 0
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y)
+ ret < 4 x float > %res
+}
+define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
+ ; CHECK: vfmaddss %{{.*}}, (%{{.*}})
+ %x = load float* %a1
+ %y = insertelement <4 x float> undef, float %x, i32 0
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2)
+ ret < 4 x float > %res
+}
+
+declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
+ ; CHECK: vfmaddsd (%{{.*}})
+ %x = load double* %a2
+ %y = insertelement <2 x double> undef, double %x, i32 0
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y)
+ ret < 2 x double > %res
+}
+define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
+ ; CHECK: vfmaddsd %{{.*}}, (%{{.*}})
+ %x = load double* %a1
+ %y = insertelement <2 x double> undef, double %x, i32 0
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2)
+ ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
+ ; CHECK: vfmaddps (%{{.*}})
+ %x = load <4 x float>* %a2
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x)
+ ret < 4 x float > %res
+}
+define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
+ ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
+ %x = load <4 x float>* %a1
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2)
+ ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+; To test the execution dependency: the first load stays a vmovaps and the
+; second is folded into the FMA.
+define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) {
+ ; CHECK: vmovaps
+ ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
+ %x = load <4 x float>* %a0
+ %y = load <4 x float>* %a1
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2)
+ ret < 4 x float > %res
+}
+
+define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
+ ; CHECK: vfmaddpd (%{{.*}})
+ %x = load <2 x double>* %a2
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x)
+ ret < 2 x double > %res
+}
+define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
+ ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
+ %x = load <2 x double>* %a1
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2)
+ ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+; To test the execution dependency: the first load stays a vmovapd and the
+; second is folded into the FMA.
+define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) {
+ ; CHECK: vmovapd
+ ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
+ %x = load <2 x double>* %a0
+ %y = load <2 x double>* %a1
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2)
+ ret < 2 x double > %res
+}
+
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index cfb598d..9b52db9 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -184,7 +184,7 @@ define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
; CHECK: test_x86_fmadd_ps_load
; CHECK: vmovaps (%rdi), %xmm2
-; CHECK: vfmadd213ps %xmm1, %xmm0, %xmm2
+; CHECK: vfmadd213ps %xmm1, %xmm2, %xmm0
; CHECK: ret
; CHECK_FMA4: test_x86_fmadd_ps_load
; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
@@ -198,7 +198,7 @@ define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4
; CHECK: test_x86_fmsub_ps_load
; CHECK: vmovaps (%rdi), %xmm2
-; CHECK: fmsub213ps %xmm1, %xmm0, %xmm2
+; CHECK: fmsub213ps %xmm1, %xmm2, %xmm0
; CHECK: ret
; CHECK_FMA4: test_x86_fmsub_ps_load
; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
diff --git a/test/CodeGen/X86/fmaxnum.ll b/test/CodeGen/X86/fmaxnum.ll
new file mode 100644
index 0000000..23678c4
--- /dev/null
+++ b/test/CodeGen/X86/fmaxnum.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=x86 -mtriple=i386-linux-gnu < %s | FileCheck %s
+
+declare float @fmaxf(float, float)
+declare double @fmax(double, double)
+declare x86_fp80 @fmaxl(x86_fp80, x86_fp80)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
+declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80)
+
+; CHECK-LABEL: @test_fmaxf
+; CHECK: calll fmaxf
+define float @test_fmaxf(float %x, float %y) {
+ %z = call float @fmaxf(float %x, float %y) readnone
+ ret float %z
+}
+
+; CHECK-LABEL: @test_fmax
+; CHECK: calll fmax
+define double @test_fmax(double %x, double %y) {
+ %z = call double @fmax(double %x, double %y) readnone
+ ret double %z
+}
+
+; CHECK-LABEL: @test_fmaxl
+; CHECK: calll fmaxl
+define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) {
+ %z = call x86_fp80 @fmaxl(x86_fp80 %x, x86_fp80 %y) readnone
+ ret x86_fp80 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxf
+; CHECK: calll fmaxf
+define float @test_intrinsic_fmaxf(float %x, float %y) {
+ %z = call float @llvm.maxnum.f32(float %x, float %y) readnone
+ ret float %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmax
+; CHECK: calll fmax
+define double @test_intrinsic_fmax(double %x, double %y) {
+ %z = call double @llvm.maxnum.f64(double %x, double %y) readnone
+ ret double %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxl
+; CHECK: calll fmaxl
+define x86_fp80 @test_intrinsic_fmaxl(x86_fp80 %x, x86_fp80 %y) {
+ %z = call x86_fp80 @llvm.maxnum.f80(x86_fp80 %x, x86_fp80 %y) readnone
+ ret x86_fp80 %z
+}
diff --git a/test/CodeGen/X86/fminnum.ll b/test/CodeGen/X86/fminnum.ll
new file mode 100644
index 0000000..1e33cf4
--- /dev/null
+++ b/test/CodeGen/X86/fminnum.ll
@@ -0,0 +1,95 @@
+; RUN: llc -march=x86 -mtriple=i386-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s
+
+declare float @fminf(float, float)
+declare double @fmin(double, double)
+declare x86_fp80 @fminl(x86_fp80, x86_fp80)
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80)
+
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
+declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @test_fminf
+; CHECK: jmp fminf
+define float @test_fminf(float %x, float %y) {
+ %z = call float @fminf(float %x, float %y) readnone
+ ret float %z
+}
+
+; CHECK-LABEL: @test_fmin
+; CHECK: jmp fmin
+define double @test_fmin(double %x, double %y) {
+ %z = call double @fmin(double %x, double %y) readnone
+ ret double %z
+}
+
+; CHECK-LABEL: @test_fminl
+; CHECK: calll fminl
+define x86_fp80 @test_fminl(x86_fp80 %x, x86_fp80 %y) {
+ %z = call x86_fp80 @fminl(x86_fp80 %x, x86_fp80 %y) readnone
+ ret x86_fp80 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fminf
+; CHECK: jmp fminf
+define float @test_intrinsic_fminf(float %x, float %y) {
+ %z = call float @llvm.minnum.f32(float %x, float %y) readnone
+ ret float %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin
+; CHECK: jmp fmin
+define double @test_intrinsic_fmin(double %x, double %y) {
+ %z = call double @llvm.minnum.f64(double %x, double %y) readnone
+ ret double %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fminl
+; CHECK: calll fminl
+define x86_fp80 @test_intrinsic_fminl(x86_fp80 %x, x86_fp80 %y) {
+ %z = call x86_fp80 @llvm.minnum.f80(x86_fp80 %x, x86_fp80 %y) readnone
+ ret x86_fp80 %z
+}
+
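+; There is no vector form of the fmin libcall, so the vector intrinsics below
+; are scalarized into one call per element.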
+; CHECK-LABEL: @test_intrinsic_fmin_v2f32
+; CHECK: calll fminf
+; CHECK: calll fminf
+define <2 x float> @test_intrinsic_fmin_v2f32(<2 x float> %x, <2 x float> %y) {
+ %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %y) readnone
+ ret <2 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v4f32
+; CHECK: calll fminf
+; CHECK: calll fminf
+; CHECK: calll fminf
+; CHECK: calll fminf
+define <4 x float> @test_intrinsic_fmin_v4f32(<4 x float> %x, <4 x float> %y) {
+ %z = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) readnone
+ ret <4 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v2f64
+; CHECK: calll fmin
+; CHECK: calll fmin
+define <2 x double> @test_intrinsic_fmin_v2f64(<2 x double> %x, <2 x double> %y) {
+ %z = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y) readnone
+ ret <2 x double> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v8f64
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+define <8 x double> @test_intrinsic_fmin_v8f64(<8 x double> %x, <8 x double> %y) {
+ %z = call <8 x double> @llvm.minnum.v8f64(<8 x double> %x, <8 x double> %y) readnone
+ ret <8 x double> %z
+}
diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll
new file mode 100644
index 0000000..7036511
--- /dev/null
+++ b/test/CodeGen/X86/fmul-combines.ll
@@ -0,0 +1,147 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -march=x86-64 < %s | FileCheck %s
+
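+; Tests for fmul DAG combines: x * 2.0 -> x + x, merging two constant
+; multiplies into one, multiply by 0.0, and (-x) * (-y) -> x * y.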
+; CHECK-LABEL: fmul2_f32:
+; CHECK: addss %xmm0, %xmm0
+define float @fmul2_f32(float %x) {
+ %y = fmul float %x, 2.0
+ ret float %y
+}
+
+; fmul 2.0, x -> fadd x, x for vectors.
+
+; CHECK-LABEL: fmul2_v4f32:
+; CHECK: addps %xmm0, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul2_v4f32(<4 x float> %x) {
+ %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+ ret <4 x float> %y
+}
+
+; CHECK-LABEL: constant_fold_fmul_v4f32:
+; CHECK: movaps
+; CHECK-NEXT: ret
+define <4 x float> @constant_fold_fmul_v4f32(<4 x float> %x) {
+ %y = fmul <4 x float> <float 4.0, float 4.0, float 4.0, float 4.0>, <float 2.0, float 2.0, float 2.0, float 2.0>
+ ret <4 x float> %y
+}
+
+; CHECK-LABEL: fmul0_v4f32:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul0_v4f32(<4 x float> %x) #0 {
+ %y = fmul <4 x float> %x, <float 0.0, float 0.0, float 0.0, float 0.0>
+ ret <4 x float> %y
+}
+
+; CHECK-LABEL: fmul_c2_c4_v4f32:
+; CHECK-NOT: addps
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_c2_c4_v4f32(<4 x float> %x) #0 {
+ %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+ %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0>
+ ret <4 x float> %z
+}
+
+; CHECK-LABEL: fmul_c3_c4_v4f32:
+; CHECK-NOT: addps
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_c3_c4_v4f32(<4 x float> %x) #0 {
+ %y = fmul <4 x float> %x, <float 3.0, float 3.0, float 3.0, float 3.0>
+ %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0>
+ ret <4 x float> %z
+}
+
+; We should be able to pre-multiply the two constant vectors.
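+; (x * <1,2,3,4>) * <5,6,7,8> --> x * <1*5, 2*6, 3*7, 4*8> = x * <5,12,21,32>,
+; which matches the constants below.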
+; CHECK: float 5.000000e+00
+; CHECK: float 1.200000e+01
+; CHECK: float 2.100000e+01
+; CHECK: float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) #0 {
+ %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+ %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>
+ ret <4 x float> %z
+}
+
+; Same as above, but with the operands reversed, to make sure the
+; non-canonical form is also handled.
+; CHECK: float 5.000000e+00
+; CHECK: float 1.200000e+01
+; CHECK: float 2.100000e+01
+; CHECK: float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_non_canonical:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_non_canonical(<4 x float> %x) #0 {
+ %y = fmul <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
+ %z = fmul <4 x float> <float 5.0, float 6.0, float 7.0, float 8.0>, %y
+ ret <4 x float> %z
+}
+
+; More than one use of a constant multiply should not inhibit the optimization.
+; Instead of a chain of two dependent multiplies, this test should end up with
+; two independent multiplies.
+; CHECK: float 5.000000e+00
+; CHECK: float 1.200000e+01
+; CHECK: float 2.100000e+01
+; CHECK: float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use:
+; CHECK: mulps
+; CHECK: mulps
+; CHECK: addps
+; CHECK: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) #0 {
+ %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+ %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>
+ %a = fadd <4 x float> %y, %z
+ ret <4 x float> %a
+}
+
+; CHECK-LABEL: fmul_c2_c4_f32:
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: mulss
+; CHECK-NEXT: ret
+define float @fmul_c2_c4_f32(float %x) #0 {
+ %y = fmul float %x, 2.0
+ %z = fmul float %y, 4.0
+ ret float %z
+}
+
+; CHECK-LABEL: fmul_c3_c4_f32:
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: mulss
+; CHECK-NEXT: ret
+define float @fmul_c3_c4_f32(float %x) #0 {
+ %y = fmul float %x, 3.0
+ %z = fmul float %y, 4.0
+ ret float %z
+}
+
+; CHECK-LABEL: fmul_fneg_fneg_f32:
+; CHECK: mulss %xmm1, %xmm0
+; CHECK-NEXT: retq
+define float @fmul_fneg_fneg_f32(float %x, float %y) {
+ %x.neg = fsub float -0.0, %x
+ %y.neg = fsub float -0.0, %y
+ %mul = fmul float %x.neg, %y.neg
+ ret float %mul
+}
+; CHECK-LABEL: fmul_fneg_fneg_v4f32:
+; CHECK: mulps {{%xmm1|\(%rdx\)}}, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul_fneg_fneg_v4f32(<4 x float> %x, <4 x float> %y) {
+ %x.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %x
+ %y.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %y
+ %mul = fmul <4 x float> %x.neg, %y.neg
+ ret <4 x float> %mul
+}
+
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/fnabs.ll b/test/CodeGen/X86/fnabs.ll
new file mode 100644
index 0000000..19718d3
--- /dev/null
+++ b/test/CodeGen/X86/fnabs.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx| FileCheck %s
+
+; Verify that we generate a single OR instruction for scalar, vec128, and
+; vec256 FNABS(x) operations, i.e. FNEG(FABS(x)).
+; If the FABS() result isn't used, the AND instruction should be eliminated.
+; PR20578: http://llvm.org/bugs/show_bug.cgi?id=20578
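+; FABS clears the sign bit (AND with ~signmask) and FNEG flips it, so
+; FNEG(FABS(x)) == x | signmask: a single OR with the sign-bit mask.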
+
+define float @scalar_no_abs(float %a) {
+; CHECK-LABEL: scalar_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+ %fabs = tail call float @fabsf(float %a) #1
+ %fsub = fsub float -0.0, %fabs
+ ret float %fsub
+}
+
+define float @scalar_uses_abs(float %a) {
+; CHECK-LABEL: scalar_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulss
+; CHECK-NEXT: retq
+ %fabs = tail call float @fabsf(float %a) #1
+ %fsub = fsub float -0.0, %fabs
+ %fmul = fmul float %fsub, %fabs
+ ret float %fmul
+}
+
+define <4 x float> @vector128_no_abs(<4 x float> %a) {
+; CHECK-LABEL: vector128_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+ %fabs = tail call <4 x float> @llvm.fabs.v4f32(< 4 x float> %a) #1
+ %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+ ret <4 x float> %fsub
+}
+
+define <4 x float> @vector128_uses_abs(<4 x float> %a) {
+; CHECK-LABEL: vector128_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulps
+; CHECK-NEXT: retq
+ %fabs = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #1
+ %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+ %fmul = fmul <4 x float> %fsub, %fabs
+ ret <4 x float> %fmul
+}
+
+define <8 x float> @vector256_no_abs(<8 x float> %a) {
+; CHECK-LABEL: vector256_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+ %fabs = tail call <8 x float> @llvm.fabs.v8f32(< 8 x float> %a) #1
+ %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+ ret <8 x float> %fsub
+}
+
+define <8 x float> @vector256_uses_abs(<8 x float> %a) {
+; CHECK-LABEL: vector256_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulps
+; CHECK-NEXT: retq
+ %fabs = tail call <8 x float> @llvm.fabs.v8f32(<8 x float> %a) #1
+ %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+ %fmul = fmul <8 x float> %fsub, %fabs
+ ret <8 x float> %fmul
+}
+
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+
+declare float @fabsf(float)
+
+attributes #1 = { readnone }
+
diff --git a/test/CodeGen/X86/fold-pcmpeqd-0.ll b/test/CodeGen/X86/fold-pcmpeqd-0.ll
deleted file mode 100644
index 1d315ff..0000000
--- a/test/CodeGen/X86/fold-pcmpeqd-0.ll
+++ /dev/null
@@ -1,117 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=X86-64 %s
-; DISABLED: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan | FileCheck --check-prefix=I386 %s
-
-; i386 test has been disabled when scheduler 2-addr hack is disabled.
-
-; This testcase shouldn't need to spill the -1 value,
-; so it should just use pcmpeqd to materialize an all-ones vector.
-; For i386, cp load of -1 are folded.
-
-; With -regalloc=greedy, the live range is split before spilling, so the first
-; pcmpeq doesn't get folded as a constant pool load.
-
-; I386-NOT: pcmpeqd
-; I386: orps LCPI0_2, %xmm
-; I386-NOT: pcmpeqd
-; I386: orps LCPI0_2, %xmm
-
-; X86-64: pcmpeqd
-; X86-64-NOT: pcmpeqd
-
- %struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8*, i8*, i8*, i32, i32, i32, i32, i32 }>
- %struct._cl_image_format_t = type <{ i32, i32, i32 }>
- %struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }>
-
-define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind {
-entry:
- %tmp3.i = load i32* null ; <i32> [#uses=1]
- %cmp = icmp sgt i32 %tmp3.i, 200 ; <i1> [#uses=1]
- br i1 %cmp, label %forcond, label %ifthen
-
-ifthen: ; preds = %entry
- ret void
-
-forcond: ; preds = %entry
- %tmp3.i536 = load i32* null ; <i32> [#uses=1]
- %cmp12 = icmp slt i32 0, %tmp3.i536 ; <i1> [#uses=1]
- br i1 %cmp12, label %forbody, label %afterfor
-
-forbody: ; preds = %forcond
- %bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1]
- %mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer ; <<4 x float>> [#uses=1]
- %mul257 = fmul <4 x float> %mul233, zeroinitializer ; <<4 x float>> [#uses=1]
- %mul275 = fmul <4 x float> %mul257, zeroinitializer ; <<4 x float>> [#uses=1]
- %tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
- %bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0]
- %bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1]
- %tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> zeroinitializer) nounwind ; <<4 x i32>> [#uses=1]
- %tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind ; <<4 x float>> [#uses=1]
- %sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70 ; <<4 x float>> [#uses=2]
- %mul166.i86 = fmul <4 x float> zeroinitializer, %sub140.i78 ; <<4 x float>> [#uses=1]
- %add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 > ; <<4 x float>> [#uses=1]
- %mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78 ; <<4 x float>> [#uses=1]
- %add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1]
- %bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer ; <<4 x i32>> [#uses=1]
- %bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float> ; <<4 x float>> [#uses=1]
- %mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer ; <<4 x float>> [#uses=1]
- %bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer ; <<4 x i32>> [#uses=1]
- %xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
- %orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102 ; <<4 x i32>> [#uses=1]
- %bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float> ; <<4 x float>> [#uses=1]
- %cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind ; <<4 x float>> [#uses=1]
- %tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
- %sub140.i = fsub <4 x float> zeroinitializer, %tmp80 ; <<4 x float>> [#uses=1]
- %bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 > ; <<4 x i32>> [#uses=0]
- %mul171.i = fmul <4 x float> zeroinitializer, %sub140.i ; <<4 x float>> [#uses=1]
- %add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1]
- %bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer ; <<4 x i32>> [#uses=1]
- %bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float> ; <<4 x float>> [#uses=1]
- %mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer ; <<4 x float>> [#uses=1]
- %bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0]
- %bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer ; <<4 x i32>> [#uses=1]
- %bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32> ; <<4 x i32>> [#uses=1]
- %xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
- %orps203.i = or <4 x i32> %andnps192.i, %xorps.i ; <<4 x i32>> [#uses=1]
- %bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float> ; <<4 x float>> [#uses=1]
- %mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer ; <<4 x float>> [#uses=1]
- %mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer ; <<4 x float>> [#uses=2]
- %mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer ; <<4 x float>> [#uses=1]
- %tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
- %bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer ; <<4 x i32>> [#uses=1]
- %orps.i18 = or <4 x i32> %andnps.i17, zeroinitializer ; <<4 x i32>> [#uses=1]
- %bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float> ; <<4 x float>> [#uses=1]
- %tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
- %bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=2]
- %andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4 ; <<4 x i32>> [#uses=1]
- %bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
- %andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7 ; <<4 x i32>> [#uses=1]
- %orps.i9 = or <4 x i32> %andnps.i8, %andps.i5 ; <<4 x i32>> [#uses=1]
- %bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float> ; <<4 x float>> [#uses=1]
- %bitcast.i = bitcast <4 x float> %mul313 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andps.i = and <4 x i32> %bitcast.i, zeroinitializer ; <<4 x i32>> [#uses=1]
- %orps.i = or <4 x i32> zeroinitializer, %andps.i ; <<4 x i32>> [#uses=1]
- %bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float> ; <<4 x float>> [#uses=1]
- call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
- unreachable
-
-afterfor: ; preds = %forcond
- ret void
-}
-
-declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
-
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll
new file mode 100644
index 0000000..a643d86
--- /dev/null
+++ b/test/CodeGen/X86/fold-tied-op.ll
@@ -0,0 +1,84 @@
+; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s
+; Regression test for http://reviews.llvm.org/D5701
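+; The CHECK lines verify that spills and reloads of the tied operands are
+; folded into the instructions themselves (see the 'Folded Spill' and
+; 'Folded Reload' asm-printer comments).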
+
+; ModuleID = 'xxhash.i'
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386--netbsd"
+
+; CHECK-LABEL: fn1
+; CHECK: shldl {{.*#+}} 4-byte Folded Spill
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: shldl {{.*#+}} 4-byte Folded Spill
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: addl {{.*#+}} 4-byte Folded Reload
+; CHECK: imull {{.*#+}} 4-byte Folded Reload
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: retl
+
+%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
+
+@a = common global i32 0, align 4
+@b = common global i64 0, align 8
+
+; Function Attrs: nounwind uwtable
+define i64 @fn1() #0 {
+entry:
+ %0 = load i32* @a, align 4, !tbaa !1
+ %1 = inttoptr i32 %0 to %struct.XXH_state64_t*
+ %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0
+ %2 = load i32* %total_len, align 4, !tbaa !5
+ %tobool = icmp eq i32 %2, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3
+ %3 = load i64* %v3, align 4, !tbaa !8
+ %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4
+ %4 = load i64* %v4, align 4, !tbaa !9
+ %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2
+ %5 = load i64* %v2, align 4, !tbaa !10
+ %shl = shl i64 %5, 1
+ %or = or i64 %shl, %5
+ %shl2 = shl i64 %3, 2
+ %shr = lshr i64 %3, 1
+ %or3 = or i64 %shl2, %shr
+ %add = add i64 %or, %or3
+ %mul = mul i64 %4, -4417276706812531889
+ %shl4 = mul i64 %4, -8834553413625063778
+ %shr5 = ashr i64 %mul, 3
+ %or6 = or i64 %shr5, %shl4
+ %mul7 = mul nsw i64 %or6, 1400714785074694791
+ %xor = xor i64 %add, %mul7
+ store i64 %xor, i64* @b, align 8, !tbaa !11
+ %mul8 = mul nsw i64 %xor, 1400714785074694791
+ br label %if.end
+
+if.else: ; preds = %entry
+ %6 = load i64* @b, align 8, !tbaa !11
+ %xor10 = xor i64 %6, -4417276706812531889
+ %mul11 = mul nsw i64 %xor10, 400714785074694791
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ]
+ %storemerge = add i64 %storemerge.in, -8796714831421723037
+ store i64 %storemerge, i64* @b, align 8, !tbaa !11
+ ret i64 undef
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6 (trunk 219587)"}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !2, i64 0}
+!6 = metadata !{metadata !"XXH_state64_t", metadata !2, i64 0, metadata !2, i64 4, metadata !7, i64 8, metadata !7, i64 16, metadata !7, i64 24}
+!7 = metadata !{metadata !"long long", metadata !3, i64 0}
+!8 = metadata !{metadata !6, metadata !7, i64 16}
+!9 = metadata !{metadata !6, metadata !7, i64 24}
+!10 = metadata !{metadata !6, metadata !7, i64 8}
+!11 = metadata !{metadata !7, metadata !7, i64 0}
diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll
index a973bef..e6c1e1a 100644
--- a/test/CodeGen/X86/fp-load-trunc.ll
+++ b/test/CodeGen/X86/fp-load-trunc.ll
@@ -2,57 +2,87 @@
; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
define <1 x float> @test1(<1 x double>* %p) nounwind {
-; CHECK: test1
-; CHECK: cvtsd2ss
-; CHECK: ret
-; AVX: test1
-; AVX: vcvtsd2ss
-; AVX: ret
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movsd (%eax), %xmm0
+; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0
+; CHECK-NEXT: movss %xmm0, (%esp)
+; CHECK-NEXT: flds (%esp)
+; CHECK-NEXT: popl %eax
+; CHECK-NEXT: retl
+;
+; AVX-LABEL: test1:
+; AVX: # BB#0:
+; AVX-NEXT: pushl %eax
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmovsd (%eax), %xmm0
+; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovss %xmm0, (%esp)
+; AVX-NEXT: flds (%esp)
+; AVX-NEXT: popl %eax
+; AVX-NEXT: retl
%x = load <1 x double>* %p
%y = fptrunc <1 x double> %x to <1 x float>
ret <1 x float> %y
}
define <2 x float> @test2(<2 x double>* %p) nounwind {
-; CHECK: test2
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: ret
-; AVX: test2
-; AVX: vcvtpd2psx {{[0-9]*}}(%{{.*}})
-; AVX: ret
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: cvtpd2ps (%eax), %xmm0
+; CHECK-NEXT: retl
+;
+; AVX-LABEL: test2:
+; AVX: # BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vcvtpd2psx (%eax), %xmm0
+; AVX-NEXT: retl
%x = load <2 x double>* %p
%y = fptrunc <2 x double> %x to <2 x float>
ret <2 x float> %y
}
define <4 x float> @test3(<4 x double>* %p) nounwind {
-; CHECK: test3
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: movlhps
-; CHECK: ret
-; AVX: test3
-; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
-; AVX: ret
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: cvtpd2ps 16(%eax), %xmm1
+; CHECK-NEXT: cvtpd2ps (%eax), %xmm0
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: retl
+;
+; AVX-LABEL: test3:
+; AVX: # BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vcvtpd2psy (%eax), %xmm0
+; AVX-NEXT: retl
%x = load <4 x double>* %p
%y = fptrunc <4 x double> %x to <4 x float>
ret <4 x float> %y
}
define <8 x float> @test4(<8 x double>* %p) nounwind {
-; CHECK: test4
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: movlhps
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: movlhps
-; CHECK: ret
-; AVX: test4
-; AVX: vcvtpd2psy
-; AVX: vcvtpd2psy
-; AVX: vinsertf128
-; AVX: ret
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: cvtpd2ps 16(%eax), %xmm1
+; CHECK-NEXT: cvtpd2ps (%eax), %xmm0
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: cvtpd2ps 48(%eax), %xmm2
+; CHECK-NEXT: cvtpd2ps 32(%eax), %xmm1
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; CHECK-NEXT: retl
+;
+; AVX-LABEL: test4:
+; AVX: # BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vcvtpd2psy (%eax), %xmm0
+; AVX-NEXT: vcvtpd2psy 32(%eax), %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retl
%x = load <8 x double>* %p
%y = fptrunc <8 x double> %x to <8 x float>
ret <8 x float> %y
diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll
index 25442fc..6424bfc 100644
--- a/test/CodeGen/X86/fp-trunc.ll
+++ b/test/CodeGen/X86/fp-trunc.ll
@@ -2,55 +2,77 @@
; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
define <1 x float> @test1(<1 x double> %x) nounwind {
-; CHECK: test1
-; CHECK: cvtsd2ss
-; CHECK: ret
-; AVX: test1
-; AVX: vcvtsd2ss
-; AVX: ret
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %eax
+; CHECK-NEXT: movsd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0
+; CHECK-NEXT: movss %xmm0, (%esp)
+; CHECK-NEXT: flds (%esp)
+; CHECK-NEXT: popl %eax
+; CHECK-NEXT: retl
+;
+; AVX-LABEL: test1:
+; AVX: # BB#0:
+; AVX-NEXT: pushl %eax
+; AVX-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0
+; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovss %xmm0, (%esp)
+; AVX-NEXT: flds (%esp)
+; AVX-NEXT: popl %eax
+; AVX-NEXT: retl
%y = fptrunc <1 x double> %x to <1 x float>
ret <1 x float> %y
}
define <2 x float> @test2(<2 x double> %x) nounwind {
-; CHECK: test2
-; CHECK: cvtpd2ps
-; CHECK: ret
-; AVX: test2
-; AVX-NOT: vcvtpd2psy
-; AVX: vcvtpd2ps
-; AVX: ret
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT: retl
+;
+; AVX-LABEL: test2:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtpd2ps %xmm0, %xmm0
+; AVX-NEXT: retl
%y = fptrunc <2 x double> %x to <2 x float>
ret <2 x float> %y
}
define <4 x float> @test3(<4 x double> %x) nounwind {
-; CHECK: test3
-; CHECK: cvtpd2ps
-; CHECK: cvtpd2ps
-; CHECK: movlhps
-; CHECK: ret
-; AVX: test3
-; AVX: vcvtpd2psy
-; AVX: ret
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1
+; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: retl
+;
+; AVX-LABEL: test3:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtpd2psy %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
%y = fptrunc <4 x double> %x to <4 x float>
ret <4 x float> %y
}
define <8 x float> @test4(<8 x double> %x) nounwind {
-; CHECK: test4
-; CHECK: cvtpd2ps
-; CHECK: cvtpd2ps
-; CHECK: movlhps
-; CHECK: cvtpd2ps
-; CHECK: cvtpd2ps
-; CHECK: movlhps
-; CHECK: ret
-; AVX: test4
-; AVX: vcvtpd2psy
-; AVX: vcvtpd2psy
-; AVX: vinsertf128
-; AVX: ret
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1
+; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: cvtpd2ps %xmm3, %xmm3
+; CHECK-NEXT: cvtpd2ps %xmm2, %xmm1
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-NEXT: retl
+;
+; AVX-LABEL: test4:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtpd2psy %ymm0, %xmm0
+; AVX-NEXT: vcvtpd2psy %ymm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retl
%y = fptrunc <8 x double> %x to <8 x float>
ret <8 x float> %y
}
diff --git a/test/CodeGen/X86/fpstack-debuginstr-kill.ll b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
new file mode 100644
index 0000000..dfc59a3
--- /dev/null
+++ b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin -no-integrated-as
+
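+; This is a compile-only test: the RUN line has no FileCheck, so it only
+; verifies that the x87 stackifier does not assert when a debug value
+; references an FP-stack value that is live across inline-asm uses.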
+@g1 = global double 0.000000e+00, align 8
+@g2 = global i32 0, align 4
+
+define void @_Z16fpuop_arithmeticjj(i32, i32) {
+entry:
+ switch i32 undef, label %sw.bb.i1921 [
+ ]
+
+sw.bb261: ; No predecessors!
+ unreachable
+
+sw.bb.i1921: ; preds = %entry
+ switch i32 undef, label %if.end511 [
+ i32 1, label %sw.bb27.i
+ ]
+
+sw.bb27.i: ; preds = %sw.bb.i1921
+ %conv.i.i1923 = fpext float undef to x86_fp80
+ br label %if.end511
+
+if.end511: ; preds = %sw.bb27.i, %sw.bb.i1921
+ %src.sroa.0.0.src.sroa.0.0.2280 = phi x86_fp80 [ %conv.i.i1923, %sw.bb27.i ], [ undef, %sw.bb.i1921 ]
+ switch i32 undef, label %sw.bb992 [
+ i32 3, label %sw.bb735
+ i32 18, label %if.end41.i2210
+ ]
+
+sw.bb735: ; preds = %if.end511
+ %2 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280)
+ unreachable
+
+if.end41.i2210: ; preds = %if.end511
+ call void @llvm.dbg.value(metadata !{x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280}, i64 0, metadata !20, metadata !{metadata !"0x102"})
+ unreachable
+
+sw.bb992: ; preds = %if.end511
+ ret void
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!24, !25}
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 886f0101a7d176543b831f5efb74c03427244a55)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !21, metadata !2} ; [ DW_TAG_compile_unit ] [x87stackifier/fpu_ieee.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"fpu_ieee.cpp", metadata !"x87stackifier"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00fpuop_arithmetic\00fpuop_arithmetic\00_Z16fpuop_arithmeticjj\0011\000\001\000\006\00256\001\0013", metadata !5, metadata !6, metadata !7, null, void (i32, i32)* @_Z16fpuop_arithmeticjj, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 11] [def] [scope 13] [fpuop_arithmetic]
+!5 = metadata !{metadata !"f1.cpp", metadata !"x87stackifier"}
+!6 = metadata !{metadata !"0x29", metadata !5} ; [ DW_TAG_file_type ] [x87stackifier/f1.cpp]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9, metadata !9}
+!9 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!10 = metadata !{metadata !11, metadata !12, metadata !13, metadata !18, metadata !20}
+!11 = metadata !{metadata !"0x101\00\0016777227\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 11]
+!12 = metadata !{metadata !"0x101\00\0033554443\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 11]
+!13 = metadata !{metadata !"0x100\00x\0014\000", metadata !4, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [x] [line 14]
+!14 = metadata !{metadata !"0x16\00fpu_extended\003\000\000\000\000", metadata !5, null, metadata !15} ; [ DW_TAG_typedef ] [fpu_extended] [line 3, size 0, align 0, offset 0] [from fpu_register]
+!15 = metadata !{metadata !"0x16\00fpu_register\002\000\000\000\000", metadata !5, null, metadata !16} ; [ DW_TAG_typedef ] [fpu_register] [line 2, size 0, align 0, offset 0] [from uae_f64]
+!16 = metadata !{metadata !"0x16\00uae_f64\001\000\000\000\000", metadata !5, null, metadata !17} ; [ DW_TAG_typedef ] [uae_f64] [line 1, size 0, align 0, offset 0] [from double]
+!17 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!18 = metadata !{metadata !"0x100\00a\0015\000", metadata !4, metadata !6, metadata !19} ; [ DW_TAG_auto_variable ] [a] [line 15]
+!19 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!20 = metadata !{metadata !"0x100\00value\0016\000", metadata !4, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [value] [line 16]
+!21 = metadata !{metadata !22, metadata !23}
+!22 = metadata !{metadata !"0x34\00g1\00g1\00\005\000\001", null, metadata !6, metadata !14, double* @g1, null} ; [ DW_TAG_variable ] [g1] [line 5] [def]
+!23 = metadata !{metadata !"0x34\00g2\00g2\00\006\000\001", null, metadata !6, metadata !19, i32* @g2, null} ; [ DW_TAG_variable ] [g2] [line 6] [def]
+!24 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!25 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/frameaddr.ll b/test/CodeGen/X86/frameaddr.ll
index 6c1ca25..452c8e5 100644
--- a/test/CodeGen/X86/frameaddr.ll
+++ b/test/CodeGen/X86/frameaddr.ll
@@ -2,6 +2,8 @@
; RUN: llc < %s -march=x86 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-32
; RUN: llc < %s -march=x86-64 | FileCheck %s --check-prefix=CHECK-64
; RUN: llc < %s -march=x86-64 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc < %s -mtriple=x86_64-gnux32 | FileCheck %s --check-prefix=CHECK-X32ABI
+; RUN: llc < %s -mtriple=x86_64-gnux32 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-X32ABI
define i8* @test1() nounwind {
entry:
@@ -17,6 +19,12 @@ entry:
; CHECK-64-NEXT: movq %rbp, %rax
; CHECK-64-NEXT: pop
; CHECK-64-NEXT: ret
+; CHECK-X32ABI-LABEL: test1
+; CHECK-X32ABI: pushq %rbp
+; CHECK-X32ABI-NEXT: movl %esp, %ebp
+; CHECK-X32ABI-NEXT: movl %ebp, %eax
+; CHECK-X32ABI-NEXT: popq %rbp
+; CHECK-X32ABI-NEXT: ret
%0 = tail call i8* @llvm.frameaddress(i32 0)
ret i8* %0
}
@@ -37,6 +45,13 @@ entry:
; CHECK-64-NEXT: movq (%rax), %rax
; CHECK-64-NEXT: pop
; CHECK-64-NEXT: ret
+; CHECK-X32ABI-LABEL: test2
+; CHECK-X32ABI: pushq %rbp
+; CHECK-X32ABI-NEXT: movl %esp, %ebp
+; CHECK-X32ABI-NEXT: movl (%ebp), %eax
+; CHECK-X32ABI-NEXT: movl (%eax), %eax
+; CHECK-X32ABI-NEXT: popq %rbp
+; CHECK-X32ABI-NEXT: ret
%0 = tail call i8* @llvm.frameaddress(i32 2)
ret i8* %0
}
diff --git a/test/CodeGen/X86/gcc_except_table_functions.ll b/test/CodeGen/X86/gcc_except_table_functions.ll
new file mode 100644
index 0000000..4a81680
--- /dev/null
+++ b/test/CodeGen/X86/gcc_except_table_functions.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
+
+; This test demonstrates that it is possible to use functions for typeinfo
+; instead of global variables. While __gxx_personality_v0 would never know what
+; to do with them, other EH schemes such as SEH might use them.
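+; llvm.eh.typeid.for returns the small integer selector index the personality
+; assigns to the given typeinfo, which is what the cmpl $2 and cmpl $1 checks
+; below match.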
+
+declare i32 @__gxx_personality_v0(...)
+declare void @filt0()
+declare void @filt1()
+declare void @_Z1fv()
+declare i32 @llvm.eh.typeid.for(i8*)
+
+define i32 @main() uwtable {
+entry:
+ invoke void @_Z1fv()
+ to label %try.cont unwind label %lpad
+
+try.cont:
+ ret i32 0
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (void ()* @filt0 to i8*)
+ catch i8* bitcast (void ()* @filt1 to i8*)
+ %sel = extractvalue { i8*, i32 } %0, 1
+ %id0 = call i32 @llvm.eh.typeid.for(i8* bitcast (void ()* @filt0 to i8*))
+ %is_f0 = icmp eq i32 %sel, %id0
+ br i1 %is_f0, label %try.cont, label %check_f1
+
+check_f1:
+ %id1 = call i32 @llvm.eh.typeid.for(i8* bitcast (void ()* @filt1 to i8*))
+ %is_f1 = icmp eq i32 %sel, %id1
+ br i1 %is_f1, label %try.cont, label %eh.resume
+
+eh.resume:
+ resume { i8*, i32 } %0
+}
+
+; CHECK-LABEL: main:
+; CHECK: .cfi_startproc
+; CHECK: .cfi_personality 3, __gxx_personality_v0
+; CHECK: .cfi_lsda 3, .Lexception0
+; CHECK: .cfi_def_cfa_offset 16
+; CHECK: callq _Z1fv
+; CHECK: retq
+; CHECK: cmpl $2, %edx
+; CHECK: je
+; CHECK: cmpl $1, %edx
+; CHECK: je
+; CHECK: callq _Unwind_Resume
+; CHECK: .cfi_endproc
+; CHECK: GCC_except_table0:
+; CHECK: Lexception0:
diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index c763f39..fa1169d 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll
@@ -53,21 +53,20 @@ define void @F1() {
; _Complex long long const G4 = 34;
-@G4 = unnamed_addr constant {i64,i64} { i64 34, i64 0 }
+@G4 = private unnamed_addr constant {i64,i64} { i64 34, i64 0 }
; DARWIN: .section __TEXT,__literal16,16byte_literals
-; DARWIN: _G4:
+; DARWIN: L_G4:
; DARWIN: .long 34
; DARWIN-STATIC: .section __TEXT,__literal16,16byte_literals
-; DARWIN-STATIC: _G4:
+; DARWIN-STATIC: L_G4:
; DARWIN-STATIC: .long 34
; DARWIN64: .section __TEXT,__literal16,16byte_literals
-; DARWIN64: _G4:
+; DARWIN64: L_G4:
; DARWIN64: .quad 34
-
; int G5 = 47;
@G5 = global i32 47
@@ -194,3 +193,23 @@ define void @F1() {
; WIN32-SECTIONS: L_G14:
; WIN32-SECTIONS: .asciz "foo"
+; This constant cannot be merged on MachO, but it can be on other formats.
+@G15 = unnamed_addr constant i64 0
+
+; LINUX: .section .rodata.cst8,"aM",@progbits,8
+; LINUX: G15:
+
+; DARWIN: .section __TEXT,__const
+; DARWIN: _G15:
+
+; DARWIN-STATIC: .section __TEXT,__const
+; DARWIN-STATIC: _G15:
+
+; DARWIN64: .section __TEXT,__const
+; DARWIN64: _G15:
+
+; LINUX-SECTIONS: .section .rodata.G15,"aM",@progbits,8
+; LINUX-SECTIONS: G15:
+
+; WIN32-SECTIONS: .section .rdata,"rd",one_only,_G15
+; WIN32-SECTIONS: _G15:
diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll
new file mode 100644
index 0000000..1dcf939
--- /dev/null
+++ b/test/CodeGen/X86/half.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C
+
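+; Without F16C, half conversions are lowered to the __gnu_h2f_ieee and
+; __gnu_f2h_ieee libcalls; with +f16c they use vcvtph2ps/vcvtps2ph. Double to
+; half always goes through __truncdfhf2, since truncating via float could
+; round twice.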
+define void @test_load_store(half* %in, half* %out) {
+; CHECK-LABEL: test_load_store:
+; CHECK: movw (%rdi), [[TMP:%[a-z0-9]+]]
+; CHECK: movw [[TMP]], (%rsi)
+ %val = load half* %in
+ store half %val, half* %out
+ ret void
+}
+
+define i16 @test_bitcast_from_half(half* %addr) {
+; CHECK-LABEL: test_bitcast_from_half:
+; CHECK: movzwl (%rdi), %eax
+ %val = load half* %addr
+ %val_int = bitcast half %val to i16
+ ret i16 %val_int
+}
+
+define void @test_bitcast_to_half(half* %addr, i16 %in) {
+; CHECK-LABEL: test_bitcast_to_half:
+; CHECK: movw %si, (%rdi)
+ %val_fp = bitcast i16 %in to half
+ store half %val_fp, half* %addr
+ ret void
+}
+
+define float @test_extend32(half* %addr) {
+; CHECK-LABEL: test_extend32:
+
+; CHECK-LIBCALL: jmp __gnu_h2f_ieee
+; CHECK-F16C: vcvtph2ps
+ %val16 = load half* %addr
+ %val32 = fpext half %val16 to float
+ ret float %val32
+}
+
+define double @test_extend64(half* %addr) {
+; CHECK-LABEL: test_extend64:
+
+; CHECK-LIBCALL: callq __gnu_h2f_ieee
+; CHECK-LIBCALL: cvtss2sd
+; CHECK-F16C: vcvtph2ps
+; CHECK-F16C: vcvtss2sd
+ %val16 = load half* %addr
+ %val32 = fpext half %val16 to double
+ ret double %val32
+}
+
+define void @test_trunc32(float %in, half* %addr) {
+; CHECK-LABEL: test_trunc32:
+
+; CHECK-LIBCALL: callq __gnu_f2h_ieee
+; CHECK-F16C: vcvtps2ph
+ %val16 = fptrunc float %in to half
+ store half %val16, half* %addr
+ ret void
+}
+
+define void @test_trunc64(double %in, half* %addr) {
+; CHECK-LABEL: test_trunc64:
+
+; CHECK-LIBCALL: callq __truncdfhf2
+; CHECK-F16C: callq __truncdfhf2
+ %val16 = fptrunc double %in to half
+ store half %val16, half* %addr
+ ret void
+}
diff --git a/test/CodeGen/X86/i8-umulo.ll b/test/CodeGen/X86/i8-umulo.ll
deleted file mode 100644
index ba846f3..0000000
--- a/test/CodeGen/X86/i8-umulo.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc -mcpu=generic -march=x86 < %s | FileCheck %s
-; PR19858
-
-declare {i8, i1} @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
-define i8 @testumulo(i32 %argc) {
-; CHECK: imulw
-; CHECK: testb %{{.+}}, %{{.+}}
-; CHECK: je [[NOOVERFLOWLABEL:.+]]
-; CHECK: {{.*}}[[NOOVERFLOWLABEL]]:
-; CHECK-NEXT: movb
-; CHECK-NEXT: retl
-top:
- %RHS = trunc i32 %argc to i8
- %umul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 25, i8 %RHS)
- %ex = extractvalue { i8, i1 } %umul, 1
- br i1 %ex, label %overflow, label %nooverlow
-
-overflow:
- ret i8 %RHS
-
-nooverlow:
- %umul.value = extractvalue { i8, i1 } %umul, 0
- ret i8 %umul.value
-}
diff --git a/test/CodeGen/X86/inalloca-regparm.ll b/test/CodeGen/X86/inalloca-regparm.ll
new file mode 100644
index 0000000..9dd916b
--- /dev/null
+++ b/test/CodeGen/X86/inalloca-regparm.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=i686-windows-msvc < %s -o /dev/null
+; RUN: not llc -mtriple=x86_64-windows-msvc %s -o /dev/null 2>&1 | FileCheck %s
+
+; This will compile successfully on x86 but not x86_64, because %b will become a
+; register parameter.
+
+declare x86_thiscallcc i32 @f(i32 %a, i32* inalloca %b)
+define void @g() {
+ %b = alloca inalloca i32
+ store i32 2, i32* %b
+ call x86_thiscallcc i32 @f(i32 0, i32* inalloca %b)
+ ret void
+}
+
+; CHECK: cannot use inalloca attribute on a register parameter
diff --git a/test/CodeGen/X86/inline-asm-fpstack.ll b/test/CodeGen/X86/inline-asm-fpstack.ll
index 91c477b..bb3778a 100644
--- a/test/CodeGen/X86/inline-asm-fpstack.ll
+++ b/test/CodeGen/X86/inline-asm-fpstack.ll
@@ -340,3 +340,65 @@ entry:
%0 = tail call i32 asm "fcomi $2, $1; pushf; pop $0", "=r,{st},{st(1)},~{dirflag},~{fpsr},~{flags}"(double 2.000000e+00, double 2.000000e+00) nounwind
ret i32 %0
}
+
+; <rdar://problem/16952634>
+; X87 stackifier asserted when there was an ST register defined by an
+; inline-asm instruction and the ST register was live across another
+; inline-asm instruction.
+;
+; INLINEASM <es:frndint> [sideeffect] [attdialect], $0:[regdef], %ST0<imp-def,tied5>, $1:[reguse tiedto:$0], %ST0<tied3>, $2:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; INLINEASM <es:fldcw $0> [sideeffect] [mayload] [attdialect], $0:[mem], %EAX<undef>, 1, %noreg, 0, %noreg, $1:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; %FP0<def> = COPY %ST0
+
+; CHECK-LABEL: _test_live_st
+; CHECK: ## InlineAsm Start
+; CHECK: frndint
+; CHECK: ## InlineAsm End
+; CHECK: ## InlineAsm Start
+; CHECK: fldcw
+; CHECK: ## InlineAsm End
+
+%struct.fpu_t = type { [8 x x86_fp80], x86_fp80, %struct.anon1, %struct.anon2, i32, i8, [15 x i8] }
+%struct.anon1 = type { i32, i32, i32 }
+%struct.anon2 = type { i32, i32, i32, i32 }
+
+@fpu = external global %struct.fpu_t, align 16
+
+; Function Attrs: ssp
+define void @test_live_st(i32 %a1) {
+entry:
+ %0 = load x86_fp80* undef, align 16
+ %cond = icmp eq i32 %a1, 1
+ br i1 %cond, label %sw.bb4.i, label %_Z5tointRKe.exit
+
+sw.bb4.i:
+ %1 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %0)
+ call void asm sideeffect "fldcw $0", "*m,~{dirflag},~{fpsr},~{flags}"(i32* undef)
+ br label %_Z5tointRKe.exit
+
+_Z5tointRKe.exit:
+ %result.0.i = phi x86_fp80 [ %1, %sw.bb4.i ], [ %0, %entry ]
+ %conv.i1814 = fptosi x86_fp80 %result.0.i to i32
+ %conv626 = sitofp i32 %conv.i1814 to x86_fp80
+ store x86_fp80 %conv626, x86_fp80* getelementptr inbounds (%struct.fpu_t* @fpu, i32 0, i32 1)
+ br label %return
+
+return:
+ ret void
+}
+
+; Check that x87 stackifier is correctly rewriting FP registers to ST registers.
+;
+; CHECK-LABEL: _test_operand_rewrite
+; CHECK: ## InlineAsm Start
+; CHECK: foo %st(0), %st(1)
+; CHECK: ## InlineAsm End
+
+define double @test_operand_rewrite() {
+entry:
+ %0 = tail call { double, double } asm sideeffect "foo $0, $1", "={st},={st(1)},~{dirflag},~{fpsr},~{flags}"()
+ %asmresult = extractvalue { double, double } %0, 0
+ %asmresult1 = extractvalue { double, double } %0, 1
+ %sub = fsub double %asmresult, %asmresult1
+ ret double %sub
+}
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index d417453..dfa8aed 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -284,7 +284,7 @@ entry:
define i32 @func_test1(i32 %p1) nounwind uwtable {
entry:
; CHECK-LABEL: func_test1:
-; CHECK: testb
+; CHECK: andb
; CHECK: j
; CHECK: ret
%0 = load i32* @b, align 4
diff --git a/test/CodeGen/X86/jump_table_alias.ll b/test/CodeGen/X86/jump_table_alias.ll
index f3691fd..2062200 100644
--- a/test/CodeGen/X86/jump_table_alias.ll
+++ b/test/CodeGen/X86/jump_table_alias.ll
@@ -5,7 +5,7 @@ entry:
ret i32 0
}
-@i = alias internal i32 ()* @f
+@i = internal alias i32 ()* @f
@j = alias i32 ()* @f
define i32 @main(i32 %argc, i8** %argv) {
@@ -25,7 +25,6 @@ define i32 @main(i32 %argc, i8** %argv) {
; There should only be one table, even though there are two GlobalAliases,
; because they both alias the same value.
-; CHECK: .globl __llvm_jump_instr_table_0_1
; CHECK: .align 8, 0x90
; CHECK: .type __llvm_jump_instr_table_0_1,@function
; CHECK: __llvm_jump_instr_table_0_1:
diff --git a/test/CodeGen/X86/jump_table_align.ll b/test/CodeGen/X86/jump_table_align.ll
new file mode 100644
index 0000000..6ad48d1
--- /dev/null
+++ b/test/CodeGen/X86/jump_table_align.ll
@@ -0,0 +1,29 @@
+; RUN: llc -filetype=obj <%s -jump-table-type=single -o %t1
+; RUN: llvm-objdump -triple=x86_64-unknown-linux-gnu -d %t1 | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+define i32 @f() unnamed_addr jumptable {
+ ret i32 0
+}
+
+define i32 @g(i8* %a) unnamed_addr jumptable {
+ ret i32 0
+}
+
+define void @h(void ()* %func) unnamed_addr jumptable {
+ ret void
+}
+
+define i32 @main() {
+ %g = alloca i32 (...)*, align 8
+ store i32 (...)* bitcast (i32 ()* @f to i32 (...)*), i32 (...)** %g, align 8
+ %1 = load i32 (...)** %g, align 8
+ %call = call i32 (...)* %1()
+ call void (void ()*)* @h(void ()* bitcast (void (void ()*)* @h to void ()*))
+ %a = call i32 (i32*)* bitcast (i32 (i8*)* @g to i32(i32*)*)(i32* null)
+ ret i32 %a
+}
+
+; Make sure that the padding from getJumpInstrTableEntryBound is right.
+; CHECK: __llvm_jump_instr_table_0_1:
+; CHECK-NEXT: e9 00 00 00 00 jmp 0
+; CHECK-NEXT: 0f 1f 00 nopl (%rax)
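+; (The 5-byte jmp is padded with a 3-byte nop so each table entry fills the
+; full 8-byte entry bound.)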
diff --git a/test/CodeGen/X86/jump_table_bitcast.ll b/test/CodeGen/X86/jump_table_bitcast.ll
index 33a798f..749b77a 100644
--- a/test/CodeGen/X86/jump_table_bitcast.ll
+++ b/test/CodeGen/X86/jump_table_bitcast.ll
@@ -15,12 +15,12 @@ define void @h(void ()* %func) unnamed_addr jumptable {
define i32 @main() {
%g = alloca i32 (...)*, align 8
store i32 (...)* bitcast (i32 ()* @f to i32 (...)*), i32 (...)** %g, align 8
-; CHECK: movq $__llvm_jump_instr_table_0_[[ENTRY:1|2|3]], (%rsp)
-; CHECK: movl $__llvm_jump_instr_table_0_[[ENTRY]], %ecx
+; CHECK: movq $__llvm_jump_instr_table_0_[[ENTRY:1|2|3]],
+; CHECK: movl $__llvm_jump_instr_table_0_[[ENTRY]],
%1 = load i32 (...)** %g, align 8
%call = call i32 (...)* %1()
call void (void ()*)* @h(void ()* bitcast (void (void ()*)* @h to void ()*))
-; CHECK: movl $__llvm_jump_instr_table_0_{{1|2|3}}, %edi
+; CHECK: movl $__llvm_jump_instr_table_0_{{1|2|3}},
; CHECK: callq h
%a = call i32 (i32*)* bitcast (i32 (i8*)* @g to i32(i32*)*)(i32* null)
@@ -28,17 +28,14 @@ define i32 @main() {
ret i32 %a
}
-; CHECK: .globl __llvm_jump_instr_table_0_1
; CHECK: .align 8, 0x90
; CHECK: .type __llvm_jump_instr_table_0_1,@function
; CHECK: __llvm_jump_instr_table_0_1:
; CHECK: jmp {{f|g|h}}@PLT
-; CHECK: .globl __llvm_jump_instr_table_0_2
; CHECK: .align 8, 0x90
; CHECK: .type __llvm_jump_instr_table_0_2,@function
; CHECK: __llvm_jump_instr_table_0_2:
; CHECK: jmp {{f|g|h}}@PLT
-; CHECK: .globl __llvm_jump_instr_table_0_3
; CHECK: .align 8, 0x90
; CHECK: .type __llvm_jump_instr_table_0_3,@function
; CHECK: __llvm_jump_instr_table_0_3:
diff --git a/test/CodeGen/X86/jump_tables.ll b/test/CodeGen/X86/jump_tables.ll
index 5a0aed0..485154e 100644
--- a/test/CodeGen/X86/jump_tables.ll
+++ b/test/CodeGen/X86/jump_tables.ll
@@ -7,6 +7,20 @@ target triple = "x86_64-unknown-linux-gnu"
%struct.fun_struct = type { i32 (...)* }
+@a = global [12 x i32 () *] [ i32 ()* bitcast (void ()* @indirect_fun to i32 ()*),
+ i32 ()* bitcast (void ()* @indirect_fun_match to i32 ()*),
+ i32 ()* bitcast (i32 ()* @indirect_fun_i32 to i32 ()*),
+ i32 ()* bitcast (i32 (i32)* @indirect_fun_i32_1 to i32 ()*),
+ i32 ()* bitcast (i32 (i32, i32)* @indirect_fun_i32_2 to i32 ()*),
+ i32 ()* bitcast (i32* (i32*, i32)* @indirect_fun_i32S_2 to i32 ()*),
+ i32 ()* bitcast (void (%struct.fun_struct)* @indirect_fun_struct to i32 ()*),
+ i32 ()* bitcast (void (i32 (...)*, i32)* @indirect_fun_fun to i32 ()*),
+ i32 ()* bitcast (i32 (i32 (...)*, i32)* @indirect_fun_fun_ret to i32 ()*),
+ i32 ()* bitcast (void ([19 x i8])* @indirect_fun_array to i32 ()*),
+ i32 ()* bitcast (void (<3 x i32>)* @indirect_fun_vec to i32 ()*),
+ i32 ()* bitcast (void (<4 x float>)* @indirect_fun_vec_2 to i32 ()*)
+ ]
+
define void @indirect_fun() unnamed_addr jumptable {
ret void
}
@@ -74,62 +88,50 @@ define i32 @main(i32 %argc, i8** %argv) {
ret i32 %a
}
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_1
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_1,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_1:
; SINGLE-DAG: jmp indirect_fun_array@PLT
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_2
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_2,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_2:
; SINGLE-DAG: jmp indirect_fun_i32_2@PLT
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_3
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_3,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_3:
; SINGLE-DAG: jmp indirect_fun_vec_2@PLT
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_4
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_4,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_4:
; SINGLE-DAG: jmp indirect_fun_i32S_2@PLT
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_5
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_5,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_5:
; SINGLE-DAG: jmp indirect_fun_struct@PLT
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_6
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_6,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_6:
; SINGLE-DAG: jmp indirect_fun_i32_1@PLT
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_7
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_7,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_7:
; SINGLE-DAG: jmp indirect_fun_i32@PLT
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_8
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_8,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_8:
; SINGLE-DAG: jmp indirect_fun_fun@PLT
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_9
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_9,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_9:
; SINGLE-DAG: jmp indirect_fun_fun_ret@PLT
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_10
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_10,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_10:
; SINGLE-DAG: jmp indirect_fun@PLT
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_11
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_11,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_11:
; SINGLE-DAG: jmp indirect_fun_match@PLT
-; SINGLE-DAG: .globl __llvm_jump_instr_table_0_12
; SINGLE-DAG: .align 8, 0x90
; SINGLE-DAG: .type __llvm_jump_instr_table_0_12,@function
; SINGLE-DAG: __llvm_jump_instr_table_0_12:
@@ -144,82 +146,69 @@ define i32 @main(i32 %argc, i8** %argv) {
; SINGLE-DAG: ud2
-; ARITY-DAG: .globl __llvm_jump_instr_table_2_1
; ARITY-DAG: .align 8, 0x90
; ARITY-DAG: .type __llvm_jump_instr_table_2_1,@function
; ARITY-DAG: __llvm_jump_instr_table_2_1:
; ARITY-DAG: jmp indirect_fun{{.*}}@PLT
; ARITY-DAG: .align 8, 0x90
; ARITY-DAG: ud2
-; ARITY-DAG: .globl __llvm_jump_instr_table_0_1
; ARITY-DAG: .align 8, 0x90
; ARITY-DAG: .type __llvm_jump_instr_table_0_1,@function
; ARITY-DAG: __llvm_jump_instr_table_0_1:
; ARITY-DAG: jmp indirect_fun{{.*}}@PLT
-; ARITY-DAG: .globl __llvm_jump_instr_table_1_1
; ARITY-DAG: .align 8, 0x90
; ARITY-DAG: .type __llvm_jump_instr_table_1_1,@function
; ARITY-DAG: __llvm_jump_instr_table_1_1:
; ARITY-DAG: jmp indirect_fun{{.*}}@PLT
-; SIMPL-DAG: .globl __llvm_jump_instr_table_2_1
; SIMPL-DAG: .align 8, 0x90
; SIMPL-DAG: .type __llvm_jump_instr_table_2_1,@function
; SIMPL-DAG: __llvm_jump_instr_table_2_1:
; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT
; SIMPL-DAG: .align 8, 0x90
; SIMPL-DAG: ud2
-; SIMPL-DAG: .globl __llvm_jump_instr_table_0_1
; SIMPL-DAG: .align 8, 0x90
; SIMPL-DAG: .type __llvm_jump_instr_table_0_1,@function
; SIMPL-DAG: __llvm_jump_instr_table_0_1:
; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT
-; SIMPL-DAG: .globl __llvm_jump_instr_table_1_1
; SIMPL-DAG: .align 8, 0x90
; SIMPL-DAG: .type __llvm_jump_instr_table_1_1,@function
; SIMPL-DAG: __llvm_jump_instr_table_1_1:
; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT
-; SIMPL-DAG: .globl __llvm_jump_instr_table_3_1
; SIMPL-DAG: .align 8, 0x90
; SIMPL-DAG: .type __llvm_jump_instr_table_3_1,@function
; SIMPL-DAG: __llvm_jump_instr_table_3_1:
; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT
-; SIMPL-DAG: .globl __llvm_jump_instr_table_4_1
; SIMPL-DAG: .align 8, 0x90
; SIMPL-DAG: .type __llvm_jump_instr_table_4_1,@function
; SIMPL-DAG: __llvm_jump_instr_table_4_1:
; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT
-; FULL-DAG: .globl __llvm_jump_instr_table_10_1
; FULL-DAG: .align 8, 0x90
; FULL-DAG: .type __llvm_jump_instr_table_10_1,@function
; FULL-DAG:__llvm_jump_instr_table_10_1:
; FULL-DAG: jmp indirect_fun_i32_1@PLT
; FULL-DAG: .align 8, 0x90
; FULL-DAG: ud2
-; FULL-DAG: .globl __llvm_jump_instr_table_9_1
; FULL-DAG: .align 8, 0x90
; FULL-DAG: .type __llvm_jump_instr_table_9_1,@function
; FULL-DAG:__llvm_jump_instr_table_9_1:
; FULL-DAG: jmp indirect_fun_i32_2@PLT
; FULL-DAG: .align 8, 0x90
; FULL-DAG: ud2
-; FULL-DAG: .globl __llvm_jump_instr_table_7_1
; FULL-DAG: .align 8, 0x90
; FULL-DAG: .type __llvm_jump_instr_table_7_1,@function
; FULL-DAG:__llvm_jump_instr_table_7_1:
; FULL-DAG: jmp indirect_fun_i32S_2@PLT
; FULL-DAG: .align 8, 0x90
; FULL-DAG: ud2
-; FULL-DAG: .globl __llvm_jump_instr_table_3_1
; FULL-DAG: .align 8, 0x90
; FULL-DAG: .type __llvm_jump_instr_table_3_1,@function
; FULL-DAG:__llvm_jump_instr_table_3_1:
; FULL-DAG: jmp indirect_fun_vec_2@PLT
; FULL-DAG: .align 8, 0x90
; FULL-DAG: ud2
-; FULL-DAG: .globl __llvm_jump_instr_table_2_1
; FULL-DAG: .align 8, 0x90
; FULL-DAG: .type __llvm_jump_instr_table_2_1,@function
; FULL-DAG:__llvm_jump_instr_table_2_1:
@@ -228,42 +217,36 @@ define i32 @main(i32 %argc, i8** %argv) {
; FULL-DAG: ud2
; FULL-DAG: .align 8, 0x90
; FULL-DAG: ud2
-; FULL-DAG: .globl __llvm_jump_instr_table_8_1
; FULL-DAG: .align 8, 0x90
; FULL-DAG: .type __llvm_jump_instr_table_8_1,@function
; FULL-DAG:__llvm_jump_instr_table_8_1:
; FULL-DAG: jmp indirect_fun_i32@PLT
; FULL-DAG: .align 8, 0x90
; FULL-DAG: ud2
-; FULL-DAG: .globl __llvm_jump_instr_table_1_1
; FULL-DAG: .align 8, 0x90
; FULL-DAG: .type __llvm_jump_instr_table_1_1,@function
; FULL-DAG:__llvm_jump_instr_table_1_1:
; FULL-DAG: jmp indirect_fun_array@PLT
; FULL-DAG: .align 8, 0x90
; FULL-DAG: ud2
-; FULL-DAG: .globl __llvm_jump_instr_table_0_1
; FULL-DAG: .align 8, 0x90
; FULL-DAG: .type __llvm_jump_instr_table_0_1,@function
; FULL-DAG:__llvm_jump_instr_table_0_1:
; FULL-DAG: jmp indirect_fun_vec@PLT
; FULL-DAG: .align 8, 0x90
; FULL-DAG: ud2
-; FULL-DAG: .globl __llvm_jump_instr_table_6_1
; FULL-DAG: .align 8, 0x90
; FULL-DAG: .type __llvm_jump_instr_table_6_1,@function
; FULL-DAG:__llvm_jump_instr_table_6_1:
; FULL-DAG: jmp indirect_fun_struct@PLT
; FULL-DAG: .align 8, 0x90
; FULL-DAG: ud2
-; FULL-DAG: .globl __llvm_jump_instr_table_5_1
; FULL-DAG: .align 8, 0x90
; FULL-DAG: .type __llvm_jump_instr_table_5_1,@function
; FULL-DAG:__llvm_jump_instr_table_5_1:
; FULL-DAG: jmp indirect_fun_fun@PLT
; FULL-DAG: .align 8, 0x90
; FULL-DAG: ud2
-; FULL-DAG: .globl __llvm_jump_instr_table_4_1
; FULL-DAG: .align 8, 0x90
; FULL-DAG: .type __llvm_jump_instr_table_4_1,@function
; FULL-DAG:__llvm_jump_instr_table_4_1:
diff --git a/test/CodeGen/X86/lea-2.ll b/test/CodeGen/X86/lea-2.ll
index 82cefb7..6fb3879 100644
--- a/test/CodeGen/X86/lea-2.ll
+++ b/test/CodeGen/X86/lea-2.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=i686-linux -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl -x86-asm-syntax=intel | FileCheck %s
define i32 @test1(i32 %A, i32 %B) {
%tmp1 = shl i32 %A, 2
diff --git a/test/CodeGen/X86/lea-3.ll b/test/CodeGen/X86/lea-3.ll
index c439ee1..a56403a 100644
--- a/test/CodeGen/X86/lea-3.ll
+++ b/test/CodeGen/X86/lea-3.ll
@@ -1,4 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
; CHECK: leaq (,[[A0:%rdi|%rcx]],4), %rax
diff --git a/test/CodeGen/X86/lea-4.ll b/test/CodeGen/X86/lea-4.ll
index cef4726..00c2278 100644
--- a/test/CodeGen/X86/lea-4.ll
+++ b/test/CodeGen/X86/lea-4.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
+
define zeroext i16 @t1(i32 %on_off) nounwind {
entry:
diff --git a/test/CodeGen/X86/lea-5.ll b/test/CodeGen/X86/lea-5.ll
new file mode 100644
index 0000000..50d3aaf
--- /dev/null
+++ b/test/CodeGen/X86/lea-5.ll
@@ -0,0 +1,59 @@
+; Test for more complicated forms of lea operands, which can be generated
+; in loop-optimized cases.
+; See also http://llvm.org/bugs/show_bug.cgi?id=20016
+
+; RUN: llc < %s -mtriple=x86_64-linux -O2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -O2 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-nacl -O2 | FileCheck %s -check-prefix=X32
+
+; Function Attrs: nounwind readnone uwtable
+define void @foo(i32 %x, i32 %d) #0 {
+entry:
+ %a = alloca [8 x i32], align 16
+ br label %while.cond
+
+while.cond: ; preds = %while.cond, %entry
+ %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ]
+ %arrayidx = getelementptr inbounds [8 x i32]* %a, i32 0, i32 %d.addr.0
+
+; CHECK: leaq -40(%rsp,%r{{[^,]*}},4), %rax
+; X32: leal -40(%rsp,%r{{[^,]*}},4), %eax
+ %0 = load i32* %arrayidx, align 4
+ %cmp1 = icmp eq i32 %0, 0
+ %inc = add nsw i32 %d.addr.0, 1
+
+; CHECK: leaq 4(%r{{[^,]*}}), %r{{[^,]*}}
+; X32: leal 4(%r{{[^,]*}}), %e{{[^,]*}}
+ br i1 %cmp1, label %while.end, label %while.cond
+
+while.end: ; preds = %while.cond
+ ret void
+}
+
+; The same test as above, but with enforced stack realignment (%a aligned by 64)
+; to check one more case of correct lea generation.
+
+; Function Attrs: nounwind readnone uwtable
+define void @bar(i32 %x, i32 %d) #0 {
+entry:
+ %a = alloca [8 x i32], align 64
+ br label %while.cond
+
+while.cond: ; preds = %while.cond, %entry
+ %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ]
+ %arrayidx = getelementptr inbounds [8 x i32]* %a, i32 0, i32 %d.addr.0
+
+; CHECK: leaq (%rsp,%r{{[^,]*}},4), %rax
+; X32: leal (%rsp,%r{{[^,]*}},4), %eax
+ %0 = load i32* %arrayidx, align 4
+ %cmp1 = icmp eq i32 %0, 0
+ %inc = add nsw i32 %d.addr.0, 1
+
+; CHECK: leaq 4(%r{{[^,]*}}), %r{{[^,]*}}
+; X32: leal 4(%r{{[^,]*}}), %e{{[^,]*}}
+ br i1 %cmp1, label %while.end, label %while.cond
+
+while.end: ; preds = %while.cond
+ ret void
+}
+
diff --git a/test/CodeGen/X86/lea.ll b/test/CodeGen/X86/lea.ll
index 93cfe46..9b6632c 100644
--- a/test/CodeGen/X86/lea.ll
+++ b/test/CodeGen/X86/lea.ll
@@ -1,5 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
define i32 @test1(i32 %x) nounwind {
%tmp1 = shl i32 %x, 3
diff --git a/test/CodeGen/X86/long-extend.ll b/test/CodeGen/X86/long-extend.ll
deleted file mode 100644
index 5bbd41d..0000000
--- a/test/CodeGen/X86/long-extend.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc < %s -mcpu=core-avx-i -mtriple=x86_64-linux -asm-verbose=0| FileCheck %s
-define void @test_long_extend(<16 x i8> %a, <16 x i32>* %p) nounwind {
-; CHECK-LABEL: test_long_extend
-; CHECK: vpunpcklbw %xmm1, %xmm0, [[REG1:%xmm[0-9]+]]
-; CHECK: vpunpckhwd %xmm1, [[REG1]], [[REG2:%xmm[0-9]+]]
-; CHECK: vpunpcklwd %xmm1, [[REG1]], %x[[REG3:mm[0-9]+]]
-; CHECK: vinsertf128 $1, [[REG2]], %y[[REG3]], [[REG_result0:%ymm[0-9]+]]
-; CHECK: vpunpckhbw %xmm1, %xmm0, [[REG4:%xmm[0-9]+]]
-; CHECK: vpunpckhwd %xmm1, [[REG4]], [[REG5:%xmm[0-9]+]]
-; CHECK: vpunpcklwd %xmm1, [[REG4]], %x[[REG6:mm[0-9]+]]
-; CHECK: vinsertf128 $1, [[REG5]], %y[[REG6]], [[REG_result1:%ymm[0-9]+]]
-; CHECK: vmovaps [[REG_result1]], 32(%rdi)
-; CHECK: vmovaps [[REG_result0]], (%rdi)
-
- %tmp = zext <16 x i8> %a to <16 x i32>
- store <16 x i32> %tmp, <16 x i32>*%p
- ret void
-}
diff --git a/test/CodeGen/X86/loop-strength-reduce8.ll b/test/CodeGen/X86/loop-strength-reduce8.ll
index 1d04276..c36047c 100644
--- a/test/CodeGen/X86/loop-strength-reduce8.ll
+++ b/test/CodeGen/X86/loop-strength-reduce8.ll
@@ -1,6 +1,9 @@
; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
-; CHECK: leal 16(%eax), %edx
+; FIXME: The first two instructions, movl and addl, should have been combined to
+; "leal 16(%eax), %edx" by the backend (PR20776).
+; CHECK: movl %eax, %edx
+; CHECK: addl $16, %edx
; CHECK: align
; CHECK: addl $4, %edx
; CHECK: decl %ecx
diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll
index f47161e..edb8433e 100644
--- a/test/CodeGen/X86/lower-bitcast.ll
+++ b/test/CodeGen/X86/lower-bitcast.ll
@@ -68,13 +68,13 @@ define i64 @test4(i64 %A) {
%2 = bitcast <2 x i32> %add to i64
ret i64 %2
}
-; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd.
+; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd.
; Ideally, we should fold that sequence into a single paddd. This is fixed with
; the widening legalization.
;
; CHECK-LABEL: test4
; CHECK: pshufd
-; CHECK-NEXT: paddq
+; CHECK-NEXT: paddd
; CHECK-NEXT: pshufd
; CHECK: ret
;
diff --git a/test/CodeGen/X86/mem-intrin-base-reg.ll b/test/CodeGen/X86/mem-intrin-base-reg.ll
new file mode 100644
index 0000000..dd7f396
--- /dev/null
+++ b/test/CodeGen/X86/mem-intrin-base-reg.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=i686-windows -mattr=+sse2 < %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+; There is a conflict between lowering the X86 memory intrinsics and the "base"
+; register used to address stack locals. See X86RegisterInfo::hasBasePointer
+; for when this is necessary. Typically, we choose ESI for the base register,
+; which all of the X86 string instructions use.
+
+; The pattern of vector icmp and extractelement is used in these tests because
+; it forces creation of an aligned stack temporary. Perhaps such temporaries
+; shouldn't be aligned.
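+;
+; Conceptually (a sketch of the lowering, not checked output), the collision is:
+;
+;   movl %esp, %esi   ; %esi set up as the base register after realignment
+;   ...
+;   rep;movsl         ; but rep;movs implicitly reads its source through %esi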
+
+declare void @escape_vla_and_icmp(i8*, i1 zeroext)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1)
+
+define i32 @memcpy_novla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) {
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false)
+ br i1 %cond, label %spill_vectors, label %no_vectors
+
+no_vectors:
+ ret i32 0
+
+spill_vectors:
+ %vp1 = getelementptr <4 x i32>* %vp0, i32 1
+ %v0 = load <4 x i32>* %vp0
+ %v1 = load <4 x i32>* %vp1
+ %vicmp = icmp slt <4 x i32> %v0, %v1
+ %icmp = extractelement <4 x i1> %vicmp, i32 0
+ call void @escape_vla_and_icmp(i8* null, i1 zeroext %icmp)
+ %r = extractelement <4 x i32> %v0, i32 0
+ ret i32 %r
+}
+
+; CHECK-LABEL: _memcpy_novla_vector:
+; CHECK: andl $-16, %esp
+; CHECK-DAG: movl $32, %ecx
+; CHECK-DAG: movl {{.*}}, %esi
+; CHECK-DAG: movl {{.*}}, %edi
+; CHECK: rep;movsl
+
+define i32 @memcpy_vla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) {
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false)
+ br i1 %cond, label %spill_vectors, label %no_vectors
+
+no_vectors:
+ ret i32 0
+
+spill_vectors:
+ %vp1 = getelementptr <4 x i32>* %vp0, i32 1
+ %v0 = load <4 x i32>* %vp0
+ %v1 = load <4 x i32>* %vp1
+ %vicmp = icmp slt <4 x i32> %v0, %v1
+ %icmp = extractelement <4 x i1> %vicmp, i32 0
+ %vla = alloca i8, i32 %n
+ call void @escape_vla_and_icmp(i8* %vla, i1 zeroext %icmp)
+ %r = extractelement <4 x i32> %v0, i32 0
+ ret i32 %r
+}
+
+; CHECK-LABEL: _memcpy_vla_vector:
+; CHECK: andl $-16, %esp
+; CHECK: movl %esp, %esi
+; CHECK: movl $128, {{.*}}(%esp)
+; CHECK: calll _memcpy
+; CHECK: calll __chkstk
+
+; stosd doesn't clobber esi, so we can use it.
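+; (rep;stos writes through %edi and implicitly uses %eax and %ecx, but never
+; reads %esi, so the base register survives.)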
+
+define i32 @memset_vla_vector(<4 x i32>* %vp0, i8* %a, i32 %n, i1 zeroext %cond) {
+ call void @llvm.memset.p0i8.i32(i8* %a, i8 42, i32 128, i32 4, i1 false)
+ br i1 %cond, label %spill_vectors, label %no_vectors
+
+no_vectors:
+ ret i32 0
+
+spill_vectors:
+ %vp1 = getelementptr <4 x i32>* %vp0, i32 1
+ %v0 = load <4 x i32>* %vp0
+ %v1 = load <4 x i32>* %vp1
+ %vicmp = icmp slt <4 x i32> %v0, %v1
+ %icmp = extractelement <4 x i1> %vicmp, i32 0
+ %vla = alloca i8, i32 %n
+ call void @escape_vla_and_icmp(i8* %vla, i1 zeroext %icmp)
+ %r = extractelement <4 x i32> %v0, i32 0
+ ret i32 %r
+}
+
+; CHECK-LABEL: _memset_vla_vector:
+; CHECK: andl $-16, %esp
+; CHECK: movl %esp, %esi
+; CHECK-DAG: movl $707406378, %eax # imm = 0x2A2A2A2A
+; CHECK-DAG: movl $32, %ecx
+; CHECK-DAG: movl {{.*}}, %edi
+; CHECK-NOT: movl {{.*}}, %esi
+; CHECK: rep;stosl
+
+; Add a test for memcmp if we ever add a special lowering for it.
diff --git a/test/CodeGen/X86/mem-promote-integers.ll b/test/CodeGen/X86/mem-promote-integers.ll
index 0015df0..ea38b95 100644
--- a/test/CodeGen/X86/mem-promote-integers.ll
+++ b/test/CodeGen/X86/mem-promote-integers.ll
@@ -1,8 +1,8 @@
; Test the basic functionality of integer element promotions of different types.
; This tests checks passing of arguments, loading and storing to memory and
; basic arithmetic.
-; RUN: llc -march=x86 < %s
-; RUN: llc -march=x86-64 < %s
+; RUN: llc -march=x86 < %s > /dev/null
+; RUN: llc -march=x86-64 < %s > /dev/null
define <1 x i8> @test_1xi8(<1 x i8> %x, <1 x i8>* %b) {
%bb = load <1 x i8>* %b
diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll
index 3ea6512..5454b7c 100644
--- a/test/CodeGen/X86/misched-matmul.ll
+++ b/test/CodeGen/X86/misched-matmul.ll
@@ -10,7 +10,7 @@
; more complex cases.
;
; CHECK: @wrap_mul4
-; CHECK: 22 regalloc - Number of spills inserted
+; CHECK: 23 regalloc - Number of spills inserted
define void @wrap_mul4(double* nocapture %Out, [4 x double]* nocapture %A, [4 x double]* nocapture %B) #0 {
entry:
diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll
index 71b0723..96c5dbb 100644
--- a/test/CodeGen/X86/movgs.ll
+++ b/test/CodeGen/X86/movgs.ll
@@ -3,40 +3,58 @@
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X64
define i32 @test1() nounwind readonly {
+; X32-LABEL: test1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movl %gs:196, %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movq %gs:320, %rax
+; X64-NEXT: movl (%rax), %eax
+; X64-NEXT: retq
entry:
%tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31) ; <i32*> [#uses=1]
%tmp1 = load i32* %tmp ; <i32> [#uses=1]
ret i32 %tmp1
}
-; X32-LABEL: test1:
-; X32: movl %gs:196, %eax
-; X32: movl (%eax), %eax
-; X32: ret
-
-; X64-LABEL: test1:
-; X64: movq %gs:320, %rax
-; X64: movl (%rax), %eax
-; X64: ret
define i64 @test2(void (i8*)* addrspace(256)* %tmp8) nounwind {
+; X32-LABEL: test2:
+; X32: # BB#0: # %entry
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: calll *%gs:(%eax)
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: addl $12, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # BB#0: # %entry
+; X64-NEXT: {{(subq.*%rsp|pushq)}}
+; X64-NEXT: callq *%gs:(%{{(rcx|rdi)}})
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: {{(addq.*%rsp|popq)}}
+; X64-NEXT: retq
entry:
%tmp9 = load void (i8*)* addrspace(256)* %tmp8, align 8
tail call void %tmp9(i8* undef) nounwind optsize
ret i64 0
}
-; rdar://8453210
-; X32-LABEL: test2:
-; X32: movl {{.*}}(%esp), %eax
-; X32: calll *%gs:(%eax)
-
-; X64-LABEL: test2:
-; X64: callq *%gs:([[A0:%rdi|%rcx]])
-
-
-
-
define <2 x i64> @pmovsxwd_1(i64 addrspace(256)* %p) nounwind readonly {
+; X32-LABEL: pmovsxwd_1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pmovsxwd %gs:(%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: pmovsxwd_1:
+; X64: # BB#0: # %entry
+; X64-NEXT: pmovsxwd %gs:(%{{(rcx|rdi)}}), %xmm0
+; X64-NEXT: retq
entry:
%0 = load i64 addrspace(256)* %p
%tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0
@@ -44,20 +62,26 @@ entry:
%2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
-
-; X32-LABEL: pmovsxwd_1:
-; X32: movl 4(%esp), %eax
-; X32: pmovsxwd %gs:(%eax), %xmm0
-; X32: ret
-
-; X64-LABEL: pmovsxwd_1:
-; X64: pmovsxwd %gs:([[A0]]), %xmm0
-; X64: ret
}
; The two loads here both look identical to selection DAG, except for their
; address spaces. Make sure they aren't CSE'd.
define i32 @test_no_cse() nounwind readonly {
+; X32-LABEL: test_no_cse:
+; X32: # BB#0: # %entry
+; X32-NEXT: movl %gs:196, %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: movl %fs:196, %ecx
+; X32-NEXT: addl (%ecx), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_no_cse:
+; X64: # BB#0: # %entry
+; X64-NEXT: movq %gs:320, %rax
+; X64-NEXT: movl (%rax), %eax
+; X64-NEXT: movq %fs:320, %rcx
+; X64-NEXT: addl (%rcx), %eax
+; X64-NEXT: retq
entry:
%tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31) ; <i32*> [#uses=1]
%tmp1 = load i32* %tmp ; <i32> [#uses=1]
@@ -66,9 +90,5 @@ entry:
%tmp4 = add i32 %tmp1, %tmp3
ret i32 %tmp4
}
-; X32-LABEL: test_no_cse:
-; X32: movl %gs:196
-; X32: movl %fs:196
-; X32: ret
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
index 6910515..f0bdbba 100644
--- a/test/CodeGen/X86/ms-inline-asm.ll
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -110,7 +110,7 @@ define i32 @t31() {
entry:
%val = alloca i32, align 64
store i32 -1, i32* %val, align 64
- call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val) #1
+ call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val)
%sp = load i32* %val, align 64
ret i32 %sp
; CHECK-LABEL: t31:
@@ -125,3 +125,12 @@ entry:
; CHECK: movl (%esp), %eax
; CHECK: ret
}
+
+declare hidden void @other_func()
+
+define void @naked() #0 {
+ call void asm sideeffect inteldialect "call dword ptr $0", "*m,~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{esp},~{ebp},~{dirflag},~{fpsr},~{flags}"(void()* @other_func)
+ unreachable
+}
+
+attributes #0 = { naked }
diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll
new file mode 100644
index 0000000..1e99c14
--- /dev/null
+++ b/test/CodeGen/X86/musttail-varargs.ll
@@ -0,0 +1,119 @@
+; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX
+; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS
+
+; Test that we actually spill and reload all arguments in the variadic argument
+; pack. Doing a normal call will clobber all argument registers, and we will
+; spill around it. A simple adjustment should not require any XMM spills.
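+;
+; (On SysV x86-64, %al carries the number of XMM registers used by a varargs
+; call, so it must be preserved alongside the six GPR and eight XMM argument
+; registers; hence the movb of %al below.)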
+
+declare void(i8*, ...)* @get_f(i8* %this)
+
+define void @f_thunk(i8* %this, ...) {
+ %fptr = call void(i8*, ...)*(i8*)* @get_f(i8* %this)
+ musttail call void (i8*, ...)* %fptr(i8* %this, ...)
+ ret void
+}
+
+; Save and restore 6 GPRs, 8 XMMs, and AL around the call.
+
+; LINUX-LABEL: f_thunk:
+; LINUX-DAG: movq %rdi, {{.*}}
+; LINUX-DAG: movq %rsi, {{.*}}
+; LINUX-DAG: movq %rdx, {{.*}}
+; LINUX-DAG: movq %rcx, {{.*}}
+; LINUX-DAG: movq %r8, {{.*}}
+; LINUX-DAG: movq %r9, {{.*}}
+; LINUX-DAG: movb %al, {{.*}}
+; LINUX-DAG: movaps %xmm0, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm1, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm2, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm3, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm4, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm5, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm6, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm7, {{[0-9]*}}(%rsp)
+; LINUX: callq get_f
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm0
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm1
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm2
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm3
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm4
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm5
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm6
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm7
+; LINUX-DAG: movq {{.*}}, %rdi
+; LINUX-DAG: movq {{.*}}, %rsi
+; LINUX-DAG: movq {{.*}}, %rdx
+; LINUX-DAG: movq {{.*}}, %rcx
+; LINUX-DAG: movq {{.*}}, %r8
+; LINUX-DAG: movq {{.*}}, %r9
+; LINUX-DAG: movb {{.*}}, %al
+; LINUX: jmpq *{{.*}} # TAILCALL
+
+; WINDOWS-LABEL: f_thunk:
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS-DAG: movq %rdx, {{.*}}
+; WINDOWS-DAG: movq %rcx, {{.*}}
+; WINDOWS-DAG: movq %r8, {{.*}}
+; WINDOWS-DAG: movq %r9, {{.*}}
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS: callq get_f
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS-DAG: movq {{.*}}, %rdx
+; WINDOWS-DAG: movq {{.*}}, %rcx
+; WINDOWS-DAG: movq {{.*}}, %r8
+; WINDOWS-DAG: movq {{.*}}, %r9
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS: jmpq *{{.*}} # TAILCALL
+
+; This thunk shouldn't require any spills and reloads, assuming the register
+; allocator knows what it's doing.
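+;
+; (Every argument register is forwarded unchanged to the musttail call, so an
+; ideal allocation leaves them all in place and only the indirect jump through
+; the first argument register remains.)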
+
+define void @g_thunk(i8* %fptr_i8, ...) {
+ %fptr = bitcast i8* %fptr_i8 to void (i8*, ...)*
+ musttail call void (i8*, ...)* %fptr(i8* %fptr_i8, ...)
+ ret void
+}
+
+; LINUX-LABEL: g_thunk:
+; LINUX-NOT: movq
+; LINUX: jmpq *%rdi # TAILCALL
+
+; WINDOWS-LABEL: g_thunk:
+; WINDOWS-NOT: movq
+; WINDOWS: jmpq *%rcx # TAILCALL
+
+; Do a simple multi-exit multi-bb test.
+
+%struct.Foo = type { i1, i8*, i8* }
+
+@g = external global i32
+
+define void @h_thunk(%struct.Foo* %this, ...) {
+ %cond_p = getelementptr %struct.Foo* %this, i32 0, i32 0
+ %cond = load i1* %cond_p
+ br i1 %cond, label %then, label %else
+
+then:
+ %a_p = getelementptr %struct.Foo* %this, i32 0, i32 1
+ %a_i8 = load i8** %a_p
+ %a = bitcast i8* %a_i8 to void (%struct.Foo*, ...)*
+ musttail call void (%struct.Foo*, ...)* %a(%struct.Foo* %this, ...)
+ ret void
+
+else:
+ %b_p = getelementptr %struct.Foo* %this, i32 0, i32 2
+ %b_i8 = load i8** %b_p
+ %b = bitcast i8* %b_i8 to void (%struct.Foo*, ...)*
+ store i32 42, i32* @g
+ musttail call void (%struct.Foo*, ...)* %b(%struct.Foo* %this, ...)
+ ret void
+}
+
+; LINUX-LABEL: h_thunk:
+; LINUX: jne
+; LINUX: jmpq *{{.*}} # TAILCALL
+; LINUX: jmpq *{{.*}} # TAILCALL
+; WINDOWS-LABEL: h_thunk:
+; WINDOWS: jne
+; WINDOWS: jmpq *{{.*}} # TAILCALL
+; WINDOWS: jmpq *{{.*}} # TAILCALL
diff --git a/test/CodeGen/X86/nancvt.ll b/test/CodeGen/X86/nancvt.ll
index 8036710..8a665fa 100644
--- a/test/CodeGen/X86/nancvt.ll
+++ b/test/CodeGen/X86/nancvt.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | llc > %t
+; RUN: opt < %s -O3 | llc > %t
; RUN: grep 2147027116 %t | count 3
; RUN: grep 2147228864 %t | count 3
; RUN: grep 2146502828 %t | count 3
diff --git a/test/CodeGen/X86/narrow-shl-load.ll b/test/CodeGen/X86/narrow-shl-load.ll
index 30387925..5175bfc 100644
--- a/test/CodeGen/X86/narrow-shl-load.ll
+++ b/test/CodeGen/X86/narrow-shl-load.ll
@@ -30,40 +30,6 @@ while.end: ; preds = %while.cond
ret void
}
-
-; DAGCombiner shouldn't fold the sdiv (ashr) away.
-; rdar://8636812
-; CHECK-LABEL: test2:
-; CHECK: sarl
-
-define i32 @test2() nounwind {
-entry:
- %i = alloca i32, align 4
- %j = alloca i8, align 1
- store i32 127, i32* %i, align 4
- store i8 0, i8* %j, align 1
- %tmp3 = load i32* %i, align 4
- %mul = mul nsw i32 %tmp3, 2
- %conv4 = trunc i32 %mul to i8
- %conv5 = sext i8 %conv4 to i32
- %div6 = sdiv i32 %conv5, 2
- %conv7 = trunc i32 %div6 to i8
- %conv9 = sext i8 %conv7 to i32
- %cmp = icmp eq i32 %conv9, -1
- br i1 %cmp, label %if.then, label %if.end
-
-if.then: ; preds = %entry
- ret i32 0
-
-if.end: ; preds = %entry
- call void @abort() noreturn
- unreachable
-}
-
-declare void @abort() noreturn
-
-declare void @exit(i32) noreturn
-
; DAG Combiner can't fold this into a load of the 1'th byte.
; PR8757
define i32 @test3(i32 *%P) nounwind ssp {
diff --git a/test/CodeGen/X86/nonconst-static-ev.ll b/test/CodeGen/X86/nonconst-static-ev.ll
index f852cae..5449791 100644
--- a/test/CodeGen/X86/nonconst-static-ev.ll
+++ b/test/CodeGen/X86/nonconst-static-ev.ll
@@ -1,6 +1,5 @@
; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t
; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
-; REQUIRES: shell
@0 = global i8 extractvalue ([1 x i8] select (i1 ptrtoint (i32* @1 to i1), [1 x i8] [ i8 1 ], [1 x i8] [ i8 2 ]), 0)
@1 = external global i32
diff --git a/test/CodeGen/X86/nonconst-static-iv.ll b/test/CodeGen/X86/nonconst-static-iv.ll
index 8fad39b..30613ef 100644
--- a/test/CodeGen/X86/nonconst-static-iv.ll
+++ b/test/CodeGen/X86/nonconst-static-iv.ll
@@ -1,6 +1,5 @@
; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t
; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
-; REQUIRES: shell
@0 = global i8 insertvalue( { i8 } select (i1 ptrtoint (i32* @1 to i1), { i8 } { i8 1 }, { i8 } { i8 2 }), i8 0, 0)
@1 = external global i32
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
new file mode 100644
index 0000000..9d0cb9a
--- /dev/null
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+
+
+; Make sure that we generate non-temporal stores for the test cases below.
+
+define void @test1(<4 x float>* %dst) {
+; CHECK-LABEL: test1:
+; SSE: movntps
+; AVX: vmovntps
+ store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
+ ret void
+}
+
+define void @test2(<4 x i32>* %dst) {
+; CHECK-LABEL: test2:
+; SSE: movntps
+; AVX: vmovntps
+ store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
+ ret void
+}
+
+define void @test3(<2 x double>* %dst) {
+; CHECK-LABEL: test3:
+; SSE: movntps
+; AVX: vmovntps
+ store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
+ ret void
+}
+
+!1 = metadata !{i32 1}
diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll
index fa77fcb..b559729 100644
--- a/test/CodeGen/X86/null-streamer.ll
+++ b/test/CodeGen/X86/null-streamer.ll
@@ -14,16 +14,16 @@ define void @f1() {
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !13}
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !" ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !""}
+!0 = metadata !{metadata !"0x11\004\00 \001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ]
!1 = metadata !{metadata !"", metadata !""}
!2 = metadata !{}
!3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"", metadata !"", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* null, null, null, metadata !2, i32 2}
-!5 = metadata !{i32 786473, metadata !1}
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null}
+!4 = metadata !{metadata !"0x2e\00\00\00\002\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, i32 ()* null, null, null, metadata !2} ; [ DW_TAG_subprogram ]
+!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ]
!7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!8 = metadata !{metadata !"0x24\00\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
!9 = metadata !{metadata !10}
-!10 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"_ZL1i", metadata !5, i32 1, metadata !8, i32 1, i32 1, null, null}
+!10 = metadata !{metadata !"0x34\00i\00i\00_ZL1i\001\001\001", null, metadata !5, metadata !8, null, null} ; [ DW_TAG_variable ]
!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/object-size.ll b/test/CodeGen/X86/object-size.ll
index ec35d29..0610f0b 100644
--- a/test/CodeGen/X86/object-size.ll
+++ b/test/CodeGen/X86/object-size.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 < %s -march=x86-64 | FileCheck %s -check-prefix=X64
+; RUN: llc -O0 < %s -march=x86-64 | FileCheck %s
; ModuleID = 'ts.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -12,8 +12,8 @@ entry:
%tmp = load i8** @p ; <i8*> [#uses=1]
%0 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp, i1 0) ; <i64> [#uses=1]
%cmp = icmp ne i64 %0, -1 ; <i1> [#uses=1]
-; X64: movabsq $-1, [[RAX:%r..]]
-; X64: cmpq $-1, [[RAX]]
+; CHECK: movq $-1, [[RAX:%r..]]
+; CHECK: cmpq $-1, [[RAX]]
br i1 %cmp, label %cond.true, label %cond.false
cond.true: ; preds = %entry
diff --git a/test/CodeGen/X86/osx-private-labels.ll b/test/CodeGen/X86/osx-private-labels.ll
index 349ce7d..e30cb48 100644
--- a/test/CodeGen/X86/osx-private-labels.ll
+++ b/test/CodeGen/X86/osx-private-labels.ll
@@ -69,3 +69,20 @@
; CHECK: .section __DATA,__foobar,interposing
; CHECK-NEXT: .align 3
; CHECK-NEXT: L_private12:
+
+@private13 = private global i32 42, section "__DATA, __objc_classlist, regular, no_dead_strip"
+; CHECK: .section __DATA,__objc_classlist,regular,no_dead_strip
+; CHECK-NEXT: .align 2
+; CHECK-NEXT: L_private13:
+
+@private14 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_classname,cstring_literals"
+; CHECK: .section __TEXT,__objc_classname,cstring_literals
+; CHECK-NEXT: L_private14:
+
+@private15 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_methname,cstring_literals"
+; CHECK: .section __TEXT,__objc_methname,cstring_literals
+; CHECK-NEXT: L_private15:
+
+@private16 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_methtype,cstring_literals"
+; CHECK: .section __TEXT,__objc_methtype,cstring_literals
+; CHECK-NEXT: L_private16:
diff --git a/test/CodeGen/X86/palignr.ll b/test/CodeGen/X86/palignr.ll
index ec6564d..3efcc2e 100644
--- a/test/CodeGen/X86/palignr.ll
+++ b/test/CodeGen/X86/palignr.ll
@@ -3,58 +3,127 @@
define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: test1:
-; CHECK: pshufd
-; CHECK-YONAH: pshufd
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
+; CHECK-NEXT: retl
+;
+; CHECK-YONAH-LABEL: test1:
+; CHECK-YONAH: # BB#0:
+; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
+; CHECK-YONAH-NEXT: retl
%C = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> < i32 1, i32 2, i32 3, i32 0 >
ret <4 x i32> %C
}
define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: test2:
-; CHECK: palignr
-; CHECK-YONAH: shufps
+; CHECK: # BB#0:
+; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+; CHECK-YONAH-LABEL: test2:
+; CHECK-YONAH: # BB#0:
+; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; CHECK-YONAH-NEXT: retl
%C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 3, i32 4 >
ret <4 x i32> %C
}
define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: test3:
-; CHECK: palignr
+; CHECK: # BB#0:
+; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+; CHECK-YONAH-LABEL: test3:
+; CHECK-YONAH: # BB#0:
+; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; CHECK-YONAH-NEXT: retl
%C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 undef, i32 4 >
ret <4 x i32> %C
}
define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: test4:
-; CHECK: palignr
+; CHECK: # BB#0:
+; CHECK-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retl
+;
+; CHECK-YONAH-LABEL: test4:
+; CHECK-YONAH: # BB#0:
+; CHECK-YONAH-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; CHECK-YONAH-NEXT: movapd %xmm1, %xmm0
+; CHECK-YONAH-NEXT: retl
%C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 >
ret <4 x i32> %C
}
define <4 x float> @test5(<4 x float> %A, <4 x float> %B) nounwind {
; CHECK-LABEL: test5:
-; CHECK: palignr
+; CHECK: # BB#0:
+; CHECK-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; CHECK-NEXT: movapd %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+; CHECK-YONAH-LABEL: test5:
+; CHECK-YONAH: # BB#0:
+; CHECK-YONAH-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; CHECK-YONAH-NEXT: movapd %xmm1, %xmm0
+; CHECK-YONAH-NEXT: retl
%C = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 >
ret <4 x float> %C
}
define <8 x i16> @test6(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: test6:
-; CHECK: palignr
+; CHECK: # BB#0:
+; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+; CHECK-YONAH-LABEL: test6:
+; CHECK-YONAH: # BB#0:
+; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
+; CHECK-YONAH-NEXT: por %xmm1, %xmm0
+; CHECK-YONAH-NEXT: retl
%C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 3, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10 >
ret <8 x i16> %C
}
define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: test7:
-; CHECK: palignr
+; CHECK: # BB#0:
+; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+; CHECK-YONAH-LABEL: test7:
+; CHECK-YONAH: # BB#0:
+; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
+; CHECK-YONAH-NEXT: por %xmm1, %xmm0
+; CHECK-YONAH-NEXT: retl
%C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 undef, i32 6, i32 undef, i32 8, i32 9, i32 10, i32 11, i32 12 >
ret <8 x i16> %C
}
define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind {
; CHECK-LABEL: test8:
-; CHECK: palignr
+; CHECK: # BB#0:
+; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+; CHECK-YONAH-LABEL: test8:
+; CHECK-YONAH: # BB#0:
+; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
+; CHECK-YONAH-NEXT: por %xmm1, %xmm0
+; CHECK-YONAH-NEXT: retl
%C = shufflevector <16 x i8> %A, <16 x i8> %B, <16 x i32> < i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20 >
ret <16 x i8> %C
}
@@ -65,8 +134,19 @@ define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind {
; was an UNDEF.)
define <8 x i16> @test9(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: test9:
-; CHECK-NOT: palignr
-; CHECK: pshufb
+; CHECK: # BB#0:
+; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+; CHECK-YONAH-LABEL: test9:
+; CHECK-YONAH: # BB#0:
+; CHECK-YONAH-NEXT: movdqa %xmm1, %xmm0
+; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; CHECK-YONAH-NEXT: por %xmm0, %xmm1
+; CHECK-YONAH-NEXT: movdqa %xmm1, %xmm0
+; CHECK-YONAH-NEXT: retl
%C = shufflevector <8 x i16> %B, <8 x i16> %A, <8 x i32> < i32 undef, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0 >
ret <8 x i16> %C
}
diff --git a/test/CodeGen/X86/patchpoint-invoke.ll b/test/CodeGen/X86/patchpoint-invoke.ll
new file mode 100644
index 0000000..192cacc
--- /dev/null
+++ b/test/CodeGen/X86/patchpoint-invoke.ll
@@ -0,0 +1,63 @@
+; RUN: llc -mtriple=x86_64-unknown-linux -mcpu=corei7 < %s | FileCheck %s
+
+; Test invoking of patchpoints
+;
+define i64 @patchpoint_invoke(i64 %p1, i64 %p2) {
+entry:
+; CHECK-LABEL: patchpoint_invoke:
+; CHECK-NEXT: .cfi_startproc
+; CHECK: [[FUNC_BEGIN:.L.*]]:
+; CHECK: .cfi_lsda 3, [[EXCEPTION_LABEL:.L[^ ]*]]
+; CHECK: pushq %rbp
+
+; Unfortunately, we have to hardcode the name of the label that begins the patchpoint:
+; CHECK: .Ltmp0:
+; CHECK: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; CHECK-NEXT: xchgw %ax, %ax
+; CHECK-NEXT: [[PP_END:.L.*]]:
+; CHECK: ret
+ %resolveCall = inttoptr i64 -559038736 to i8*
+ %result = invoke i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall, i32 1, i64 %p1, i64 %p2)
+ to label %success unwind label %threw
+
+success:
+ ret i64 %result
+
+threw:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ ret i64 0
+}
+
+; Verify that the exception table was emitted:
+; CHECK: [[EXCEPTION_LABEL]]:
+; CHECK-NEXT: .byte 255
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 21
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 13
+; Verify that the unwind data covers the entire patchpoint region:
+; CHECK-NEXT: .long .Ltmp0-[[FUNC_BEGIN]]
+; CHECK-NEXT: .long [[PP_END]]-.Ltmp0
+
+
+; Verify that the stackmap section got emitted:
+; CHECK-LABEL: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 1
+; Num LargeConstants
+; CHECK-NEXT: .long 0
+; Num Callsites
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .quad patchpoint_invoke
+
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+declare i32 @__gxx_personality_v0(...)
diff --git a/test/CodeGen/X86/patchpoint-webkit_jscc.ll b/test/CodeGen/X86/patchpoint-webkit_jscc.ll
new file mode 100644
index 0000000..5e76bf8
--- /dev/null
+++ b/test/CodeGen/X86/patchpoint-webkit_jscc.ll
@@ -0,0 +1,88 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST
+
+; Test the webkit_jscc calling convention.
+; One argument will be passed in a register, the other will be pushed on the stack.
+; The return value is in %rax.
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen:
+; CHECK: Ltmp
+; CHECK: movq %r{{.+}}, (%rsp)
+; CHECK: movq %r{{.+}}, %rax
+; CHECK: Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; CHECK: movq %rax, (%rsp)
+; CHECK: callq
+; FAST-LABEL: jscall_patchpoint_codegen:
+; FAST: Ltmp
+; FAST: movq %r{{.+}}, (%rsp)
+; FAST: movq %r{{.+}}, %rax
+; FAST: Ltmp
+; FAST-NEXT: movabsq $-559038736, %r11
+; FAST-NEXT: callq *%r11
+; FAST: movq %rax, (%rsp)
+; FAST: callq
+ %resolveCall2 = inttoptr i64 -559038736 to i8*
+ %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+ %resolveCall3 = inttoptr i64 -559038737 to i8*
+ tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+ ret void
+}
+
+; Test that the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen2(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen2:
+; CHECK: Ltmp
+; CHECK: movq $6, 24(%rsp)
+; CHECK-NEXT: movl $4, 16(%rsp)
+; CHECK-NEXT: movq $2, (%rsp)
+; CHECK: Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; FAST-LABEL: jscall_patchpoint_codegen2:
+; FAST: Ltmp
+; FAST: movq $2, (%rsp)
+; FAST-NEXT: movl $4, 16(%rsp)
+; FAST-NEXT: movq $6, 24(%rsp)
+; FAST: Ltmp
+; FAST-NEXT: movabsq $-559038736, %r11
+; FAST-NEXT: callq *%r11
+ %call = inttoptr i64 -559038736 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+ ret i64 %result
+}
+
+; Test that the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen3(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen3:
+; CHECK: Ltmp
+; CHECK: movq $10, 48(%rsp)
+; CHECK-NEXT: movl $8, 36(%rsp)
+; CHECK-NEXT: movq $6, 24(%rsp)
+; CHECK-NEXT: movl $4, 16(%rsp)
+; CHECK-NEXT: movq $2, (%rsp)
+; CHECK: Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; FAST-LABEL: jscall_patchpoint_codegen3:
+; FAST: Ltmp
+; FAST: movq $2, (%rsp)
+; FAST-NEXT: movl $4, 16(%rsp)
+; FAST-NEXT: movq $6, 24(%rsp)
+; FAST-NEXT: movl $8, 36(%rsp)
+; FAST-NEXT: movq $10, 48(%rsp)
+; FAST: Ltmp
+; FAST-NEXT: movabsq $-559038736, %r11
+; FAST-NEXT: callq *%r11
+ %call = inttoptr i64 -559038736 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+ ret i64 %result
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+
diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll
index 62b1273..07148f0 100644
--- a/test/CodeGen/X86/patchpoint.ll
+++ b/test/CodeGen/X86/patchpoint.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort < %s | FileCheck %s
; Trivial patchpoint codegen
;
@@ -38,61 +39,6 @@ entry:
ret void
}
-; Test the webkit_jscc calling convention.
-; One argument will be passed in register, the other will be pushed on the stack.
-; Return value in $rax.
-define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen:
-; CHECK: Ltmp
-; CHECK: movq %r{{.+}}, (%rsp)
-; CHECK: movq %r{{.+}}, %rax
-; CHECK: Ltmp
-; CHECK-NEXT: movabsq $-559038736, %r11
-; CHECK-NEXT: callq *%r11
-; CHECK: movq %rax, (%rsp)
-; CHECK: callq
- %resolveCall2 = inttoptr i64 -559038736 to i8*
- %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
- %resolveCall3 = inttoptr i64 -559038737 to i8*
- tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
- ret void
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen2(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen2:
-; CHECK: Ltmp
-; CHECK: movq $6, 24(%rsp)
-; CHECK-NEXT: movl $4, 16(%rsp)
-; CHECK-NEXT: movq $2, (%rsp)
-; CHECK: Ltmp
-; CHECK-NEXT: movabsq $-559038736, %r11
-; CHECK-NEXT: callq *%r11
- %call = inttoptr i64 -559038736 to i8*
- %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
- ret i64 %result
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen3(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen3:
-; CHECK: Ltmp
-; CHECK: movq $10, 48(%rsp)
-; CHECK-NEXT: movl $8, 36(%rsp)
-; CHECK-NEXT: movq $6, 24(%rsp)
-; CHECK-NEXT: movl $4, 16(%rsp)
-; CHECK-NEXT: movq $2, (%rsp)
-; CHECK: Ltmp
-; CHECK-NEXT: movabsq $-559038736, %r11
-; CHECK-NEXT: callq *%r11
- %call = inttoptr i64 -559038736 to i8*
- %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
- ret i64 %result
-}
-
; Test patchpoints reusing the same TargetConstant.
; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
; There is no way to verify this, since it depends on memory allocation.
@@ -125,6 +71,17 @@ entry:
ret void
}
+; Test large target address.
+define i64 @large_target_address_patchpoint_codegen() {
+entry:
+; CHECK-LABEL: large_target_address_patchpoint_codegen:
+; CHECK: movabsq $6153737369414576827, %r11
+; CHECK-NEXT: callq *%r11
+ %resolveCall2 = inttoptr i64 6153737369414576827 to i8*
+ %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 0)
+ ret i64 %result
+}
+
declare void @llvm.experimental.stackmap(i64, i32, ...)
declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/peep-vector-extract-concat.ll b/test/CodeGen/X86/peep-vector-extract-concat.ll
deleted file mode 100644
index f73ebb9..0000000
--- a/test/CodeGen/X86/peep-vector-extract-concat.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2,-sse4.1 | FileCheck %s
-; CHECK: pshufd $3, %xmm0, %xmm0
-
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2,-sse4.1 | FileCheck %s -check-prefix=WIN64
-; %a is passed indirectly on Win64.
-; WIN64: movss 12(%rcx), %xmm0
-
-define float @foo(<8 x float> %a) nounwind {
- %c = extractelement <8 x float> %a, i32 3
- ret float %c
-}
diff --git a/test/CodeGen/X86/peep-vector-extract-insert.ll b/test/CodeGen/X86/peep-vector-extract-insert.ll
deleted file mode 100644
index f958b6b..0000000
--- a/test/CodeGen/X86/peep-vector-extract-insert.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -march=x86-64 | grep "xorps %xmm0, %xmm0" | count 2
-
-define float @foo(<4 x float> %a) {
- %b = insertelement <4 x float> %a, float 0.0, i32 3
- %c = extractelement <4 x float> %b, i32 3
- ret float %c
-}
-define float @bar(float %a) {
- %b = insertelement <4 x float> <float 0x400B333340000000, float 4.5, float 0.0, float 0x4022666660000000>, float %a, i32 3
- %c = extractelement <4 x float> %b, i32 2
- ret float %c
-}
diff --git a/test/CodeGen/X86/peephole-fold-movsd.ll b/test/CodeGen/X86/peephole-fold-movsd.ll
new file mode 100644
index 0000000..09d9328
--- /dev/null
+++ b/test/CodeGen/X86/peephole-fold-movsd.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+;
+; Check that x86's peephole optimization doesn't fold a 64-bit load (movsd) into
+; addpd.
+; rdar://problem/18236850
+
+%struct.S1 = type { double, double }
+
+@g = common global %struct.S1 zeroinitializer, align 8
+
+declare void @foo3(%struct.S1*)
+
+; CHECK: movsd {{[0-9]*}}(%rsp), [[R0:%xmm[0-9]+]]
+; CHECK: addpd [[R0]], %xmm{{[0-9]+}}
+
+define void @foo1(double %a.coerce0, double %a.coerce1, double %b.coerce0, double %b.coerce1) {
+ %1 = alloca <2 x double>, align 16
+ %tmpcast = bitcast <2 x double>* %1 to %struct.S1*
+ call void @foo3(%struct.S1* %tmpcast) #2
+ %p2 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 0
+ %2 = load double* %p2, align 16
+ %p3 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 1
+ %3 = load double* %p3, align 8
+ %4 = insertelement <2 x double> undef, double %2, i32 0
+ %5 = insertelement <2 x double> %4, double 0.000000e+00, i32 1
+ %6 = insertelement <2 x double> undef, double %3, i32 1
+ %7 = insertelement <2 x double> %6, double 1.000000e+00, i32 0
+ %8 = fadd <2 x double> %5, %7
+ store <2 x double> %8, <2 x double>* bitcast (%struct.S1* @g to <2 x double>*), align 16
+ ret void
+}
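The fold being tested against would be unsafe here: each movsd loads only 8 bytes, and the other lane of each vector is filled explicitly (0.0 and 1.0) by the insertelements. If the peephole folded the movsd into addpd's memory operand, the addpd would read a full 16 bytes from the stack slot and add whatever sits next to the scalar in memory instead of the inserted constant lane.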
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 7bf8a61..8937d6a 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,32 +1,96 @@
-; RUN: llc < %s -march=x86 -mattr=sse4.1 -mcpu=nehalem -stack-alignment=16 > %t
-; RUN: grep pmul %t | count 12
-; RUN: grep mov %t | count 14
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
define <4 x i32> @a(<4 x i32> %i) nounwind {
- %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
- ret <4 x i32> %A
+; SSE2-LABEL: a:
+; SSE2: movdqa {{.*}}, %[[X1:xmm[0-9]+]]
+; SSE2-NEXT: pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %[[X1]], %xmm0
+; SSE2-NEXT: pmuludq %[[X1]], %[[X2]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: a:
+; SSE41: pmulld
+; SSE41-NEXT: retq
+entry:
+ %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
+ ret <4 x i32> %A
}
+
define <2 x i64> @b(<2 x i64> %i) nounwind {
- %A = mul <2 x i64> %i, < i64 117, i64 117 >
- ret <2 x i64> %A
+; ALL-LABEL: b:
+; ALL: pmuludq
+; ALL: pmuludq
+; ALL: pmuludq
+entry:
+ %A = mul <2 x i64> %i, < i64 117, i64 117 >
+ ret <2 x i64> %A
}
+
define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind {
- %A = mul <4 x i32> %i, %j
- ret <4 x i32> %A
+; SSE2-LABEL: c:
+; SSE2: pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %[[X2]], %xmm1
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: c:
+; SSE41: pmulld
+; SSE41-NEXT: retq
+entry:
+ %A = mul <4 x i32> %i, %j
+ ret <4 x i32> %A
}
+
define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind {
- %A = mul <2 x i64> %i, %j
- ret <2 x i64> %A
+; ALL-LABEL: d:
+; ALL: pmuludq
+; ALL: pmuludq
+; ALL: pmuludq
+entry:
+ %A = mul <2 x i64> %i, %j
+ ret <2 x i64> %A
}
-; Use a call to force spills.
+
declare void @foo()
+
define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind {
- call void @foo()
- %A = mul <4 x i32> %i, %j
- ret <4 x i32> %A
+; SSE2-LABEL: e:
+; SSE2: movdqa {{[0-9]*}}(%rsp), %xmm0
+; SSE2-NEXT: pshufd {{.*}} # [[X1:xmm[0-9]+]] = xmm0[1,1,3,3]
+; SSE2-NEXT: movdqa {{[0-9]*}}(%rsp), %[[X2:xmm[0-9]+]]
+; SSE2-NEXT: pmuludq %[[X2]], %xmm0
+; SSE2-NEXT: pshufd {{.*}} # [[X2]] = [[X2]][1,1,3,3]
+; SSE2-NEXT: pmuludq %[[X1]], %[[X2]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: addq ${{[0-9]+}}, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: e:
+; SSE41: pmulld {{[0-9]+}}(%rsp), %xmm
+; SSE41-NEXT: addq ${{[0-9]+}}, %rsp
+; SSE41-NEXT: retq
+entry:
+ ; Use a call to force spills.
+ call void @foo()
+ %A = mul <4 x i32> %i, %j
+ ret <4 x i32> %A
}
+
define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind {
- call void @foo()
- %A = mul <2 x i64> %i, %j
- ret <2 x i64> %A
+; ALL-LABEL: f:
+; ALL: pmuludq
+; ALL: pmuludq
+; ALL: pmuludq
+entry:
+ ; Use a call to force spills.
+ call void @foo()
+ %A = mul <2 x i64> %i, %j
+ ret <2 x i64> %A
}
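The three pmuludq matches in the v2i64 tests reflect the schoolbook decomposition of a 64-bit multiply: writing a = 2^32*a_hi + a_lo and b = 2^32*b_hi + b_lo, the low 64 bits of a*b are a_lo*b_lo + 2^32*(a_lo*b_hi + a_hi*b_lo), and pmuludq supplies exactly those unsigned 32x32->64 partial products (the 2^32*2^32 term vanishes modulo 2^64).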
diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll
index e7e29e0..0bdb0ec 100644
--- a/test/CodeGen/X86/pr11334.ll
+++ b/test/CodeGen/X86/pr11334.ll
@@ -15,7 +15,7 @@ define <3 x double> @v3f2d_ext_vec(<3 x float> %v1) nounwind {
entry:
; CHECK: v3f2d_ext_vec
; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
; CHECK: cvtps2pd
; AVX: v3f2d_ext_vec
; AVX: vcvtps2pd
@@ -28,7 +28,7 @@ define <4 x double> @v4f2d_ext_vec(<4 x float> %v1) nounwind {
entry:
; CHECK: v4f2d_ext_vec
; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
; CHECK: cvtps2pd
; AVX: v4f2d_ext_vec
; AVX: vcvtps2pd
@@ -42,9 +42,9 @@ entry:
; CHECK: v8f2d_ext_vec
; CHECK: cvtps2pd
; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
; CHECK: cvtps2pd
; AVX: v8f2d_ext_vec
; AVX: vcvtps2pd
diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll
deleted file mode 100644
index 024b163..0000000
--- a/test/CodeGen/X86/pr12359.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s
-define <16 x i8> @shuf(<16 x i8> %inval1) {
-entry:
- %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4>
- ret <16 x i8> %0
-; CHECK: shuf
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: pshufb
-; CHECK-NEXT: ret
-}
diff --git a/test/CodeGen/X86/pr14161.ll b/test/CodeGen/X86/pr14161.ll
index ff4532e..c2bb8d3 100644
--- a/test/CodeGen/X86/pr14161.ll
+++ b/test/CodeGen/X86/pr14161.ll
@@ -3,6 +3,12 @@
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>)
define <2 x i16> @good(<4 x i32>*, <4 x i8>*) {
+; CHECK-LABEL: good:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movdqa (%rdi), %xmm0
+; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pmovzxwq %xmm0, %xmm0
+; CHECK-NEXT: retq
entry:
%2 = load <4 x i32>* %0, align 16
%3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
@@ -13,13 +19,17 @@ entry:
%8 = bitcast i32 %4 to <2 x i16>
%9 = bitcast i32 %5 to <2 x i16>
ret <2 x i16> %8
-; CHECK: good
-; CHECK: pminud
-; CHECK-NEXT: pmovzxwq
-; CHECK: ret
}
define <2 x i16> @bad(<4 x i32>*, <4 x i8>*) {
+; CHECK-LABEL: bad:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movdqa (%rdi), %xmm0
+; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pextrd $1, %xmm0, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: pmovzxwq %xmm0, %xmm0
+; CHECK-NEXT: retq
entry:
%2 = load <4 x i32>* %0, align 16
%3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
@@ -30,9 +40,4 @@ entry:
%8 = bitcast i32 %4 to <2 x i16>
%9 = bitcast i32 %5 to <2 x i16>
ret <2 x i16> %9
-; CHECK: bad
-; CHECK: pminud
-; CHECK: pextrd
-; CHECK: pmovzxwq
-; CHECK: ret
}
diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll
index c8aaf32..b4dc5fd 100644
--- a/test/CodeGen/X86/pr15267.ll
+++ b/test/CodeGen/X86/pr15267.ll
@@ -48,19 +48,22 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind {
; CHECK: test3
; CHECK: movzbl
-; CHECK: shrl
-; CHECK: andl $1
-; CHECK: andl $1
-; CHECK: vmovd
-; CHECK: pinsrd $1
-; CHECK: shrl $2
-; CHECK: andl $1
-; CHECK: pinsrd $2
-; CHECK: shrl $3
-; CHECK: andl $1
-; CHECK: pinsrd $3
-; CHECK: pslld
-; CHECK: psrad
-; CHECK: pmovsxdq
-; CHECK: pmovsxdq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: vpunpcklqdq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: vpunpcklqdq
+; CHECK: vinsertf128
; CHECK: ret
diff --git a/test/CodeGen/X86/pr18846.ll b/test/CodeGen/X86/pr18846.ll
new file mode 100644
index 0000000..27801be
--- /dev/null
+++ b/test/CodeGen/X86/pr18846.ll
@@ -0,0 +1,139 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; pr18846 - needless AVX spill/reload
+; Test for unnecessary repeated spills due to eliminateRedundantSpills failing
+; to recognise unaligned ymm loads/stores to the stack.
+; Bugpoint-reduced test case.
+
+;CHECK-LABEL: _Z16opt_kernel_cachePfS_S_
+;CHECK-NOT: vmovups {{.*#+}} 32-byte Folded Spill
+;CHECK-NOT: vmovups {{.*#+}} 32-byte Folded Reload
+
+; Function Attrs: uwtable
+define void @_Z16opt_kernel_cachePfS_S_() #0 {
+entry:
+ br label %for.body29
+
+for.body29: ; preds = %for.body29, %entry
+ br i1 undef, label %for.body29, label %for.body65
+
+for.body65: ; preds = %for.body29
+ %0 = load float* undef, align 4, !tbaa !1
+ %vecinit7.i4448 = insertelement <8 x float> undef, float %0, i32 7
+ %1 = load float* null, align 4, !tbaa !1
+ %vecinit7.i4304 = insertelement <8 x float> undef, float %1, i32 7
+ %2 = load float* undef, align 4, !tbaa !1
+ %vecinit7.i4196 = insertelement <8 x float> undef, float %2, i32 7
+ %3 = or i64 0, 16
+ %add.ptr111.sum4096 = add i64 %3, 0
+ %4 = load <8 x float>* null, align 16, !tbaa !5
+ %add.ptr162 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr111.sum4096
+ %__v.i4158 = bitcast float* %add.ptr162 to <8 x float>*
+ %5 = load <8 x float>* %__v.i4158, align 16, !tbaa !5
+ %add.ptr158.sum40975066 = or i64 %add.ptr111.sum4096, 8
+ %add.ptr183 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr158.sum40975066
+ %__v.i4162 = bitcast float* %add.ptr183 to <8 x float>*
+ %6 = load <8 x float>* %__v.i4162, align 16, !tbaa !5
+ %add.ptr200.sum40995067 = or i64 undef, 8
+ %add.ptr225 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr200.sum40995067
+ %__v.i4167 = bitcast float* %add.ptr225 to <8 x float>*
+ %7 = load <8 x float>* %__v.i4167, align 4, !tbaa !5
+ %8 = load <8 x float>* undef, align 16, !tbaa !5
+ %add.ptr242.sum41015068 = or i64 0, 8
+ %add.ptr267 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr242.sum41015068
+ %__v.i4171 = bitcast float* %add.ptr267 to <8 x float>*
+ %9 = load <8 x float>* %__v.i4171, align 4, !tbaa !5
+ %mul.i4690 = fmul <8 x float> %7, undef
+ %add.i4665 = fadd <8 x float> undef, undef
+ %mul.i4616 = fmul <8 x float> %8, undef
+ %mul.i4598 = fmul <8 x float> undef, undef
+ %add.i4597 = fadd <8 x float> undef, %mul.i4598
+ %mul.i4594 = fmul <8 x float> %6, undef
+ %add.i4593 = fadd <8 x float> undef, %mul.i4594
+ %mul.i4578 = fmul <8 x float> %9, undef
+ %add.i4577 = fadd <8 x float> %add.i4593, %mul.i4578
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4577) #1
+ %10 = load <8 x float>* null, align 16, !tbaa !5
+ %11 = load <8 x float>* undef, align 16, !tbaa !5
+ %mul.i4564 = fmul <8 x float> %4, undef
+ %add.i4563 = fadd <8 x float> %10, %mul.i4564
+ %mul.i4560 = fmul <8 x float> %5, undef
+ %add.i4559 = fadd <8 x float> %11, %mul.i4560
+ %add.i4547 = fadd <8 x float> %add.i4563, undef
+ %mul.i4546 = fmul <8 x float> %7, undef
+ %add.i4545 = fadd <8 x float> undef, %mul.i4546
+ %mul.i4544 = fmul <8 x float> %8, undef
+ %add.i4543 = fadd <8 x float> %add.i4559, %mul.i4544
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4547) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4545) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4543) #1
+ %add.i4455 = fadd <8 x float> undef, undef
+ %mul.i4454 = fmul <8 x float> undef, undef
+ %add.i4453 = fadd <8 x float> undef, %mul.i4454
+ %mul.i4440 = fmul <8 x float> zeroinitializer, %vecinit7.i4448
+ %add.i4439 = fadd <8 x float> %add.i4455, %mul.i4440
+ %mul.i4438 = fmul <8 x float> %7, %vecinit7.i4448
+ %add.i4437 = fadd <8 x float> %add.i4453, %mul.i4438
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4439) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4437) #1
+ %add.i4413 = fadd <8 x float> zeroinitializer, undef
+ %mul.i4400 = fmul <8 x float> %8, undef
+ %add.i4399 = fadd <8 x float> undef, %mul.i4400
+ %add.i4397 = fadd <8 x float> %add.i4413, zeroinitializer
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> zeroinitializer) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4399) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4397) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1
+ %mul.i4330 = fmul <8 x float> %7, undef
+ %add.i4329 = fadd <8 x float> undef, %mul.i4330
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4329) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1
+ %mul.i4312 = fmul <8 x float> %4, undef
+ %add.i4311 = fadd <8 x float> undef, %mul.i4312
+ %mul.i4306 = fmul <8 x float> %6, undef
+ %add.i4305 = fadd <8 x float> undef, %mul.i4306
+ %add.i4295 = fadd <8 x float> %add.i4311, undef
+ %mul.i4294 = fmul <8 x float> %7, %vecinit7.i4304
+ %add.i4293 = fadd <8 x float> undef, %mul.i4294
+ %mul.i4292 = fmul <8 x float> %8, %vecinit7.i4304
+ %add.i4291 = fadd <8 x float> undef, %mul.i4292
+ %mul.i4290 = fmul <8 x float> %9, %vecinit7.i4304
+ %add.i4289 = fadd <8 x float> %add.i4305, %mul.i4290
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4295) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4293) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4291) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4289) #1
+ %12 = load <8 x float>* undef, align 16, !tbaa !5
+ %mul.i4274 = fmul <8 x float> undef, undef
+ %add.i4273 = fadd <8 x float> %12, %mul.i4274
+ %mul.i4258 = fmul <8 x float> %7, undef
+ %add.i4257 = fadd <8 x float> %add.i4273, %mul.i4258
+ %mul.i4254 = fmul <8 x float> %9, undef
+ %add.i4253 = fadd <8 x float> undef, %mul.i4254
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4257) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4253) #1
+ %mul.i = fmul <8 x float> %9, %vecinit7.i4196
+ %add.i = fadd <8 x float> undef, %mul.i
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> zeroinitializer) #1
+ call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i) #1
+ unreachable
+}
+
+; Function Attrs: nounwind
+declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) #1
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5 "}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"float", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !3, metadata !3, i64 0}
diff --git a/test/CodeGen/X86/pr21099.ll b/test/CodeGen/X86/pr21099.ll
new file mode 100644
index 0000000..07292c1
--- /dev/null
+++ b/test/CodeGen/X86/pr21099.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -O2 -march=x86-64 -verify-machineinstrs | FileCheck %s
+
+define void @pr21099(i64* %p) {
+; CHECK-LABEL: pr21099
+; CHECK: lock
+; CHECK-NEXT: addq $-2147483648
+; This number is INT32_MIN: 0x80000000UL
+ %1 = atomicrmw add i64* %p, i64 -2147483648 seq_cst
+ ret void
+}
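The choice of INT32_MIN is likely the whole point of the test: x86 likes to rewrite an add of a negative immediate as a sub of its negation, but -2147483648 is the one imm32 whose negation (+2147483648) is not representable as a sign-extended 32-bit immediate, so the rewrite must be suppressed for this value - presumably the invalid instruction that -verify-machineinstrs would otherwise catch.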
diff --git a/test/CodeGen/X86/pr21529.ll b/test/CodeGen/X86/pr21529.ll
new file mode 100644
index 0000000..655bc84
--- /dev/null
+++ b/test/CodeGen/X86/pr21529.ll
@@ -0,0 +1,15 @@
+; RUN: llc -show-mc-encoding < %s | FileCheck %s
+
+; Test that direct object emission selects the 'and' variant with an 8-bit
+; immediate.
+; We used to get this wrong with direct object emission, but not when
+; reading assembly.
+
+; CHECK: andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0]
+
+target triple = "x86_64-pc-linux"
+
+define void @f() {
+ %foo = alloca i8, align 32
+ ret void
+}
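Reading the expected bytes: 0x48 is the REX.W prefix, 0x83 is the group-1 ALU opcode taking a sign-extended 8-bit immediate (the imm32 form is 0x81, three bytes longer), 0xe4 is the ModRM byte selecting AND with %rsp as the operand, and 0xe0 is -32 encoded as an imm8.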
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
new file mode 100644
index 0000000..7fc9890
--- /dev/null
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -march=x86-64 -mattr=+ssse3 | FileCheck %s
+
+; Test that the pshufb mask comment is correct.
+
+define <16 x i8> @test1(<16 x i8> %V) {
+; CHECK-LABEL: test1:
+; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,0,0,2,0,0,0,0,3,0,0,0,0,4]
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 2, i8 0, i8 0, i8 0, i8 0, i8 3, i8 0, i8 0, i8 0, i8 0, i8 4>)
+ ret <16 x i8> %1
+}
+
+; Test that indices larger than the vector size are printed masked to their bottom 4 bits.
+
+define <16 x i8> @test2(<16 x i8> %V) {
+; CHECK-LABEL: test2:
+; CHECK: pshufb {{.*}}# xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2]
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 15, i8 0, i8 0, i8 0, i8 0, i8 16, i8 0, i8 0, i8 0, i8 0, i8 17, i8 0, i8 0, i8 0, i8 0, i8 50>)
+ ret <16 x i8> %1
+}
+
+; Test that indices with bit seven set are printed as zero.
+
+define <16 x i8> @test3(<16 x i8> %V) {
+; CHECK-LABEL: test3:
+; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,15,0,2,0,0],zero,xmm0[0,3,0,0],zero,xmm0[0,4]
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 127, i8 0, i8 2, i8 0, i8 0, i8 128, i8 0, i8 3, i8 0, i8 0, i8 255, i8 0, i8 4>)
+ ret <16 x i8> %1
+}
+
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
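The expected mask comments can be verified by hand. In test2 the out-of-range indices keep only their low four bits: 16 & 15 = 0, 17 & 15 = 1 and 50 & 15 = 2, giving the printed lanes 0, 1 and 2. In test3, 127 has bit 7 clear and so selects lane 127 & 15 = 15, while 128 and 255 both have bit 7 set and therefore print as "zero" lanes.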
diff --git a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
index d8e4572..49d58f4 100644
--- a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
+++ b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
@@ -2,10 +2,12 @@
; Without the last chance recoloring, this test fails with:
; "ran out of registers".
-; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH
+; NOTE: With the fix to PR18883, we don't actually run out of registers here
+; any more, and so those checks are disabled. This test remains only for general coverage.
+; XXX: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH
; Test whether failure due to cutoff for depth is reported
-; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF
+; XXX: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF
; Test whether failure due to cutoff for interference is reported
; RUN: llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 -lcr-max-depth=0 -exhaustive-register-search < %s > %t 2>&1
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
new file mode 100644
index 0000000..83b86ac
--- /dev/null
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -0,0 +1,109 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+use-recip-est,+avx -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
+
+; If the target's divss/divps instructions are substantially
+; slower than rcpss/rcpps with a Newton-Raphson refinement,
+; we should generate the estimate sequence.
+
+; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
+; for details about the accuracy, speed, and implementation
+; differences of x86 reciprocal estimates.
+
+define float @reciprocal_estimate(float %x) #0 {
+ %div = fdiv fast float 1.0, %x
+ ret float %div
+
+; CHECK-LABEL: reciprocal_estimate:
+; CHECK: movss
+; CHECK-NEXT: divss
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate:
+; BTVER2: vrcpss
+; BTVER2: vmulss
+; BTVER2: vsubss
+; BTVER2: vmulss
+; BTVER2: vaddss
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate:
+; REFINE: vrcpss
+; REFINE: vmulss
+; REFINE: vsubss
+; REFINE: vmulss
+; REFINE: vaddss
+; REFINE: vmulss
+; REFINE: vsubss
+; REFINE: vmulss
+; REFINE: vaddss
+; REFINE-NEXT: retq
+}
+
+define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
+ %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+ ret <4 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v4f32:
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v4f32:
+; BTVER2: vrcpps
+; BTVER2: vmulps
+; BTVER2: vsubps
+; BTVER2: vmulps
+; BTVER2: vaddps
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate_v4f32:
+; REFINE: vrcpps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE-NEXT: retq
+}
+
+define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
+ %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
+ ret <8 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v8f32:
+; CHECK: movaps
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v8f32:
+; BTVER2: vrcpps
+; BTVER2: vmulps
+; BTVER2: vsubps
+; BTVER2: vmulps
+; BTVER2: vaddps
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate_v8f32:
+; REFINE: vrcpps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE-NEXT: retq
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
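Each refinement step the BTVER2 and REFINE prefixes match is one Newton-Raphson iteration for the reciprocal: starting from the rcp estimate e, compute e' = e + e*(1 - a*e), algebraically e*(2 - a*e), which is exactly the mul/sub/mul/add sequence being checked. -x86-recip-refinement-steps=2 simply repeats it; since each iteration roughly doubles the number of correct bits and the hardware estimate is good to about 12 bits, one step already approaches single-precision accuracy.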
diff --git a/test/CodeGen/X86/return_zeroext_i2.ll b/test/CodeGen/X86/return_zeroext_i2.ll
new file mode 100644
index 0000000..d535b0c
--- /dev/null
+++ b/test/CodeGen/X86/return_zeroext_i2.ll
@@ -0,0 +1,7 @@
+; RUN: llc -mtriple=i386-pc-win32 < %s | FileCheck %s
+; Check that the testcase does not crash
+define zeroext i2 @crash () {
+ ret i2 0
+}
+; CHECK: xorl %eax, %eax
+; CHECK-NEXT: retl
diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index b82be41..e34ba54 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -1,7 +1,9 @@
; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=X32ABI
; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj
; Just to prevent the alloca from being optimized away
declare void @dummy_use(i32*, i32)
@@ -61,6 +63,26 @@ false:
; X64-NEXT: callq __morestack_allocate_stack_space
; X64: movq %rax, %rdi
+; X32ABI-LABEL: test_basic:
+
+; X32ABI: cmpl %fs:64, %esp
+; X32ABI-NEXT: ja .LBB0_2
+
+; X32ABI: movl $24, %r10d
+; X32ABI-NEXT: movl $0, %r11d
+; X32ABI-NEXT: callq __morestack
+; X32ABI-NEXT: ret
+
+; X32ABI: movl %esp, %[[EDI:edi|eax]]
+; X32ABI: subl %{{.*}}, %[[EDI]]
+; X32ABI-NEXT: cmpl %[[EDI]], %fs:64
+
+; X32ABI: movl %[[EDI]], %esp
+
+; X32ABI: movl %{{.*}}, %edi
+; X32ABI-NEXT: callq __morestack_allocate_stack_space
+; X32ABI: movl %eax, %edi
+
}
attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 9dab3cd..2db7c11 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=X32ABI
; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X32-Darwin
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin
; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW
@@ -9,6 +10,7 @@
; We used to crash with filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj
@@ -51,6 +53,16 @@ define void @test_basic() #0 {
; X64-Linux-NEXT: callq __morestack
; X64-Linux-NEXT: ret
+; X32ABI-LABEL: test_basic:
+
+; X32ABI: cmpl %fs:64, %esp
+; X32ABI-NEXT: ja .LBB0_2
+
+; X32ABI: movl $40, %r10d
+; X32ABI-NEXT: movl $0, %r11d
+; X32ABI-NEXT: callq __morestack
+; X32ABI-NEXT: ret
+
; X32-Darwin-LABEL: test_basic:
; X32-Darwin: movl $432, %ecx
@@ -129,6 +141,16 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
; X64-Linux-NEXT: ret
; X64-Linux-NEXT: movq %rax, %r10
+; X32ABI: cmpl %fs:64, %esp
+; X32ABI-NEXT: ja .LBB1_2
+
+; X32ABI: movl %r10d, %eax
+; X32ABI-NEXT: movl $56, %r10d
+; X32ABI-NEXT: movl $0, %r11d
+; X32ABI-NEXT: callq __morestack
+; X32ABI-NEXT: ret
+; X32ABI-NEXT: movq %rax, %r10
+
; X32-Darwin: movl $432, %edx
; X32-Darwin-NEXT: cmpl %gs:(%edx), %esp
; X32-Darwin-NEXT: ja LBB1_2
@@ -202,6 +224,15 @@ define void @test_large() #0 {
; X64-Linux-NEXT: callq __morestack
; X64-Linux-NEXT: ret
+; X32ABI: leal -40008(%rsp), %r11d
+; X32ABI-NEXT: cmpl %fs:64, %r11d
+; X32ABI-NEXT: ja .LBB2_2
+
+; X32ABI: movl $40008, %r10d
+; X32ABI-NEXT: movl $0, %r11d
+; X32ABI-NEXT: callq __morestack
+; X32ABI-NEXT: ret
+
; X32-Darwin: leal -40012(%esp), %ecx
; X32-Darwin-NEXT: movl $432, %eax
; X32-Darwin-NEXT: cmpl %gs:(%eax), %ecx
@@ -276,6 +307,16 @@ define fastcc void @test_fastcc() #0 {
; X64-Linux-NEXT: callq __morestack
; X64-Linux-NEXT: ret
+; X32ABI-LABEL: test_fastcc:
+
+; X32ABI: cmpl %fs:64, %esp
+; X32ABI-NEXT: ja .LBB3_2
+
+; X32ABI: movl $40, %r10d
+; X32ABI-NEXT: movl $0, %r11d
+; X32ABI-NEXT: callq __morestack
+; X32ABI-NEXT: ret
+
; X32-Darwin-LABEL: test_fastcc:
; X32-Darwin: movl $432, %eax
@@ -356,6 +397,17 @@ define fastcc void @test_fastcc_large() #0 {
; X64-Linux-NEXT: callq __morestack
; X64-Linux-NEXT: ret
+; X32ABI-LABEL: test_fastcc_large:
+
+; X32ABI: leal -40008(%rsp), %r11d
+; X32ABI-NEXT: cmpl %fs:64, %r11d
+; X32ABI-NEXT: ja .LBB4_2
+
+; X32ABI: movl $40008, %r10d
+; X32ABI-NEXT: movl $0, %r11d
+; X32ABI-NEXT: callq __morestack
+; X32ABI-NEXT: ret
+
; X32-Darwin-LABEL: test_fastcc_large:
; X32-Darwin: leal -40012(%esp), %eax
@@ -446,6 +498,9 @@ define void @test_nostack() #0 {
; X64-Linux-LABEL: test_nostack:
; X32-Linux-NOT: callq __morestack
+; X32ABI-LABEL: test_nostack:
+; X32ABI-NOT: callq __morestack
+
; X32-Darwin-LABEL: test_nostack:
; X32-Darwin-NOT: calll __morestack
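All of these prologues follow the same __morestack contract: compare the stack pointer against a limit held in a TLS slot (%fs:64 in the x32 checks), and on the slow path pass the required frame size in %r10 and the size of the incoming arguments in %r11 so __morestack can copy them to the new stack - hence the $40, $40008 and $56 immediates in the checks.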
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index cdd258d..7e6f153 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -357,3 +357,47 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
; ATOM: cmpl $15, %edi
; ATOM: cmovgel %edx
}
+
+; CHECK-LABEL: @trunc_select_miscompile
+; CHECK-NOT: sarb
+define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
+ %tmp1 = select i1 %cc, i32 3, i32 2
+ %tmp2 = shl i32 %a, %tmp1
+ ret i32 %tmp2
+}
+
+define void @test19() {
+; This is a massive reduction of an llvm-stress test case that generates
+; interesting chains feeding setcc and eventually an f32 select operation.
+; It is intended to exercise the SELECT formation in the DAG combine that
+; simplifies a select_cc node. If this test regresses and no longer triggers
+; that code path, it can be deleted.
+;
+; CHECK-LABEL: @test19
+; CHECK: testb
+; CHECK: cmpl
+; CHECK: ucomiss
+
+BB:
+ br label %CF
+
+CF:
+ %Cmp10 = icmp ule i8 undef, undef
+ br i1 %Cmp10, label %CF, label %CF250
+
+CF250:
+ %E12 = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 2
+ %Cmp32 = icmp ugt i1 %Cmp10, false
+ br i1 %Cmp32, label %CF, label %CF242
+
+CF242:
+ %Cmp38 = icmp uge i32 %E12, undef
+ %FC = uitofp i1 %Cmp38 to float
+ %Sl59 = select i1 %Cmp32, float %FC, float undef
+ %Cmp60 = fcmp ugt float undef, undef
+ br i1 %Cmp60, label %CF242, label %CF244
+
+CF244:
+ %B122 = fadd float %Sl59, undef
+ ret void
+}
diff --git a/test/CodeGen/X86/sext-i1.ll b/test/CodeGen/X86/sext-i1.ll
index 64de0ae..1a575db 100644
--- a/test/CodeGen/X86/sext-i1.ll
+++ b/test/CodeGen/X86/sext-i1.ll
@@ -61,3 +61,36 @@ if.end: ; preds = %if.then, %entry
%xor27 = xor i32 undef, %cond ; <i32> [#uses=0]
ret i32 0
}
+
+define i32 @t4(i64 %x) nounwind readnone ssp {
+entry:
+; 32-LABEL: t4:
+; 32: movl
+; 32: orl
+; 32: movl
+; 32: je
+; 32: xorl
+
+; 64-LABEL: t4:
+; 64: cmpq $1
+; 64: sbbl
+ %0 = icmp eq i64 %x, 0
+ %1 = sext i1 %0 to i32
+ ret i32 %1
+}
+
+define i64 @t5(i32 %x) nounwind readnone ssp {
+entry:
+; 32-LABEL: t5:
+; 32: cmpl $1
+; 32: sbbl
+; 32: movl
+
+; 64-LABEL: t5:
+; 64: cmpl $1
+; 64: sbbq
+ %0 = icmp eq i32 %x, 0
+ %1 = sext i1 %0 to i64
+ ret i64 %1
+}
+
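The cmp $1 / sbb pair checked for is the standard branch-free sign extension of (x == 0): cmp computes x - 1, setting the carry flag exactly when x is 0 (the only unsigned value below 1), and sbb of a register with itself yields 0 - 0 - CF, i.e. all ones when x was zero and zero otherwise.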
diff --git a/test/CodeGen/X86/shift-parts.ll b/test/CodeGen/X86/shift-parts.ll
index ddad307..763da63 100644
--- a/test/CodeGen/X86/shift-parts.ll
+++ b/test/CodeGen/X86/shift-parts.ll
@@ -7,13 +7,13 @@
; CHECK: shrdq
-define i32 @int87(i32 %uint64p_8) nounwind {
+define i32 @int87(i32 %uint64p_8, i1 %cond) nounwind {
entry:
%srcval4 = load i320* bitcast (%0* @g_144 to i320*), align 8 ; <i320> [#uses=1]
br label %for.cond
for.cond: ; preds = %for.cond, %entry
- %call3.in.in.in.v = select i1 undef, i320 192, i320 128 ; <i320> [#uses=1]
+ %call3.in.in.in.v = select i1 %cond, i320 192, i320 128 ; <i320> [#uses=1]
%call3.in.in.in = lshr i320 %srcval4, %call3.in.in.in.v ; <i320> [#uses=1]
%call3.in = trunc i320 %call3.in.in.in to i32 ; <i32> [#uses=1]
%tobool = icmp eq i32 %call3.in, 0 ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/shuffle-combine-crash.ll b/test/CodeGen/X86/shuffle-combine-crash.ll
new file mode 100644
index 0000000..6ab7b97
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-combine-crash.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7
+
+; Verify that DAGCombiner does not crash when checking if it is
+; safe to fold the shuffles in function @sample_test according to rule
+; (shuffle (shuffle A, Undef, M0), Undef, M1) -> (shuffle A, Undef, M2)
+;
+; The DAGCombiner avoids folding shuffles if
+; the resulting shuffle DAG node is not legal for the target.
+; That means the shuffle must have a legal type and a legal mask.
+;
+; Before, the DAGCombiner forgot to check whether the resulting shuffle
+; was legal. It instead just called
+; 'X86TargetLowering::isShuffleMaskLegal'; however, that was not enough since
+; that method always expects to be given a valid vector type as input.
+; As a consequence, compiling the function below used to cause a crash.
+
+define void @sample_test() {
+ br i1 undef, label %5, label %1
+
+; <label>:1 ; preds = %0
+ %2 = load <4 x i8>* undef
+ %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+ %4 = shufflevector <4 x i8> %3, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ store <4 x i8> %4, <4 x i8>* undef
+ br label %5
+
+; <label>:5 ; preds = %1, %0
+ ret void
+}
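For reference, had the fold been legal here, the masks would compose as M2[i] = M0[M1[i]]: with M0 = <2,2,0,0> and M1 = <2,3,0,1> this gives M2 = <0,0,2,2>, a single shuffle of %2. The crash was in deciding whether that combined shuffle is legal, not in computing its mask.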
+
diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll
index 2dc8816..1e34a2b 100644
--- a/test/CodeGen/X86/sincos-opt.ll
+++ b/test/CodeGen/X86/sincos-opt.ll
@@ -15,7 +15,8 @@ entry:
; OSX_SINCOS-LABEL: test1:
; OSX_SINCOS: callq ___sincosf_stret
-; OSX_SINCOS: pshufd $1, %xmm0, %xmm1
+; OSX_SINCOS: movaps %xmm0, %xmm1
+; OSX_SINCOS: shufps {{.*}} ## xmm1 = xmm1[1,1,2,3]
; OSX_SINCOS: addss %xmm0, %xmm1
; OSX_NOOPT: test1
diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll
new file mode 100644
index 0000000..6e3a003
--- /dev/null
+++ b/test/CodeGen/X86/sink-blockfreq.ll
@@ -0,0 +1,45 @@
+; RUN: llc -disable-machine-licm -machine-sink-bfi=true -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_BFI
+; RUN: llc -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
+
+; Test that by changing BlockFrequencyInfo we change the order in which
+; machine-sink looks at successor blocks. Without BFI, both G and B have
+; the same loop depth and no instruction is sunk - B is selected but cannot
+; be used, to avoid breaking a non-profitable critical edge. With BFI,
+; the "mul" is sunk into the less frequent block G.
+define i32 @sink_freqinfo(i32 %a, i32 %b) nounwind uwtable ssp {
+; MSINK_BFI-LABEL: sink_freqinfo
+; MSINK_BFI: jl
+; MSINK_BFI-NEXT: ## BB#
+; MSINK_BFI-NEXT: imull
+
+; MSINK_NOBFI-LABEL: sink_freqinfo
+; MSINK_NOBFI: imull
+; MSINK_NOBFI: jl
+entry:
+ br label %B
+
+B:
+ %ee = phi i32 [ 0, %entry ], [ %inc, %F ]
+ %xx = sub i32 %a, %ee
+ %cond0 = icmp slt i32 %xx, 0
+ br i1 %cond0, label %F, label %exit, !prof !0
+
+F:
+ %inc = add nsw i32 %xx, 2
+ %aa = mul nsw i32 %b, %inc
+ %exitcond = icmp slt i32 %inc, %a
+ br i1 %exitcond, label %B, label %G, !prof !1
+
+G:
+ %ii = add nsw i32 %aa, %a
+ %ll = add i32 %b, 45
+ %exitcond2 = icmp sge i32 %ii, %b
+ br i1 %exitcond2, label %G, label %exit, !prof !2
+
+exit:
+ ret i32 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 4, i32 1}
+!1 = metadata !{metadata !"branch_weights", i32 128, i32 1}
+!2 = metadata !{metadata !"branch_weights", i32 1, i32 1}
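Reading the profile metadata: !1 = {128, 1} makes the F -> B back edge about 128 times more likely than F -> G, so G is cold relative to the B/F loop. With BFI the multiply, whose only use is in G, is sunk there (imull appears after the jl in the MSINK_BFI checks); without BFI it stays on the hot path.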
diff --git a/test/CodeGen/X86/sink-out-of-loop.ll b/test/CodeGen/X86/sink-out-of-loop.ll
index c600f92..6757f31 100644
--- a/test/CodeGen/X86/sink-out-of-loop.ll
+++ b/test/CodeGen/X86/sink-out-of-loop.ll
@@ -5,7 +5,7 @@
; MOV32ri outside the loop.
; rdar://11980766
define i32 @sink_succ(i32 %argc, i8** nocapture %argv) nounwind uwtable ssp {
-; CHECK: sink_succ
+; CHECK-LABEL: sink_succ
; CHECK: [[OUTER_LN1:LBB0_[0-9]+]]: ## %preheader
; CHECK: %exit
; CHECK-NOT: movl
@@ -52,3 +52,24 @@ for.body2:
for.end20:
ret i32 0
}
+
+define i32 @sink_out_of_loop(i32 %n, i32* %output) {
+; CHECK-LABEL: sink_out_of_loop:
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %i2, %loop ]
+ %j = mul i32 %i, %i
+ %addr = getelementptr i32* %output, i32 %i
+ store i32 %i, i32* %addr
+ %i2 = add i32 %i, 1
+ %exit_cond = icmp sge i32 %i2, %n
+ br i1 %exit_cond, label %exit, label %loop
+
+exit:
+; CHECK: BB#2
+; CHECK: imull %eax, %eax
+; CHECK: retq
+ ret i32 %j
+}
diff --git a/test/CodeGen/X86/slow-incdec.ll b/test/CodeGen/X86/slow-incdec.ll
new file mode 100644
index 0000000..541d992
--- /dev/null
+++ b/test/CodeGen/X86/slow-incdec.ll
@@ -0,0 +1,80 @@
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-slow-incdec < %s | FileCheck -check-prefix=INCDEC %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+slow-incdec < %s | FileCheck -check-prefix=ADD %s
+
+; check -mattr=-slow-incdec
+; INCDEC-NOT: addl $-1
+; INCDEC: dec
+; INCDEC-NOT: addl $1
+; INCDEC: inc
+
+; check -mattr=+slow-incdec
+; ADD: addl $-1
+; ADD-NOT: dec
+; ADD: addl $1
+; ADD-NOT: inc
+
+; Function Attrs: nounwind readonly
+define i32 @slow_1(i32* nocapture readonly %a, i32 %s) #0 {
+entry:
+ %cmp5 = icmp eq i32 %s, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond: ; preds = %for.body
+ %cmp = icmp eq i32 %dec, 0
+ br i1 %cmp, label %for.end.loopexit, label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.cond
+ %i.06 = phi i32 [ %dec, %for.cond ], [ %s, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32* %a, i32 %i.06
+ %0 = load i32* %arrayidx, align 4, !tbaa !1
+ %cmp1 = icmp eq i32 %0, 0
+;
+ %dec = add nsw i32 %i.06, -1
+ br i1 %cmp1, label %for.end.loopexit, label %for.cond
+
+for.end.loopexit: ; preds = %for.cond, %for.body
+ %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ]
+ ret i32 %i.0.lcssa
+}
+
+; Function Attrs: nounwind readonly
+define i32 @slow_2(i32* nocapture readonly %a, i32 %s) #0 {
+entry:
+ %cmp5 = icmp eq i32 %s, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond: ; preds = %for.body
+ %cmp = icmp eq i32 %inc, 0
+ br i1 %cmp, label %for.end.loopexit, label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.cond
+ %i.06 = phi i32 [ %inc, %for.cond ], [ %s, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32* %a, i32 %i.06
+ %0 = load i32* %arrayidx, align 4, !tbaa !1
+ %cmp1 = icmp eq i32 %0, 0
+ %inc = add nsw i32 %i.06, 1
+ br i1 %cmp1, label %for.end.loopexit, label %for.cond
+
+for.end.loopexit: ; preds = %for.cond, %for.body
+ %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ]
+ ret i32 %i.0.lcssa
+}
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
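Background for the attribute: inc and dec are a byte shorter than the corresponding add forms but write only part of EFLAGS (CF is left untouched), which can cause partial-flag stalls on some microarchitectures; +slow-incdec models such targets. The checks confirm that the loop counter updates become addl $-1 / addl $1 there, while inc and dec are still used by default.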
diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll
new file mode 100644
index 0000000..c052ad2
--- /dev/null
+++ b/test/CodeGen/X86/splat-for-size.ll
@@ -0,0 +1,141 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX2
+
+; Check constant loads of every 128-bit and 256-bit vector type
+; for size optimization using splat ops available with AVX and AVX2.
+
+; There is no AVX broadcast from double to 128-bit vector because movddup has been around since SSE3 (grrr).
+define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
+ %add = fadd <2 x double> %x, <double 1.0, double 1.0>
+ ret <2 x double> %add
+; CHECK-LABEL: splat_v2f64
+; CHECK: vmovddup
+; CHECK: vaddpd
+; CHECK-NEXT: retq
+}
+
+define <4 x double> @splat_v4f64(<4 x double> %x) #0 {
+ %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0>
+ ret <4 x double> %add
+; CHECK-LABEL: splat_v4f64
+; CHECK: vbroadcastsd
+; CHECK-NEXT: vaddpd
+; CHECK-NEXT: retq
+}
+
+define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
+ %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+ ret <4 x float> %add
+; CHECK-LABEL: splat_v4f32
+; CHECK: vbroadcastss
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+define <8 x float> @splat_v8f32(<8 x float> %x) #0 {
+ %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+ ret <8 x float> %add
+; CHECK-LABEL: splat_v8f32
+; CHECK: vbroadcastss
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value.
+; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq.
+define <2 x i64> @splat_v2i64(<2 x i64> %x) #0 {
+ %add = add <2 x i64> %x, <i64 1, i64 1>
+ ret <2 x i64> %add
+; CHECK-LABEL: splat_v2i64
+; CHECK: vmovddup
+; CHECK: vpaddq
+; CHECK-NEXT: retq
+}
+
+; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors,
+; and then we fake it: use vmovddup to splat 64-bit value.
+define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
+ %add = add <4 x i64> %x, <i64 1, i64 1, i64 1, i64 1>
+ ret <4 x i64> %add
+; CHECK-LABEL: splat_v4i64
+; AVX: vmovddup
+; AVX: vpaddq
+; AVX: vpaddq
+; AVX2: vpbroadcastq
+; AVX2: vpaddq
+; CHECK: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <4 x i32> @splat_v4i32(<4 x i32> %x) #0 {
+ %add = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %add
+; CHECK-LABEL: splat_v4i32
+; AVX: vbroadcastss
+; AVX2: vpbroadcastd
+; CHECK-NEXT: vpaddd
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
+ %add = add <8 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %add
+; CHECK-LABEL: splat_v8i32
+; AVX: vbroadcastss
+; AVX: vpaddd
+; AVX: vpaddd
+; AVX2: vpbroadcastd
+; AVX2: vpaddd
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <8 x i16> @splat_v8i16(<8 x i16> %x) #0 {
+ %add = add <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %add
+; CHECK-LABEL: splat_v8i16
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastw
+; CHECK: vpaddw
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
+ %add = add <16 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <16 x i16> %add
+; CHECK-LABEL: splat_v16i16
+; AVX-NOT: broadcast
+; AVX: vpaddw
+; AVX: vpaddw
+; AVX2: vpbroadcastw
+; AVX2: vpaddw
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <16 x i8> @splat_v16i8(<16 x i8> %x) #0 {
+ %add = add <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %add
+; CHECK-LABEL: splat_v16i8
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastb
+; CHECK: vpaddb
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
+ %add = add <32 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <32 x i8> %add
+; CHECK-LABEL: splat_v32i8
+; AVX-NOT: broadcast
+; AVX: vpaddb
+; AVX: vpaddb
+; AVX2: vpbroadcastb
+; AVX2: vpaddb
+; CHECK: retq
+}
+
+attributes #0 = { optsize }
diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll
deleted file mode 100644
index 4d59b9c..0000000
--- a/test/CodeGen/X86/splat-scalar-load.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -mcpu=nehalem | FileCheck %s
-; rdar://7434544
-
-define <2 x i64> @t2() nounwind {
-entry:
-; CHECK-LABEL: t2:
-; CHECK: pshufd $85, (%esp), %xmm0
- %array = alloca [8 x float], align 4
- %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 1
- %tmp2 = load float* %arrayidx
- %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0
- %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1
- %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2
- %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3
- %0 = bitcast <4 x float> %vecinit9 to <2 x i64>
- ret <2 x i64> %0
-}
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index fc79e31..24b175e 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
; generated using "clang -S -O2 -ffast-math -emit-llvm sqrt.c" from
; #include <math.h>
@@ -52,9 +53,80 @@ entry:
ret x86_fp80 %call
}
-; Function Attrs: nounwind readnone
declare x86_fp80 @__sqrtl_finite(x86_fp80) #1
+declare float @llvm.sqrt.f32(float) #1
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1
+
+; If the target's sqrtss and divss instructions are substantially
+; slower than rsqrtss with a Newton-Raphson refinement, we should
+; generate the estimate sequence.
+
+define float @reciprocal_square_root(float %x) #0 {
+ %sqrt = tail call float @llvm.sqrt.f32(float %x)
+ %div = fdiv fast float 1.0, %sqrt
+ ret float %div
+
+; CHECK-LABEL: reciprocal_square_root:
+; CHECK: sqrtss
+; CHECK-NEXT: movss
+; CHECK-NEXT: divss
+; CHECK-NEXT: retq
+; BTVER2-LABEL: reciprocal_square_root:
+; BTVER2: vrsqrtss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vaddss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: retq
+}
+
+define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
+ %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
+ %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
+ ret <4 x float> %div
+
+; CHECK-LABEL: reciprocal_square_root_v4f32:
+; CHECK: sqrtps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: retq
+; BTVER2-LABEL: reciprocal_square_root_v4f32:
+; BTVER2: vrsqrtps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: retq
+}
+
+define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 {
+ %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
+ %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
+ ret <8 x float> %div
+
+; CHECK-LABEL: reciprocal_square_root_v8f32:
+; CHECK: sqrtps
+; CHECK-NEXT: sqrtps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: divps
+; CHECK-NEXT: retq
+; BTVER2-LABEL: reciprocal_square_root_v8f32:
+; BTVER2: vrsqrtps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: retq
+}
+
+
attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
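As with the reciprocal tests, the BTVER2 sequences are one Newton-Raphson step for 1/sqrt(a): from the rsqrt estimate e, compute e' = e*(3 - a*e*e)/2, emitted as (-0.5*e) * (a*e*e - 3) - three multiplies, an add of the -3 constant, and a final multiply, matching the five-instruction pattern being checked.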
diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll
index 2351fd6..396da0f 100644
--- a/test/CodeGen/X86/sse-align-12.ll
+++ b/test/CodeGen/X86/sse-align-12.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=x86-64 -mcpu=nehalem | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=nehalem | FileCheck %s
-; CHECK-LABEL: a:
-; CHECK: movdqu
-; CHECK: pshufd
define <4 x float> @a(<4 x float>* %y) nounwind {
+; CHECK-LABEL: a:
+; CHECK: # BB#0:
+; CHECK-NEXT: movups (%rdi), %xmm0
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT: retq
%x = load <4 x float>* %y, align 4
%a = extractelement <4 x float> %x, i32 0
%b = extractelement <4 x float> %x, i32 1
@@ -16,10 +18,12 @@ define <4 x float> @a(<4 x float>* %y) nounwind {
ret <4 x float> %s
}
-; CHECK-LABEL: b:
-; CHECK: movups
-; CHECK: unpckhps
define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind {
+; CHECK-LABEL: b:
+; CHECK: # BB#0:
+; CHECK-NEXT: movups (%rdi), %xmm1
+; CHECK-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: retq
%x = load <4 x float>* %y, align 4
%a = extractelement <4 x float> %x, i32 2
%b = extractelement <4 x float> %x, i32 3
@@ -32,10 +36,12 @@ define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind {
ret <4 x float> %s
}
-; CHECK-LABEL: c:
-; CHECK: movupd
-; CHECK: shufpd
define <2 x double> @c(<2 x double>* %y) nounwind {
+; CHECK-LABEL: c:
+; CHECK: # BB#0:
+; CHECK-NEXT: movupd (%rdi), %xmm0
+; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: retq
%x = load <2 x double>* %y, align 8
%a = extractelement <2 x double> %x, i32 0
%c = extractelement <2 x double> %x, i32 1
@@ -44,10 +50,12 @@ define <2 x double> @c(<2 x double>* %y) nounwind {
ret <2 x double> %r
}
-; CHECK-LABEL: d:
-; CHECK: movupd
-; CHECK: unpckhpd
define <2 x double> @d(<2 x double>* %y, <2 x double> %z) nounwind {
+; CHECK-LABEL: d:
+; CHECK: # BB#0:
+; CHECK-NEXT: movupd (%rdi), %xmm1
+; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT: retq
%x = load <2 x double>* %y, align 8
%a = extractelement <2 x double> %x, i32 1
%c = extractelement <2 x double> %z, i32 1
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 5122c44..da36a42 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -138,8 +138,7 @@ define double @ole_inverse(double %x, double %y) nounwind {
; CHECK-NEXT: ret
; UNSAFE-LABEL: ogt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: ogt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -157,8 +156,7 @@ define double @ogt_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE-LABEL: olt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: olt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -177,8 +175,7 @@ define double @olt_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE-LABEL: ogt_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: ogt_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -198,8 +195,7 @@ define double @ogt_inverse_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE-LABEL: olt_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: olt_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -217,8 +213,7 @@ define double @olt_inverse_x(double %x) nounwind {
; CHECK-NEXT: andpd
; UNSAFE-LABEL: oge_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: oge_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -235,8 +230,7 @@ define double @oge_x(double %x) nounwind {
; CHECK-NEXT: andpd
; UNSAFE-LABEL: ole_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: ole_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -253,8 +247,7 @@ define double @ole_x(double %x) nounwind {
; CHECK-NEXT: andnpd
; UNSAFE-LABEL: oge_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: oge_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -271,8 +264,7 @@ define double @oge_inverse_x(double %x) nounwind {
; CHECK: cmplesd %xmm
; UNSAFE-LABEL: ole_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: ole_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -412,8 +404,7 @@ define double @ule_inverse(double %x, double %y) nounwind {
; CHECK-NEXT: andpd
; UNSAFE-LABEL: ugt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: ugt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -430,8 +421,7 @@ define double @ugt_x(double %x) nounwind {
; CHECK-NEXT: andpd
; UNSAFE-LABEL: ult_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: ult_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -448,8 +438,7 @@ define double @ult_x(double %x) nounwind {
; CHECK-NEXT: andnpd
; UNSAFE-LABEL: ugt_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: ugt_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -467,8 +456,7 @@ define double @ugt_inverse_x(double %x) nounwind {
; CHECK-NEXT: andnpd
; UNSAFE-LABEL: ult_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: ult_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -488,8 +476,7 @@ define double @ult_inverse_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE-LABEL: uge_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: uge_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -508,8 +495,7 @@ define double @uge_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE-LABEL: ule_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: ule_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -527,8 +513,7 @@ define double @ule_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE-LABEL: uge_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: uge_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -547,8 +532,7 @@ define double @uge_inverse_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE-LABEL: ule_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE-LABEL: ule_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
deleted file mode 100644
index 600ee1b..0000000
--- a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
+++ /dev/null
@@ -1,423 +0,0 @@
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
-
-; Ensure that the backend selects SSE/AVX scalar fp instructions
-; from a packed fp instrution plus a vector insert.
-
-
-define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fadd <4 x float> %a, %b
- %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_add_ss
-; SSE2: addss %xmm1, %xmm0
-; AVX: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fsub <4 x float> %a, %b
- %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_sub_ss
-; SSE2: subss %xmm1, %xmm0
-; AVX: vsubss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fmul <4 x float> %a, %b
- %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_mul_ss
-; SSE2: mulss %xmm1, %xmm0
-; AVX: vmulss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fdiv <4 x float> %a, %b
- %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_div_ss
-; SSE2: divss %xmm1, %xmm0
-; AVX: vdivss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fadd <2 x double> %a, %b
- %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_add_sd
-; SSE2: addsd %xmm1, %xmm0
-; AVX: vaddsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fsub <2 x double> %a, %b
- %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_sub_sd
-; SSE2: subsd %xmm1, %xmm0
-; AVX: vsubsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fmul <2 x double> %a, %b
- %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_mul_sd
-; SSE2: mulsd %xmm1, %xmm0
-; AVX: vmulsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fdiv <2 x double> %a, %b
- %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_div_sd
-; SSE2: divsd %xmm1, %xmm0
-; AVX: vdivsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fadd <4 x float> %b, %a
- %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_add_ss
-; SSE2: addss %xmm0, %xmm1
-; AVX: vaddss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fsub <4 x float> %b, %a
- %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_sub_ss
-; SSE2: subss %xmm0, %xmm1
-; AVX: vsubss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fmul <4 x float> %b, %a
- %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_mul_ss
-; SSE2: mulss %xmm0, %xmm1
-; AVX: vmulss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fdiv <4 x float> %b, %a
- %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_div_ss
-; SSE2: divss %xmm0, %xmm1
-; AVX: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fadd <2 x double> %b, %a
- %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_add_sd
-; SSE2: addsd %xmm0, %xmm1
-; AVX: vaddsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fsub <2 x double> %b, %a
- %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_sub_sd
-; SSE2: subsd %xmm0, %xmm1
-; AVX: vsubsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fmul <2 x double> %b, %a
- %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_mul_sd
-; SSE2: mulsd %xmm0, %xmm1
-; AVX: vmulsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fdiv <2 x double> %b, %a
- %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_div_sd
-; SSE2: divsd %xmm0, %xmm1
-; AVX: vdivsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <4 x float> @test3_add_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fadd <4 x float> %a, %b
- %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_add_ss
-; SSE2: addss %xmm1, %xmm0
-; AVX: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test3_sub_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fsub <4 x float> %a, %b
- %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_sub_ss
-; SSE2: subss %xmm1, %xmm0
-; AVX: vsubss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test3_mul_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fmul <4 x float> %a, %b
- %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_mul_ss
-; SSE2: mulss %xmm1, %xmm0
-; AVX: vmulss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test3_div_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fdiv <4 x float> %a, %b
- %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_div_ss
-; SSE2: divss %xmm1, %xmm0
-; AVX: vdivss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test3_add_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fadd <2 x double> %a, %b
- %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_add_sd
-; SSE2: addsd %xmm1, %xmm0
-; AVX: vaddsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test3_sub_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fsub <2 x double> %a, %b
- %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_sub_sd
-; SSE2: subsd %xmm1, %xmm0
-; AVX: vsubsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test3_mul_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fmul <2 x double> %a, %b
- %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_mul_sd
-; SSE2: mulsd %xmm1, %xmm0
-; AVX: vmulsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test3_div_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fdiv <2 x double> %a, %b
- %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_div_sd
-; SSE2: divsd %xmm1, %xmm0
-; AVX: vdivsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <4 x float> @test4_add_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fadd <4 x float> %b, %a
- %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_add_ss
-; SSE2: addss %xmm0, %xmm1
-; AVX: vaddss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test4_sub_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fsub <4 x float> %b, %a
- %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_sub_ss
-; SSE2: subss %xmm0, %xmm1
-; AVX: vsubss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test4_mul_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fmul <4 x float> %b, %a
- %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_mul_ss
-; SSE2: mulss %xmm0, %xmm1
-; AVX: vmulss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test4_div_ss(<4 x float> %a, <4 x float> %b) {
- %1 = fdiv <4 x float> %b, %a
- %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
- ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_div_ss
-; SSE2: divss %xmm0, %xmm1
-; AVX: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test4_add_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fadd <2 x double> %b, %a
- %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_add_sd
-; SSE2: addsd %xmm0, %xmm1
-; AVX: vaddsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test4_sub_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fsub <2 x double> %b, %a
- %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_sub_sd
-; SSE2: subsd %xmm0, %xmm1
-; AVX: vsubsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test4_mul_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fmul <2 x double> %b, %a
- %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_mul_sd
-; SSE2: mulsd %xmm0, %xmm1
-; AVX: vmulsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test4_div_sd(<2 x double> %a, <2 x double> %b) {
- %1 = fdiv <2 x double> %b, %a
- %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
- ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_div_sd
-; SSE2: divsd %xmm0, %xmm1
-; AVX: vdivsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 3949a83..b122ef6 100644
--- a/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1,13 +1,23 @@
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
+; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s
+; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s
+; RUN: llc -mcpu=x86-64 -mattr=+avx < %s | FileCheck --check-prefix=AVX %s
+
+target triple = "x86_64-unknown-unknown"
; Ensure that the backend no longer emits unnecessary vector insert
; instructions immediately after SSE scalar fp instructions
; like addss or mulss.
-
define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_add_ss:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_add_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
%2 = extractelement <4 x float> %a, i32 0
%add = fadd float %2, %1
@@ -15,14 +25,16 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test_add_ss
-; SSE2: addss %xmm1, %xmm0
-; AVX: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_sub_ss:
+; SSE: # BB#0:
+; SSE-NEXT: subss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_sub_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
%2 = extractelement <4 x float> %a, i32 0
%sub = fsub float %2, %1
@@ -30,13 +42,16 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test_sub_ss
-; SSE2: subss %xmm1, %xmm0
-; AVX: vsubss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_mul_ss:
+; SSE: # BB#0:
+; SSE-NEXT: mulss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_mul_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
%2 = extractelement <4 x float> %a, i32 0
%mul = fmul float %2, %1
@@ -44,14 +59,16 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test_mul_ss
-; SSE2: mulss %xmm1, %xmm0
-; AVX: vmulss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_div_ss:
+; SSE: # BB#0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_div_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
%2 = extractelement <4 x float> %a, i32 0
%div = fdiv float %2, %1
@@ -59,14 +76,16 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test_div_ss
-; SSE2: divss %xmm1, %xmm0
-; AVX: vdivss %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_add_sd:
+; SSE: # BB#0:
+; SSE-NEXT: addsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_add_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <2 x double> %b, i32 0
%2 = extractelement <2 x double> %a, i32 0
%add = fadd double %2, %1
@@ -74,14 +93,16 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %3
}
-; CHECK-LABEL: test_add_sd
-; SSE2: addsd %xmm1, %xmm0
-; AVX: vaddsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_sub_sd:
+; SSE: # BB#0:
+; SSE-NEXT: subsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_sub_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <2 x double> %b, i32 0
%2 = extractelement <2 x double> %a, i32 0
%sub = fsub double %2, %1
@@ -89,14 +110,16 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %3
}
-; CHECK-LABEL: test_sub_sd
-; SSE2: subsd %xmm1, %xmm0
-; AVX: vsubsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_mul_sd:
+; SSE: # BB#0:
+; SSE-NEXT: mulsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_mul_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <2 x double> %b, i32 0
%2 = extractelement <2 x double> %a, i32 0
%mul = fmul double %2, %1
@@ -104,14 +127,16 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %3
}
-; CHECK-LABEL: test_mul_sd
-; SSE2: mulsd %xmm1, %xmm0
-; AVX: vmulsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_div_sd:
+; SSE: # BB#0:
+; SSE-NEXT: divsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_div_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <2 x double> %b, i32 0
%2 = extractelement <2 x double> %a, i32 0
%div = fdiv double %2, %1
@@ -119,14 +144,17 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %3
}
-; CHECK-LABEL: test_div_sd
-; SSE2: divsd %xmm1, %xmm0
-; AVX: vdivsd %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_add_ss:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2_add_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %a, i32 0
%2 = extractelement <4 x float> %b, i32 0
%add = fadd float %1, %2
@@ -134,14 +162,17 @@ define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test2_add_ss
-; SSE2: addss %xmm0, %xmm1
-; AVX: vaddss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_sub_ss:
+; SSE: # BB#0:
+; SSE-NEXT: subss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2_sub_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %a, i32 0
%2 = extractelement <4 x float> %b, i32 0
%sub = fsub float %2, %1
@@ -149,14 +180,17 @@ define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test2_sub_ss
-; SSE2: subss %xmm0, %xmm1
-; AVX: vsubss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_mul_ss:
+; SSE: # BB#0:
+; SSE-NEXT: mulss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2_mul_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %a, i32 0
%2 = extractelement <4 x float> %b, i32 0
%mul = fmul float %1, %2
@@ -164,14 +198,17 @@ define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test2_mul_ss
-; SSE2: mulss %xmm0, %xmm1
-; AVX: vmulss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_div_ss:
+; SSE: # BB#0:
+; SSE-NEXT: divss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2_div_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %a, i32 0
%2 = extractelement <4 x float> %b, i32 0
%div = fdiv float %2, %1
@@ -179,14 +216,17 @@ define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test2_div_ss
-; SSE2: divss %xmm0, %xmm1
-; AVX: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_add_sd:
+; SSE: # BB#0:
+; SSE-NEXT: addsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2_add_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <2 x double> %a, i32 0
%2 = extractelement <2 x double> %b, i32 0
%add = fadd double %1, %2
@@ -194,14 +234,17 @@ define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %3
}
-; CHECK-LABEL: test2_add_sd
-; SSE2: addsd %xmm0, %xmm1
-; AVX: vaddsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_sub_sd:
+; SSE: # BB#0:
+; SSE-NEXT: subsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2_sub_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <2 x double> %a, i32 0
%2 = extractelement <2 x double> %b, i32 0
%sub = fsub double %2, %1
@@ -209,14 +252,17 @@ define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %3
}
-; CHECK-LABEL: test2_sub_sd
-; SSE2: subsd %xmm0, %xmm1
-; AVX: vsubsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_mul_sd:
+; SSE: # BB#0:
+; SSE-NEXT: mulsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2_mul_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <2 x double> %a, i32 0
%2 = extractelement <2 x double> %b, i32 0
%mul = fmul double %1, %2
@@ -224,14 +270,17 @@ define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %3
}
-; CHECK-LABEL: test2_mul_sd
-; SSE2: mulsd %xmm0, %xmm1
-; AVX: vmulsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_div_sd:
+; SSE: # BB#0:
+; SSE-NEXT: divsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2_div_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <2 x double> %a, i32 0
%2 = extractelement <2 x double> %b, i32 0
%div = fdiv double %2, %1
@@ -239,14 +288,18 @@ define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %3
}
-; CHECK-LABEL: test2_div_sd
-; SSE2: divsd %xmm0, %xmm1
-; AVX: vdivsd %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_add_ss:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm0, %xmm1
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_multiple_add_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
%2 = extractelement <4 x float> %a, i32 0
%add = fadd float %2, %1
@@ -255,14 +308,19 @@ define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test_multiple_add_ss
-; CHECK: addss
-; CHECK: addss
-; CHECK-NOT: movss
-; CHECK: ret
-
-
define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_sub_ss:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: subss %xmm1, %xmm2
+; SSE-NEXT: subss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_multiple_sub_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
%2 = extractelement <4 x float> %a, i32 0
%sub = fsub float %2, %1
@@ -271,14 +329,18 @@ define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test_multiple_sub_ss
-; CHECK: subss
-; CHECK: subss
-; CHECK-NOT: movss
-; CHECK: ret
-
-
define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_mul_ss:
+; SSE: # BB#0:
+; SSE-NEXT: mulss %xmm0, %xmm1
+; SSE-NEXT: mulss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_multiple_mul_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
%2 = extractelement <4 x float> %a, i32 0
%mul = fmul float %2, %1
@@ -287,13 +349,19 @@ define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test_multiple_mul_ss
-; CHECK: mulss
-; CHECK: mulss
-; CHECK-NOT: movss
-; CHECK: ret
-
define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_div_ss:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: divss %xmm1, %xmm2
+; SSE-NEXT: divss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_multiple_div_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
%2 = extractelement <4 x float> %a, i32 0
%div = fdiv float %2, %1
@@ -302,9 +370,501 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
-; CHECK-LABEL: test_multiple_div_ss
-; CHECK: divss
-; CHECK: divss
-; CHECK-NOT: movss
-; CHECK: ret
+; Ensure that the backend selects SSE/AVX scalar fp instructions
+; from a packed fp instruction plus a vector insert.
+
+define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_add_ss:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test_add_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fadd <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_sub_ss:
+; SSE: # BB#0:
+; SSE-NEXT: subss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test_sub_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fsub <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_mul_ss:
+; SSE: # BB#0:
+; SSE-NEXT: mulss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test_mul_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fmul <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_div_ss:
+; SSE: # BB#0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test_div_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fdiv <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_add_sd:
+; SSE: # BB#0:
+; SSE-NEXT: addsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test_add_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fadd <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_sub_sd:
+; SSE: # BB#0:
+; SSE-NEXT: subsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test_sub_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fsub <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_mul_sd:
+; SSE: # BB#0:
+; SSE-NEXT: mulsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test_mul_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fmul <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_div_sd:
+; SSE: # BB#0:
+; SSE-NEXT: divsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test_div_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fdiv <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_add_ss:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test2_add_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fadd <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_sub_ss:
+; SSE: # BB#0:
+; SSE-NEXT: subss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test2_sub_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fsub <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_mul_ss:
+; SSE: # BB#0:
+; SSE-NEXT: mulss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test2_mul_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fmul <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_div_ss:
+; SSE: # BB#0:
+; SSE-NEXT: divss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test2_div_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fdiv <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_add_sd:
+; SSE: # BB#0:
+; SSE-NEXT: addsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test2_add_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fadd <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_sub_sd:
+; SSE: # BB#0:
+; SSE-NEXT: subsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test2_sub_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fsub <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_mul_sd:
+; SSE: # BB#0:
+; SSE-NEXT: mulsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test2_mul_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fmul <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_div_sd:
+; SSE: # BB#0:
+; SSE-NEXT: divsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test2_div_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fdiv <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_add_ss:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test3_add_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fadd <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_sub_ss:
+; SSE: # BB#0:
+; SSE-NEXT: subss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test3_sub_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fsub <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_mul_ss:
+; SSE: # BB#0:
+; SSE-NEXT: mulss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test3_mul_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fmul <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_div_ss:
+; SSE: # BB#0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test3_div_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fdiv <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_add_sd:
+; SSE: # BB#0:
+; SSE-NEXT: addsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test3_add_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fadd <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_sub_sd:
+; SSE: # BB#0:
+; SSE-NEXT: subsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test3_sub_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fsub <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_mul_sd:
+; SSE: # BB#0:
+; SSE-NEXT: mulsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test3_mul_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fmul <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_div_sd:
+; SSE: # BB#0:
+; SSE-NEXT: divsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test3_div_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = fdiv <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_add_ss:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test4_add_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fadd <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_sub_ss:
+; SSE: # BB#0:
+; SSE-NEXT: subss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test4_sub_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fsub <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_mul_ss:
+; SSE: # BB#0:
+; SSE-NEXT: mulss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test4_mul_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fmul <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_div_ss:
+; SSE: # BB#0:
+; SSE-NEXT: divss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test4_div_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fdiv <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_add_sd:
+; SSE: # BB#0:
+; SSE-NEXT: addsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test4_add_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fadd <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_sub_sd:
+; SSE: # BB#0:
+; SSE-NEXT: subsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test4_sub_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fsub <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_mul_sd:
+; SSE: # BB#0:
+; SSE-NEXT: mulsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test4_mul_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fmul <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_div_sd:
+; SSE: # BB#0:
+; SSE-NEXT: divsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_test4_div_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fdiv <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index 183297e..fd35e75 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -1,17 +1,6 @@
; Tests for SSE1 and below, without SSE2+.
-; RUN: llc < %s -march=x86 -mcpu=pentium3 -O3 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=-sse2,+sse -O3 | FileCheck %s
-
-define <8 x i16> @test1(<8 x i32> %a) nounwind {
-; CHECK: test1
- ret <8 x i16> zeroinitializer
-}
-
-define <8 x i16> @test2(<8 x i32> %a) nounwind {
-; CHECK: test2
- %c = trunc <8 x i32> %a to <8 x i16> ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %c
-}
+; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mcpu=pentium3 -O3 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=-sse2,+sse -O3 | FileCheck %s
; PR7993
;define <4 x i32> @test3(<4 x i16> %a) nounwind {
@@ -23,6 +12,15 @@ define <8 x i16> @test2(<8 x i32> %a) nounwind {
; vector that this ends up returning.
; rdar://8368414
define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movaps %xmm0, %xmm2
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; CHECK-NEXT: addss %xmm1, %xmm0
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT: subss %xmm1, %xmm2
+; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT: ret
entry:
%tmp7 = extractelement <2 x float> %A, i32 0
%tmp5 = extractelement <2 x float> %A, i32 1
@@ -33,15 +31,6 @@ entry:
%tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
%tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
ret <2 x float> %tmp9
-; CHECK-LABEL: test4:
-; CHECK-NOT: shufps $16
-; CHECK: shufps $1,
-; CHECK-NOT: shufps $16
-; CHECK: shufps $1,
-; CHECK-NOT: shufps $16
-; CHECK: unpcklps
-; CHECK-NOT: shufps $16
-; CHECK: ret
}
; We used to get stuck in type legalization for this example when lowering the
@@ -50,8 +39,9 @@ entry:
; condition operand and widening the resulting vselect for the v4f32 result.
; PR18036
-; CHECK-LABEL: vselect
define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
+; CHECK-LABEL: vselect:
+; CHECK: ret
entry:
%a1 = icmp eq <4 x i32> %q, zeroinitializer
   %a14 = select <4 x i1> %a1, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00> , <4 x float> zeroinitializer
diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll
deleted file mode 100644
index c63ff72..0000000
--- a/test/CodeGen/X86/sse2-blend.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
-
-; CHECK-LABEL: vsel_float
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
-define void@vsel_float(<4 x float>* %v1, <4 x float>* %v2) {
- %A = load <4 x float>* %v1
- %B = load <4 x float>* %v2
- %vsel = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
- store <4 x float > %vsel, <4 x float>* %v1
- ret void
-}
-
-; CHECK-LABEL: vsel_i32
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
-define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
- %A = load <4 x i32>* %v1
- %B = load <4 x i32>* %v2
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B
- store <4 x i32 > %vsel, <4 x i32>* %v1
- ret void
-}
-
-; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK-LABEL: vsel_i64
-; CHECK: andnps
-; CHECK: orps
-; CHECK: ret
-
-define void@vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) {
- %A = load <2 x i64>* %v1
- %B = load <2 x i64>* %v2
- %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %A, <2 x i64> %B
- store <2 x i64 > %vsel, <2 x i64>* %v1
- ret void
-}
-
-; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK-LABEL: vsel_double
-; CHECK: andnps
-; CHECK: orps
-; CHECK: ret
-
-define void@vsel_double(<2 x double>* %v1, <2 x double>* %v2) {
- %A = load <2 x double>* %v1
- %B = load <2 x double>* %v2
- %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %A, <2 x double> %B
- store <2 x double > %vsel, <2 x double>* %v1
- ret void
-}
-
-
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index c906ecd..c4d9e6d 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -408,21 +408,21 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
- ; CHECK: pslldq
- %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
- ; CHECK: pslldq
- %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+ ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+ ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
@@ -504,21 +504,21 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
- ; CHECK: psrldq
- %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
- ; CHECK: psrldq
- %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+ ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+ ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/sse2-mul.ll b/test/CodeGen/X86/sse2-mul.ll
deleted file mode 100644
index e066368..0000000
--- a/test/CodeGen/X86/sse2-mul.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
-
-define <4 x i32> @test1(<4 x i32> %x, <4 x i32> %y) {
- %m = mul <4 x i32> %x, %y
- ret <4 x i32> %m
-; CHECK-LABEL: test1:
-; CHECK: pshufd $49
-; CHECK: pmuludq
-; CHECK: pshufd $49
-; CHECK: pmuludq
-; CHECK: shufps $-120
-; CHECK: pshufd $-40
-; CHECK: ret
-}
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index e8d3d6f..b7db6cb 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -2,39 +2,48 @@
; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movapd (%ecx), %xmm0
+; CHECK-NEXT: movlpd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: movapd %xmm0, (%eax)
+; CHECK-NEXT: retl
%tmp3 = load <2 x double>* %A, align 16
%tmp7 = insertelement <2 x double> undef, double %B, i32 0
%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
store <2 x double> %tmp9, <2 x double>* %r, align 16
ret void
-
-; CHECK-LABEL: test1:
-; CHECK: movl 4(%esp), %eax
-; CHECK-NEXT: movl 8(%esp), %ecx
-; CHECK-NEXT: movapd (%ecx), %xmm0
-; CHECK-NEXT: movlpd 12(%esp), %xmm0
-; CHECK-NEXT: movapd %xmm0, (%eax)
-; CHECK-NEXT: ret
}
define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movapd (%ecx), %xmm0
+; CHECK-NEXT: movhpd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: movapd %xmm0, (%eax)
+; CHECK-NEXT: retl
%tmp3 = load <2 x double>* %A, align 16
%tmp7 = insertelement <2 x double> undef, double %B, i32 0
%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
store <2 x double> %tmp9, <2 x double>* %r, align 16
ret void
-
-; CHECK-LABEL: test2:
-; CHECK: movl 4(%esp), %eax
-; CHECK: movl 8(%esp), %ecx
-; CHECK-NEXT: movapd (%ecx), %xmm0
-; CHECK-NEXT: movhpd 12(%esp), %xmm0
-; CHECK-NEXT: movapd %xmm0, (%eax)
-; CHECK-NEXT: ret
}
define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movaps (%edx), %xmm0
+; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: movaps %xmm0, (%eax)
+; CHECK-NEXT: retl
%tmp = load <4 x float>* %B ; <<4 x float>> [#uses=2]
%tmp3 = load <4 x float>* %A ; <<4 x float>> [#uses=2]
%tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1]
@@ -47,24 +56,30 @@ define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind
%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
store <4 x float> %tmp13, <4 x float>* %res
ret void
-; CHECK: @test3
-; CHECK: unpcklps
}
define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
+; CHECK-LABEL: test4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; CHECK-NEXT: movaps %xmm0, (%eax)
+; CHECK-NEXT: retl
%tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
store <4 x float> %tmp5, <4 x float>* %res
ret void
-; CHECK: @test4
-; CHECK: pshufd $50, %xmm0, %xmm0
}
define <4 x i32> @test5(i8** %ptr) nounwind {
; CHECK-LABEL: test5:
-; CHECK: pxor
-; CHECK: punpcklbw
-; CHECK: punpcklwd
-
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl (%eax), %eax
+; CHECK-NEXT: movss (%eax), %xmm1
+; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: retl
%tmp = load i8** %ptr ; <i8*> [#uses=1]
%tmp.upgrd.1 = bitcast i8* %tmp to float* ; <float*> [#uses=1]
%tmp.upgrd.2 = load float* %tmp.upgrd.1 ; <float> [#uses=1]
@@ -81,30 +96,39 @@ define <4 x i32> @test5(i8** %ptr) nounwind {
}
define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
- %tmp1 = load <4 x float>* %A ; <<4 x float>> [#uses=1]
- %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp2, <4 x float>* %res
- ret void
-
; CHECK-LABEL: test6:
-; CHECK: movaps (%ecx), %xmm0
-; CHECK: movaps %xmm0, (%eax)
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movaps (%ecx), %xmm0
+; CHECK-NEXT: movaps %xmm0, (%eax)
+; CHECK-NEXT: retl
+ %tmp1 = load <4 x float>* %A ; <<4 x float>> [#uses=1]
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
+ store <4 x float> %tmp2, <4 x float>* %res
+ ret void
}
define void @test7() nounwind {
- bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1]
- shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
- store <4 x float> %2, <4 x float>* null
- ret void
-
; CHECK-LABEL: test7:
-; CHECK: xorps %xmm0, %xmm0
-; CHECK: movaps %xmm0, 0
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movaps %xmm0, 0
+; CHECK-NEXT: retl
+ bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1]
+ shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
+ store <4 x float> %2, <4 x float>* null
+ ret void
}
@x = external global [4 x i32]
define <2 x i64> @test8() nounwind {
+; CHECK-LABEL: test8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl L_x$non_lazy_ptr, %eax
+; CHECK-NEXT: movups (%eax), %xmm0
+; CHECK-NEXT: retl
%tmp = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1]
%tmp3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1) ; <i32> [#uses=1]
%tmp5 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2) ; <i32> [#uses=1]
@@ -115,90 +139,123 @@ define <2 x i64> @test8() nounwind {
%tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1]
%tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
ret <2 x i64> %tmp16
-; CHECK-LABEL: test8:
-; CHECK: movups (%eax), %xmm0
}
define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
+; CHECK-LABEL: test9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movups {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: retl
%tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
ret <4 x float> %tmp13
-; CHECK-LABEL: test9:
-; CHECK: movups 8(%esp), %xmm0
}
define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
+; CHECK-LABEL: test10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: retl
%tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
ret <4 x float> %tmp13
-; CHECK-LABEL: test10:
-; CHECK: movaps 4(%esp), %xmm0
}
define <2 x double> @test11(double %a, double %b) nounwind {
+; CHECK-LABEL: test11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: retl
%tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1]
%tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1]
ret <2 x double> %tmp7
-; CHECK-LABEL: test11:
-; CHECK: movaps 4(%esp), %xmm0
}
define void @test12() nounwind {
- %tmp1 = load <4 x float>* null ; <<4 x float>> [#uses=2]
- %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
- %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
- %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp4, <4 x float>* null
- ret void
; CHECK-LABEL: test12:
-; CHECK: movhlps
-; CHECK: shufps
+; CHECK: ## BB#0:
+; CHECK-NEXT: movapd 0, %xmm0
+; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; CHECK-NEXT: movsd %xmm0, %xmm1
+; CHECK-NEXT: xorpd %xmm2, %xmm2
+; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; CHECK-NEXT: addps %xmm1, %xmm0
+; CHECK-NEXT: movaps %xmm0, 0
+; CHECK-NEXT: retl
+ %tmp1 = load <4 x float>* null ; <<4 x float>> [#uses=2]
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
+ %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
+ %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1]
+ store <4 x float> %tmp4, <4 x float>* null
+ ret void
}
define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
- %tmp3 = load <4 x float>* %B ; <<4 x float>> [#uses=1]
- %tmp5 = load <4 x float>* %C ; <<4 x float>> [#uses=1]
- %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp11, <4 x float>* %res
- ret void
-; CHECK: test13
-; CHECK: shufps $69, (%ecx), %xmm0
-; CHECK: pshufd $-40, %xmm0, %xmm0
+; CHECK-LABEL: test13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movaps (%edx), %xmm0
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; CHECK-NEXT: movaps %xmm0, (%eax)
+; CHECK-NEXT: retl
+ %tmp3 = load <4 x float>* %B ; <<4 x float>> [#uses=1]
+ %tmp5 = load <4 x float>* %C ; <<4 x float>> [#uses=1]
+ %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
+ store <4 x float> %tmp11, <4 x float>* %res
+ ret void
}
define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
- %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=2]
- %tmp5 = load <4 x float>* %x ; <<4 x float>> [#uses=2]
- %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
- %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
- %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp27
; CHECK-LABEL: test14:
-; CHECK: addps [[X1:%xmm[0-9]+]], [[X0:%xmm[0-9]+]]
-; CHECK: subps [[X1]], [[X2:%xmm[0-9]+]]
-; CHECK: movlhps [[X2]], [[X0]]
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movaps (%ecx), %xmm1
+; CHECK-NEXT: movaps (%eax), %xmm2
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: addps %xmm1, %xmm0
+; CHECK-NEXT: subps %xmm1, %xmm2
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT: retl
+ %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=2]
+ %tmp5 = load <4 x float>* %x ; <<4 x float>> [#uses=2]
+ %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
+ %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
+ %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
+ ret <4 x float> %tmp27
}
define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
-entry:
- %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=1]
- %tmp3 = load <4 x float>* %x ; <<4 x float>> [#uses=1]
- %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp4
; CHECK-LABEL: test15:
-; CHECK: movhlps %xmm1, %xmm0
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movapd (%ecx), %xmm0
+; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-NEXT: retl
+entry:
+ %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=1]
+ %tmp3 = load <4 x float>* %x ; <<4 x float>> [#uses=1]
+ %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
+ ret <4 x float> %tmp4
}
; PR8900
-; CHECK-LABEL: test16:
-; CHECK: unpcklpd
-; CHECK: ret
define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
+; CHECK-LABEL: test16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movapd 96(%eax), %xmm0
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: retl
%i5 = getelementptr inbounds <4 x double>* %srcA, i32 3
%i6 = load <4 x double>* %i5, align 32
%i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
@@ -207,6 +264,11 @@ define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocap
; PR9009
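; The shuffle below has constant and undef inputs, so it is expected to fold
; to a single constant-pool load that is stored out directly.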
define fastcc void @test17() nounwind {
+; CHECK-LABEL: test17:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768>
+; CHECK-NEXT: movaps %xmm0, (%eax)
+; CHECK-NEXT: retl
entry:
%0 = insertelement <4 x i32> undef, i32 undef, i32 1
%1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
@@ -217,25 +279,48 @@ entry:
; PR9210
define <4 x float> @f(<4 x double>) nounwind {
+; CHECK-LABEL: f:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1
+; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: retl
entry:
%double2float.i = fptrunc <4 x double> %0 to <4 x float>
ret <4 x float> %double2float.i
}
define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
-; CHECK-LABEL: test_insert_64_zext
-; CHECK-NOT: xor
-; CHECK: movq
+; CHECK-LABEL: test_insert_64_zext:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movq %xmm0, %xmm0
+; CHECK-NEXT: retl
%1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %1
}
define <4 x i32> @PR19721(<4 x i32> %i) {
+; CHECK-LABEL: PR19721:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: movss %xmm1, %xmm0
+; CHECK-NEXT: retl
%bc = bitcast <4 x i32> %i to i128
%insert = and i128 %bc, -4294967296
%bc2 = bitcast i128 %insert to <4 x i32>
ret <4 x i32> %bc2
+}
-; CHECK-LABEL: PR19721
-; CHECK: punpckldq
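+; SSE2 has no 32-bit element multiply, so the v4i32 mul below should lower to
+; two pmuludq ops (even and odd lanes) recombined with shuffles.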
+define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_mul:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; CHECK-NEXT: retl
+ %m = mul <4 x i32> %x, %y
+ ret <4 x i32> %m
}
diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll
index b7706cc..5b2de28 100644
--- a/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
diff --git a/test/CodeGen/X86/sse3-avx-addsub.ll b/test/CodeGen/X86/sse3-avx-addsub.ll
index 8b66743..431588f 100644
--- a/test/CodeGen/X86/sse3-avx-addsub.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK
+; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK
; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX -check-prefix=CHECK
; Test ADDSUB ISel patterns.
@@ -141,156 +141,3 @@ define <2 x double> @test4b(<2 x double> %A, <2 x double>* %B) {
; AVX: vaddsubpd
; CHECK-NEXT: ret
-; Functions below are obtained from the following source:
-;
-; float4 test1(float4 A, float4 B) {
-; float4 X = A + B;
-; float4 Y = A - B;
-; return (float4){X[0], Y[1], X[2], Y[3]};
-; }
-;
-; float8 test2(float8 A, float8 B) {
-; float8 X = A + B;
-; float8 Y = A - B;
-; return (float8){X[0], Y[1], X[2], Y[3], X[4], Y[5], X[6], Y[7]};
-; }
-;
-; double4 test3(double4 A, double4 B) {
-; double4 X = A + B;
-; double4 Y = A - B;
-; return (double4){X[0], Y[1], X[2], Y[3]};
-; }
-;
-; double2 test4(double2 A, double2 B) {
-; double2 X = A + B;
-; double2 Y = A - B;
-; return (double2){X[0], Y[1]};
-; }
-
-define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
- %sub = fsub <4 x float> %A, %B
- %add = fadd <4 x float> %A, %B
- %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x float> %vecinit6
-}
-; CHECK-LABEL: test5
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; CHECK: ret
-
-
-define <8 x float> @test6(<8 x float> %A, <8 x float> %B) {
- %sub = fsub <8 x float> %A, %B
- %add = fadd <8 x float> %A, %B
- %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
- ret <8 x float> %vecinit14
-}
-; CHECK-LABEL: test6
-; SSE: xorps
-; SSE-NEXT: addsubps
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; AVX-NOT: vxorps
-; AVX-NOT: vaddsubps
-; CHECK: ret
-
-
-define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
- %sub = fsub <4 x double> %A, %B
- %add = fadd <4 x double> %A, %B
- %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x double> %vecinit6
-}
-; CHECK-LABEL: test7
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; AVX-NOT: vxorpd
-; AVX-NOT: vaddsubpd
-; CHECK: ret
-
-
-define <2 x double> @test8(<2 x double> %A, <2 x double> %B) #0 {
- %add = fadd <2 x double> %A, %B
- %sub = fsub <2 x double> %A, %B
- %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3>
- ret <2 x double> %vecinit2
-}
-; CHECK-LABEL: test8
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; CHECK: ret
-
-
-define <4 x float> @test5b(<4 x float> %A, <4 x float> %B) {
- %sub = fsub <4 x float> %A, %B
- %add = fadd <4 x float> %B, %A
- %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x float> %vecinit6
-}
-; CHECK-LABEL: test5
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; CHECK: ret
-
-
-define <8 x float> @test6b(<8 x float> %A, <8 x float> %B) {
- %sub = fsub <8 x float> %A, %B
- %add = fadd <8 x float> %B, %A
- %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
- ret <8 x float> %vecinit14
-}
-; CHECK-LABEL: test6
-; SSE: xorps
-; SSE-NEXT: addsubps
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; AVX-NOT: vxorps
-; AVX-NOT: vaddsubps
-; CHECK: ret
-
-
-define <4 x double> @test7b(<4 x double> %A, <4 x double> %B) {
- %sub = fsub <4 x double> %A, %B
- %add = fadd <4 x double> %B, %A
- %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
- ret <4 x double> %vecinit6
-}
-; CHECK-LABEL: test7
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; AVX-NOT: vxorpd
-; AVX-NOT: vaddsubpd
-; CHECK: ret
-
-
-define <2 x double> @test8b(<2 x double> %A, <2 x double> %B) #0 {
- %add = fadd <2 x double> %B, %A
- %sub = fsub <2 x double> %A, %B
- %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3>
- ret <2 x double> %vecinit2
-}
-; CHECK-LABEL: test8
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 18bdcb3..0a5b0ca 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -1,99 +1,120 @@
; These are tests for SSE3 codegen.
-; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 \
-; RUN: | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 | FileCheck %s --check-prefix=X64
; Test for v8i16 lowering where we extract the first element of the vector and
; place it in the second element of the result.
define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {
+; X64-LABEL: t0:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X64-NEXT: movdqa %xmm0, (%rdi)
+; X64-NEXT: retq
entry:
%tmp3 = load <8 x i16>* %old
%tmp6 = shufflevector <8 x i16> %tmp3,
- <8 x i16> < i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
+ <8 x i16> < i16 1, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
<8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
store <8 x i16> %tmp6, <8 x i16>* %dest
ret void
-
-; X64-LABEL: t0:
-; X64: movdqa (%rsi), %xmm0
-; X64: pslldq $2, %xmm0
-; X64: movdqa %xmm0, (%rdi)
-; X64: ret
}
define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+; X64-LABEL: t1:
+; X64: ## BB#0:
+; X64-NEXT: movdqa (%rdi), %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
ret <8 x i16> %tmp3
-; X64-LABEL: t1:
-; X64: movdqa (%rdi), %xmm0
-; X64: pinsrw $0, (%rsi), %xmm0
-; X64: ret
}
define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t2:
+; X64: ## BB#0:
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,3,4,5,6,7]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-NEXT: retq
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
ret <8 x i16> %tmp
-; X64-LABEL: t2:
-; X64: pextrw $1, %xmm1, %eax
-; X64: pinsrw $0, %eax, %xmm0
-; X64: pinsrw $3, %eax, %xmm0
-; X64: ret
}
define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t3:
+; X64: ## BB#0:
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; X64-NEXT: retq
%tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
ret <8 x i16> %tmp
-; X64-LABEL: t3:
-; X64: pextrw $5, %xmm0, %eax
-; X64: pshuflw $44, %xmm0, %xmm0
-; X64: pshufhw $27, %xmm0, %xmm0
-; X64: pinsrw $3, %eax, %xmm0
-; X64: ret
}
define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t4:
+; X64: ## BB#0:
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,4,7]
+; X64-NEXT: retq
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
ret <8 x i16> %tmp
-; X64-LABEL: t4:
-; X64: pextrw $7, [[XMM0:%xmm[0-9]+]], %eax
-; X64: pshufhw $100, [[XMM0]], [[XMM1:%xmm[0-9]+]]
-; X64: pinsrw $1, %eax, [[XMM1]]
-; X64: pextrw $1, [[XMM0]], %eax
-; X64: pinsrw $4, %eax, %xmm{{[0-9]}}
-; X64: ret
}
define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t5:
+; X64: ## BB#0:
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
ret <8 x i16> %tmp
-; X64: t5:
-; X64: movlhps %xmm1, %xmm0
-; X64: pshufd $114, %xmm0, %xmm0
-; X64: ret
}
define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t6:
+; X64: ## BB#0:
+; X64-NEXT: movss %xmm1, %xmm0
+; X64-NEXT: retq
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
ret <8 x i16> %tmp
-; X64: t6:
-; X64: movss %xmm1, %xmm0
-; X64: ret
}
define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t7:
+; X64: ## BB#0:
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
+; X64-NEXT: retq
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >
ret <8 x i16> %tmp
-; X64: t7:
-; X64: pshuflw $-80, %xmm0, %xmm0
-; X64: pshufhw $-56, %xmm0, %xmm0
-; X64: ret
}
define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
+; X64-LABEL: t8:
+; X64: ## BB#0:
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; X64-NEXT: movdqa %xmm0, (%rdi)
+; X64-NEXT: retq
%tmp = load <2 x i64>* %A
%tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>
%tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0
@@ -115,14 +136,15 @@ define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
%tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64>
store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res
ret void
-; X64: t8:
-; X64: pshuflw $-58, (%rsi), %xmm0
-; X64: pshufhw $-58, %xmm0, %xmm0
-; X64: movdqa %xmm0, (%rdi)
-; X64: ret
}
define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
+; X64-LABEL: t9:
+; X64: ## BB#0:
+; X64-NEXT: movapd (%rdi), %xmm0
+; X64-NEXT: movhpd (%rsi), %xmm0
+; X64-NEXT: movapd %xmm0, (%rdi)
+; X64-NEXT: retq
%tmp = load <4 x float>* %r
%tmp.upgrd.3 = bitcast <2 x i32>* %A to double*
%tmp.upgrd.4 = load double* %tmp.upgrd.3
@@ -139,11 +161,6 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3
store <4 x float> %tmp13, <4 x float>* %r
ret void
-; X64: t9:
-; X64: movaps (%rdi), %xmm0
-; X64: movhps (%rsi), %xmm0
-; X64: movaps %xmm0, (%rdi)
-; X64: ret
}
@@ -154,113 +171,121 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
@g1 = external constant <4 x i32>
@g2 = external constant <4 x i16>
-define internal void @t10() nounwind {
- load <4 x i32>* @g1, align 16
- bitcast <4 x i32> %1 to <8 x i16>
- shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >
- bitcast <8 x i16> %3 to <2 x i64>
- extractelement <2 x i64> %4, i32 0
- bitcast i64 %5 to <4 x i16>
- store <4 x i16> %6, <4 x i16>* @g2, align 8
- ret void
-; X64: t10:
-; X64: pextrw $4, [[X0:%xmm[0-9]+]], %e{{..}}
-; X64: pextrw $6, [[X0]], %e{{..}}
-; X64: movlhps [[X0]], [[X0]]
-; X64: pshuflw $8, [[X0]], [[X0]]
-; X64: pinsrw $2, %e{{..}}, [[X0]]
-; X64: pinsrw $3, %e{{..}}, [[X0]]
+define void @t10() nounwind {
+; X64-LABEL: t10:
+; X64: ## BB#0:
+; X64-NEXT: movq _g1@{{.*}}(%rip), %rax
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: movq _g2@{{.*}}(%rip), %rax
+; X64-NEXT: movq %xmm0, (%rax)
+; X64-NEXT: retq
+ load <4 x i32>* @g1, align 16
+ bitcast <4 x i32> %1 to <8 x i16>
+ shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >
+ bitcast <8 x i16> %3 to <2 x i64>
+ extractelement <2 x i64> %4, i32 0
+ bitcast i64 %5 to <4 x i16>
+ store <4 x i16> %6, <4 x i16>* @g2, align 8
+ ret void
}
-
; Pack various elements via shuffles.
define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t11:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; X64-NEXT: retq
entry:
%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
ret <8 x i16> %tmp7
-; X64-LABEL: t11:
-; X64: movd %xmm1, %eax
-; X64: movlhps %xmm0, %xmm0
-; X64: pshuflw $1, %xmm0, %xmm0
-; X64: pinsrw $1, %eax, %xmm0
-; X64: ret
}
-
define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t12:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
+; X64-NEXT: retq
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
ret <8 x i16> %tmp9
-; X64-LABEL: t12:
-; X64: pextrw $3, %xmm1, %eax
-; X64: movlhps %xmm0, %xmm0
-; X64: pshufhw $3, %xmm0, %xmm0
-; X64: pinsrw $5, %eax, %xmm0
-; X64: ret
}
-
define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t13:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
+; X64-NEXT: retq
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
ret <8 x i16> %tmp9
-; X64-LABEL: t13:
-; X64: punpcklqdq %xmm0, %xmm1
-; X64: pextrw $3, %xmm1, %eax
-; X64: pshufhw $12, %xmm1, %xmm0
-; X64: pinsrw $4, %eax, %xmm0
-; X64: ret
}
-
define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t14:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; X64-NEXT: retq
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef >
ret <8 x i16> %tmp9
-; X64-LABEL: t14:
-; X64: punpcklqdq %xmm0, %xmm1
-; X64: pshufhw $8, %xmm1, %xmm0
-; X64: ret
}
-
; FIXME: t15 generates worse code now that the scheduler's 2-address hack is disabled.
define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t15:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; X64-NEXT: retq
entry:
- %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
- ret <8 x i16> %tmp8
-; X64: t15:
-; X64: pextrw $7, %xmm0, %eax
-; X64: punpcklqdq %xmm1, %xmm0
-; X64: pshuflw $-128, %xmm0, %xmm0
-; X64: pinsrw $2, %eax, %xmm0
-; X64: ret
+ %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
+ ret <8 x i16> %tmp8
}
-
; Test for yonah, where we convert a shuffle to pextrw and pinsrw.
define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
+; X64-LABEL: t16:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: packuswb %xmm0, %xmm0
+; X64-NEXT: retq
entry:
- %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
- %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
- ret <16 x i8> %tmp9
-; X64: t16:
-; X64: pextrw $8, %xmm0, %eax
-; X64: pslldq $2, %xmm0
-; X64: pextrw $1, %xmm0, %ecx
-; X64: movzbl %cl, %ecx
-; X64: orl %eax, %ecx
-; X64: pinsrw $1, %ecx, %xmm0
-; X64: ret
+ %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
+ %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
+ ret <16 x i8> %tmp9
}
; rdar://8520311
define <4 x i32> @t17() nounwind {
-entry:
; X64-LABEL: t17:
-; X64: movddup (%rax), %xmm0
+; X64: ## BB#0: ## %entry
+; X64-NEXT: movddup (%rax), %xmm0
+; X64-NEXT: andpd {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+entry:
%tmp1 = load <4 x float>* undef, align 16
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
%tmp3 = load <4 x float>* undef, align 16
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
deleted file mode 100644
index 3a48121..0000000
--- a/test/CodeGen/X86/sse41-blend.ll
+++ /dev/null
@@ -1,140 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
-
-;CHECK-LABEL: vsel_float:
-;CHECK: blendps
-;CHECK: ret
-define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %v1, <4 x float> %v2
- ret <4 x float> %vsel
-}
-
-
-;CHECK-LABEL: vsel_4xi8:
-;CHECK: blendps
-;CHECK: ret
-define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
- ret <4 x i8> %vsel
-}
-
-;CHECK-LABEL: vsel_4xi16:
-;CHECK: blendps
-;CHECK: ret
-define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
- ret <4 x i16> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i32:
-;CHECK: blendps
-;CHECK: ret
-define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2
- ret <4 x i32> %vsel
-}
-
-
-;CHECK-LABEL: vsel_double:
-;CHECK: movsd
-;CHECK: ret
-define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> %v1, <4 x double> %v2
- ret <4 x double> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i64:
-;CHECK: movsd
-;CHECK: ret
-define <4 x i64> @vsel_i64(<4 x i64> %v1, <4 x i64> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %v1, <4 x i64> %v2
- ret <4 x i64> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i8:
-;CHECK: pblendvb
-;CHECK: ret
-define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
- %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
- ret <16 x i8> %vsel
-}
-
-;; TEST blend + compares
-; CHECK: A
-define <2 x double> @A(<2 x double> %x, <2 x double> %y) {
- ; CHECK: cmplepd
- ; CHECK: blendvpd
- %max_is_x = fcmp oge <2 x double> %x, %y
- %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y
- ret <2 x double> %max
-}
-
-; CHECK: B
-define <2 x double> @B(<2 x double> %x, <2 x double> %y) {
- ; CHECK: cmpnlepd
- ; CHECK: blendvpd
- %min_is_x = fcmp ult <2 x double> %x, %y
- %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
- ret <2 x double> %min
-}
-
-; CHECK: float_crash
-define void @float_crash() nounwind {
-entry:
- %merge205vector_func.i = select <4 x i1> undef, <4 x double> undef, <4 x double> undef
- %extract214vector_func.i = extractelement <4 x double> %merge205vector_func.i, i32 0
- store double %extract214vector_func.i, double addrspace(1)* undef, align 8
- ret void
-}
-
-; If we can figure out a blend has a constant mask, we should emit the
-; blend instruction with an immediate mask
-define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
-; In this case, we emit a simple movss
-; CHECK-LABEL: constant_blendvpd
-; CHECK: movsd
-; CHECK: ret
- %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %xy, <2 x double> %ab
- ret <2 x double> %1
-}
-
-define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) {
-; CHECK-LABEL: constant_blendvps
-; CHECK-NOT: mov
-; CHECK: blendps $7
-; CHECK: ret
- %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %xyzw, <4 x float> %abcd
- ret <4 x float> %1
-}
-
-define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
-; CHECK-LABEL: constant_pblendvb:
-; CHECK: movaps
-; CHECK: pblendvb
-; CHECK: ret
- %1 = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %xyzw, <16 x i8> %abcd
- ret <16 x i8> %1
-}
-
-declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
-declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
-
-;; 2 tests for shufflevectors that optimize to blend + immediate
-; CHECK-LABEL: @blend_shufflevector_4xfloat
-; CHECK: blendps $6, %xmm1, %xmm0
-; CHECK: ret
-define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
- %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
- ret <4 x float> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_8xi16
-; CHECK: pblendw $134, %xmm1, %xmm0
-; CHECK: ret
-define <8 x i16> @blend_shufflevector_8xi16(<8 x i16> %a, <8 x i16> %b) {
- %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 15>
- ret <8 x i16> %1
-}
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
new file mode 100644
index 0000000..6fab98e
--- /dev/null
+++ b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.1 | FileCheck %s
+; This test works just like the non-upgrade one, except that it only checks
+; forms that require auto-upgrading.
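+; (In current IR these intrinsics take an i8 immediate; the legacy i32 forms
+; below are rewritten by the auto-upgrader to match the i8 declarations
+; checked in sse41-intrinsics-x86.ll.)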
+
+define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: blendpd
+ %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: blendps
+ %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: dppd
+ %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: dpps
+ %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: insertps
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+
+define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: mpsadbw
+ %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: pblendw
+ %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
+
+
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll
index 37eff43..5f25a16 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -2,18 +2,18 @@
define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK: blendpd
- %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+ %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
-declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
; CHECK: blendps
- %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
@@ -34,35 +34,35 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
; CHECK: dppd
- %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+ %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
-declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
; CHECK: dpps
- %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
; CHECK: insertps
- %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: mpsadbw
- %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+ %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
-declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
@@ -83,10 +83,10 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun
define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK: pblendw
- %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+ %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
-declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 6726a3e..d5c6f74 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1,30 +1,47 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64
@g16 = external global i16
define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
- %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
- ret <4 x i32> %tmp1
; X32-LABEL: pinsrd_1:
-; X32: pinsrd $1, 4(%esp), %xmm0
-
+; X32: ## BB#0:
+; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
; X64-LABEL: pinsrd_1:
-; X64: pinsrd $1, %edi, %xmm0
+; X64: ## BB#0:
+; X64-NEXT: pinsrd $1, %edi, %xmm0
+; X64-NEXT: retq
+ %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
+ ret <4 x i32> %tmp1
}
define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
- %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
- ret <16 x i8> %tmp1
; X32-LABEL: pinsrb_1:
-; X32: pinsrb $1, 4(%esp), %xmm0
-
+; X32: ## BB#0:
+; X32-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
; X64-LABEL: pinsrb_1:
-; X64: pinsrb $1, %edi, %xmm0
+; X64: ## BB#0:
+; X64-NEXT: pinsrb $1, %edi, %xmm0
+; X64-NEXT: retq
+ %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
+ ret <16 x i8> %tmp1
}
-
define <2 x i64> @pmovsxbd_1(i32* %p) nounwind {
+; X32-LABEL: pmovsxbd_1:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pmovsxbd (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: pmovsxbd_1:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: pmovsxbd (%rdi), %xmm0
+; X64-NEXT: retq
entry:
%0 = load i32* %p, align 4
%1 = insertelement <4 x i32> undef, i32 %0, i32 0
@@ -35,16 +52,19 @@ entry:
%6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone
%7 = bitcast <4 x i32> %6 to <2 x i64>
ret <2 x i64> %7
-
-; X32: _pmovsxbd_1:
-; X32: movl 4(%esp), %eax
-; X32: pmovsxbd (%eax), %xmm0
-
-; X64: _pmovsxbd_1:
-; X64: pmovsxbd (%rdi), %xmm0
}
define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly {
+; X32-LABEL: pmovsxwd_1:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pmovsxwd (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: pmovsxwd_1:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: pmovsxwd (%rdi), %xmm0
+; X64-NEXT: retq
entry:
%0 = load i64* %p ; <i64> [#uses=1]
%tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1]
@@ -52,63 +72,59 @@ entry:
%2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1]
%3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1]
ret <2 x i64> %3
-
-; X32: _pmovsxwd_1:
-; X32: movl 4(%esp), %eax
-; X32: pmovsxwd (%eax), %xmm0
-
-; X64: _pmovsxwd_1:
-; X64: pmovsxwd (%rdi), %xmm0
}
-
-
-
define <2 x i64> @pmovzxbq_1() nounwind {
+; X32-LABEL: pmovzxbq_1:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl L_g16$non_lazy_ptr, %eax
+; X32-NEXT: pmovzxbq (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: pmovzxbq_1:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: movq _g16@{{.*}}(%rip), %rax
+; X64-NEXT: pmovzxbq (%rax), %xmm0
+; X64-NEXT: retq
entry:
%0 = load i16* @g16, align 2 ; <i16> [#uses=1]
%1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
%2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
ret <2 x i64> %3
-
-; X32: _pmovzxbq_1:
-; X32: movl L_g16$non_lazy_ptr, %eax
-; X32: pmovzxbq (%eax), %xmm0
-
-; X64: _pmovzxbq_1:
-; X64: movq _g16@GOTPCREL(%rip), %rax
-; X64: pmovzxbq (%rax), %xmm0
}
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
-
-
-
define i32 @extractps_1(<4 x float> %v) nounwind {
+; X32-LABEL: extractps_1:
+; X32: ## BB#0:
+; X32-NEXT: extractps $3, %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: extractps_1:
+; X64: ## BB#0:
+; X64-NEXT: extractps $3, %xmm0, %eax
+; X64-NEXT: retq
%s = extractelement <4 x float> %v, i32 3
%i = bitcast float %s to i32
ret i32 %i
-
-; X32: _extractps_1:
-; X32: extractps $3, %xmm0, %eax
-
-; X64: _extractps_1:
-; X64: extractps $3, %xmm0, %eax
}
define i32 @extractps_2(<4 x float> %v) nounwind {
+; X32-LABEL: extractps_2:
+; X32: ## BB#0:
+; X32-NEXT: extractps $3, %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: extractps_2:
+; X64: ## BB#0:
+; X64-NEXT: extractps $3, %xmm0, %eax
+; X64-NEXT: retq
%t = bitcast <4 x float> %v to <4 x i32>
%s = extractelement <4 x i32> %t, i32 3
ret i32 %s
-
-; X32: _extractps_2:
-; X32: extractps $3, %xmm0, %eax
-
-; X64: _extractps_2:
-; X64: extractps $3, %xmm0, %eax
}
@@ -117,106 +133,152 @@ define i32 @extractps_2(<4 x float> %v) nounwind {
; is bitcasted to i32, but unsuitable for much of anything else.
define float @ext_1(<4 x float> %v) nounwind {
+; X32-LABEL: ext_1:
+; X32: ## BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-NEXT: addss LCPI7_0, %xmm0
+; X32-NEXT: movss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ext_1:
+; X64: ## BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-NEXT: addss {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
%s = extractelement <4 x float> %v, i32 3
%t = fadd float %s, 1.0
ret float %t
-
-; X32: _ext_1:
-; X32: pshufd $3, %xmm0, %xmm0
-; X32: addss LCPI7_0, %xmm0
-
-; X64: _ext_1:
-; X64: pshufd $3, %xmm0, %xmm0
-; X64: addss LCPI7_0(%rip), %xmm0
}
define float @ext_2(<4 x float> %v) nounwind {
+; X32-LABEL: ext_2:
+; X32: ## BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-NEXT: movss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ext_2:
+; X64: ## BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-NEXT: retq
%s = extractelement <4 x float> %v, i32 3
ret float %s
-
-; X32: _ext_2:
-; X32: pshufd $3, %xmm0, %xmm0
-
-; X64: _ext_2:
-; X64: pshufd $3, %xmm0, %xmm0
}
define i32 @ext_3(<4 x i32> %v) nounwind {
+; X32-LABEL: ext_3:
+; X32: ## BB#0:
+; X32-NEXT: pextrd $3, %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ext_3:
+; X64: ## BB#0:
+; X64-NEXT: pextrd $3, %xmm0, %eax
+; X64-NEXT: retq
%i = extractelement <4 x i32> %v, i32 3
ret i32 %i
-
-; X32: _ext_3:
-; X32: pextrd $3, %xmm0, %eax
-
-; X64: _ext_3:
-; X64: pextrd $3, %xmm0, %eax
}
define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
- %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
- ret <4 x float> %tmp1
-; X32: _insertps_1:
-; X32: insertps $1, %xmm1, %xmm0
-
-; X64: _insertps_1:
-; X64: insertps $1, %xmm1, %xmm0
+; X32-LABEL: insertps_1:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_1:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
+; X64-NEXT: retq
+ %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
+ ret <4 x float> %tmp1
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
- %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
- ret <4 x float> %tmp1
-; X32: _insertps_2:
-; X32: insertps $0, 4(%esp), %xmm0
-
-; X64: _insertps_2:
-; X64: insertps $0, %xmm1, %xmm0
+; X32-LABEL: insertps_2:
+; X32: ## BB#0:
+; X32-NEXT: insertps $0, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_2:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
+ ret <4 x float> %tmp1
}
-
define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
- %tmp2 = extractelement <4 x float> %t2, i32 0
- %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
- ret <4 x float> %tmp1
-; X32: _insertps_3:
-; X32: insertps $0, %xmm1, %xmm0
-
-; X64: _insertps_3:
-; X64: insertps $0, %xmm1, %xmm0
+; X32-LABEL: insertps_3:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_3:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %tmp2 = extractelement <4 x float> %t2, i32 0
+ %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
+ ret <4 x float> %tmp1
}
define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
- %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
- ret i32 %tmp1
-; X32: _ptestz_1:
-; X32: ptest %xmm1, %xmm0
-; X32: sete %al
-
-; X64: _ptestz_1:
-; X64: ptest %xmm1, %xmm0
-; X64: sete %al
+; X32-LABEL: ptestz_1:
+; X32: ## BB#0:
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: sete %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ptestz_1:
+; X64: ## BB#0:
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
+ ret i32 %tmp1
}
define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
- %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
- ret i32 %tmp1
-; X32: _ptestz_2:
-; X32: ptest %xmm1, %xmm0
-; X32: sbbl %eax
-
-; X64: _ptestz_2:
-; X64: ptest %xmm1, %xmm0
-; X64: sbbl %eax
+; X32-LABEL: ptestz_2:
+; X32: ## BB#0:
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ptestz_2:
+; X64: ## BB#0:
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
+ ret i32 %tmp1
}
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
- %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
- ret i32 %tmp1
-; X32: _ptestz_3:
-; X32: ptest %xmm1, %xmm0
-; X32: seta %al
-
-; X64: _ptestz_3:
-; X64: ptest %xmm1, %xmm0
-; X64: seta %al
+; X32-LABEL: ptestz_3:
+; X32: ## BB#0:
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ptestz_3:
+; X64: ## BB#0:
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
+ ret i32 %tmp1
}
@@ -227,6 +289,25 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
; This used to compile to insertps $0 + insertps $16. insertps $0 is always
; pointless.
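; (For the register form, insertps with immediate 0 simply copies element 0 of
; the source over element 0 of the destination, which a plain movss already does.)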
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
+; X32-LABEL: buildvector:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movaps %xmm0, %xmm2
+; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X32-NEXT: addss %xmm1, %xmm0
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X32-NEXT: addss %xmm2, %xmm1
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: buildvector:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: movaps %xmm0, %xmm2
+; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X64-NEXT: addss %xmm1, %xmm0
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT: addss %xmm2, %xmm1
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X64-NEXT: retq
entry:
%tmp7 = extractelement <2 x float> %A, i32 0
%tmp5 = extractelement <2 x float> %A, i32 1
@@ -237,97 +318,124 @@ entry:
%tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
%tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
ret <2 x float> %tmp9
-; X32-LABEL: buildvector:
-; X32-NOT: insertps $0
-; X32: insertps $16
-; X32-NOT: insertps $0
-; X32: ret
-; X64-LABEL: buildvector:
-; X64-NOT: insertps $0
-; X64: insertps $16
-; X64-NOT: insertps $0
-; X64: ret
}
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; X32-LABEL: insertps_from_shufflevector_1:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: insertps $48, (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_from_shufflevector_1:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: insertps $48, (%rdi), %xmm0
+; X64-NEXT: retq
entry:
%0 = load <4 x float>* %pb, align 16
%vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x float> %vecinit6
-; CHECK-LABEL: insertps_from_shufflevector_1:
-; CHECK-NOT: movss
-; CHECK-NOT: shufps
-; CHECK: insertps $48,
-; CHECK: ret
}
define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
+; X32-LABEL: insertps_from_shufflevector_2:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_from_shufflevector_2:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X64-NEXT: retq
entry:
%vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
ret <4 x float> %vecinit6
-; CHECK-LABEL: insertps_from_shufflevector_2:
-; CHECK-NOT: shufps
-; CHECK: insertps $96,
-; CHECK: ret
}
; When loading an i32 from memory into an xmm register, we use pinsrd
; instead of insertps.
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
+; X32-LABEL: pinsrd_from_shufflevector_i32:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: insertps $48, (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: pinsrd_from_shufflevector_i32:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: insertps $48, (%rdi), %xmm0
+; X64-NEXT: retq
entry:
%0 = load <4 x i32>* %pb, align 16
%vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %vecinit6
-; CHECK-LABEL: pinsrd_from_shufflevector_i32:
-; CHECK-NOT: movss
-; CHECK-NOT: shufps
-; CHECK: pinsrd $3,
-; CHECK: ret
}
define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
+; X32-LABEL: insertps_from_shufflevector_i32_2:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[3],xmm0[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_from_shufflevector_i32_2:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[3],xmm0[2,3]
+; X64-NEXT: retq
entry:
%vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
ret <4 x i32> %vecinit6
-; CHECK-LABEL: insertps_from_shufflevector_i32_2:
-; CHECK-NOT: shufps
-; CHECK-NOT: movaps
-; CHECK: insertps $208,
-; CHECK: ret
}
define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
-; CHECK-LABEL: insertps_from_load_ins_elt_undef:
-; CHECK-NOT: movss
-; CHECK-NOT: shufps
-; CHECK: insertps $16,
-; CHECK: ret
+; X32-LABEL: insertps_from_load_ins_elt_undef:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: insertps $16, (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_from_load_ins_elt_undef:
+; X64: ## BB#0:
+; X64-NEXT: insertps $16, (%rdi), %xmm0
+; X64-NEXT: retq
%1 = load float* %b, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
ret <4 x float> %result
}
-define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
-; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
; TODO: As with pinsrd_from_shufflevector_i32, remove this extra mov instruction.
-;; aCHECK-NOT: movd
-; CHECK-NOT: shufps
-; CHECK: insertps $32,
-; CHECK: ret
+define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
+; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd (%eax), %xmm1
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
+; X64: ## BB#0:
+; X64-NEXT: movd (%rdi), %xmm1
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X64-NEXT: retq
%1 = load i32* %b, align 4
%2 = insertelement <4 x i32> undef, i32 %1, i32 0
%result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
ret <4 x i32> %result
}
-;;;;;; Shuffles optimizable with a single insertps instruction
+;;;;;; Shuffles optimizable with a single insertps or blend instruction
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XYZ0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps $8
-; CHECK: ret
+; X32-LABEL: shuf_XYZ0:
+; X32: ## BB#0:
+; X32-NEXT: xorps %xmm1, %xmm1
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: shuf_XYZ0:
+; X64: ## BB#0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
%vecext1 = extractelement <4 x float> %x, i32 1
@@ -339,11 +447,15 @@ define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
}
define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XY00:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps $12
-; CHECK: ret
+; X32-LABEL: shuf_XY00:
+; X32: ## BB#0:
+; X32-NEXT: movq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: shuf_XY00:
+; X64: ## BB#0:
+; X64-NEXT: movq %xmm0, %xmm0
+; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
%vecext1 = extractelement <4 x float> %x, i32 1
@@ -354,11 +466,15 @@ define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
}
define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XYY0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps $104
-; CHECK: ret
+; X32-LABEL: shuf_XYY0:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: shuf_XYY0:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
%vecext1 = extractelement <4 x float> %x, i32 1
@@ -369,9 +485,15 @@ define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
}
define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XYW0:
-; CHECK: insertps $232
-; CHECK: ret
+; X32-LABEL: shuf_XYW0:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: shuf_XYW0:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
%vecext1 = extractelement <4 x float> %x, i32 1
@@ -383,11 +505,15 @@ define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
}
define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_W00W:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps $198
-; CHECK: ret
+; X32-LABEL: shuf_W00W:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: shuf_W00W:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 3
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
%vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -397,11 +523,19 @@ define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
}
define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_X00A:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps $48
-; CHECK: ret
+; X32-LABEL: shuf_X00A:
+; X32: ## BB#0:
+; X32-NEXT: xorps %xmm2, %xmm2
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: shuf_X00A:
+; X64: ## BB#0:
+; X64-NEXT: xorps %xmm2, %xmm2
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
+; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
%vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -411,11 +545,21 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
}
define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_X00X:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps $48
-; CHECK: ret
+; X32-LABEL: shuf_X00X:
+; X32: ## BB#0:
+; X32-NEXT: xorps %xmm1, %xmm1
+; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0]
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: shuf_X00X:
+; X64: ## BB#0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0]
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
%vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -425,12 +569,23 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
}
define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_X0YC:
-; CHECK: shufps
-; CHECK-NOT: movhlps
-; CHECK-NOT: shufps
-; CHECK: insertps $176
-; CHECK: ret
+; X32-LABEL: shuf_X0YC:
+; X32: ## BB#0:
+; X32-NEXT: xorps %xmm2, %xmm2
+; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero
+; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X32-NEXT: movaps %xmm2, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: shuf_X0YC:
+; X64: ## BB#0:
+; X64-NEXT: xorps %xmm2, %xmm2
+; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero
+; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X64-NEXT: movaps %xmm2, %xmm0
+; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
%vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -440,11 +595,17 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
}
define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XYZ0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps $8
-; CHECK: ret
+; X32-LABEL: i32_shuf_XYZ0:
+; X32: ## BB#0:
+; X32-NEXT: pxor %xmm1, %xmm1
+; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: i32_shuf_XYZ0:
+; X64: ## BB#0:
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; X64-NEXT: retq
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
%vecext1 = extractelement <4 x i32> %x, i32 1
@@ -456,11 +617,15 @@ define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
}
define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XY00:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps $12
-; CHECK: ret
+; X32-LABEL: i32_shuf_XY00:
+; X32: ## BB#0:
+; X32-NEXT: movq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: i32_shuf_XY00:
+; X64: ## BB#0:
+; X64-NEXT: movq %xmm0, %xmm0
+; X64-NEXT: retq
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
%vecext1 = extractelement <4 x i32> %x, i32 1
@@ -471,11 +636,15 @@ define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
}
define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XYY0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps $104
-; CHECK: ret
+; X32-LABEL: i32_shuf_XYY0:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: i32_shuf_XYY0:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X64-NEXT: retq
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
%vecext1 = extractelement <4 x i32> %x, i32 1
@@ -486,11 +655,15 @@ define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
}
define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XYW0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps $232
-; CHECK: ret
+; X32-LABEL: i32_shuf_XYW0:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: i32_shuf_XYW0:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X64-NEXT: retq
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
%vecext1 = extractelement <4 x i32> %x, i32 1
@@ -502,11 +675,15 @@ define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
}
define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_W00W:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps $198
-; CHECK: ret
+; X32-LABEL: i32_shuf_W00W:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: i32_shuf_W00W:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X64-NEXT: retq
%vecext = extractelement <4 x i32> %x, i32 3
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
%vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -516,11 +693,19 @@ define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
}
define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_X00A:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps $48
-; CHECK: ret
+; X32-LABEL: i32_shuf_X00A:
+; X32: ## BB#0:
+; X32-NEXT: xorps %xmm2, %xmm2
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: i32_shuf_X00A:
+; X64: ## BB#0:
+; X64-NEXT: xorps %xmm2, %xmm2
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT: retq
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
%vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -530,11 +715,21 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
}
define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_X00X:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps $48
-; CHECK: ret
+; X32-LABEL: i32_shuf_X00X:
+; X32: ## BB#0:
+; X32-NEXT: xorps %xmm1, %xmm1
+; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: i32_shuf_X00X:
+; X64: ## BB#0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
%vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -544,12 +739,23 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
}
define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_X0YC:
-; CHECK: shufps
-; CHECK-NOT: movhlps
-; CHECK-NOT: shufps
-; CHECK: insertps $176
-; CHECK: ret
+; X32-LABEL: i32_shuf_X0YC:
+; X32: ## BB#0:
+; X32-NEXT: xorps %xmm2, %xmm2
+; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero
+; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X32-NEXT: movaps %xmm2, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: i32_shuf_X0YC:
+; X64: ## BB#0:
+; X64-NEXT: xorps %xmm2, %xmm2
+; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero
+; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X64-NEXT: movaps %xmm2, %xmm0
+; X64-NEXT: retq
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
%vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -560,11 +766,19 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
;; Test for a bug in the first implementation of LowerBuildVectorv4x32
define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
-; CHECK-LABEL: test_insertps_no_undef:
-; CHECK: movaps %xmm0, %xmm1
-; CHECK-NEXT: insertps $8, %xmm1, %xmm1
-; CHECK-NEXT: maxps %xmm1, %xmm0
-; CHECK-NEXT: ret
+; X32-LABEL: test_insertps_no_undef:
+; X32: ## BB#0:
+; X32-NEXT: xorps %xmm1, %xmm1
+; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
+; X32-NEXT: maxps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_insertps_no_undef:
+; X64: ## BB#0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
+; X64-NEXT: maxps %xmm1, %xmm0
+; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
%vecext1 = extractelement <4 x float> %x, i32 1
@@ -578,48 +792,75 @@ define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
}
define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: blendvb_fallback
-; CHECK: blendvb
-; CHECK: ret
+; X32-LABEL: blendvb_fallback:
+; X32: ## BB#0:
+; X32-NEXT: psllw $15, %xmm0
+; X32-NEXT: psraw $15, %xmm0
+; X32-NEXT: pblendvb %xmm1, %xmm2
+; X32-NEXT: movdqa %xmm2, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: blendvb_fallback:
+; X64: ## BB#0:
+; X64-NEXT: psllw $15, %xmm0
+; X64-NEXT: psraw $15, %xmm0
+; X64-NEXT: pblendvb %xmm1, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: retq
%ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
ret <8 x i16> %ret
}
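; How the fallback works (an explanatory sketch, not something the test
; checks directly): pblendvb selects each byte from its second source when
; the MSB of the corresponding byte of the implicit %xmm0 mask is set. The
; <8 x i1> mask arrives as the low bit of each i16 lane, so
;   psllw $15, %xmm0   ; move the mask bit into each lane's sign bit
;   psraw $15, %xmm0   ; arithmetic shift smears it across the lane
; turn every i1 into an all-ones or all-zeros lane whose bytes all carry
; the right MSB for pblendvb.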
-define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; CHECK-LABEL: insertps_from_vector_load:
; On X32, account for the argument being moved into a register.
-; X32: movl 4(%esp), %eax
-; CHECK-NOT: mov
-; CHECK: insertps $48
-; CHECK-NEXT: ret
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; X32-LABEL: insertps_from_vector_load:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: insertps $48, (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_from_vector_load:
+; X64: ## BB#0:
+; X64-NEXT: insertps $48, (%rdi), %xmm0
+; X64-NEXT: retq
%1 = load <4 x float>* %pb, align 16
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
ret <4 x float> %2
}
;; Use a non-zero CountS for insertps
-define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; CHECK-LABEL: insertps_from_vector_load_offset:
-; On X32, account for the argument's move to registers
-; X32: movl 4(%esp), %eax
-; CHECK-NOT: mov
;; Try to match a bit more of the instruction, since we need the load's offset.
-; CHECK: insertps $96, 4(%{{...}}), %
-; CHECK-NEXT: ret
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; X32-LABEL: insertps_from_vector_load_offset:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: insertps $96, 4(%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_from_vector_load_offset:
+; X64: ## BB#0:
+; X64-NEXT: insertps $96, 4(%rdi), %xmm0
+; X64-NEXT: retq
%1 = load <4 x float>* %pb, align 16
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
ret <4 x float> %2
}
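; Why the displacement matters (illustrative decode): $96 = 0x60 gives
; COUNT_S = 1 and COUNT_D = 2. When the full-vector load is folded into
; insertps as a scalar load, element COUNT_S sits at byte offset
; 4 * COUNT_S = 4, which is the 4(%eax)/4(%rdi) the test matches. For the
; memory form the CPU ignores COUNT_S, so leaving $96 unchanged is
; harmless.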
-define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
-; CHECK-LABEL: insertps_from_vector_load_offset_2:
-; On X32, account for the argument's move to registers
-; X32: movl 4(%esp), %eax
-; X32: movl 8(%esp), %ecx
-; CHECK-NOT: mov
;; Try to match a bit more of the instruction, since we need the load's offset.
-; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
-; CHECK-NEXT: ret
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; X32-LABEL: insertps_from_vector_load_offset_2:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: shll $4, %ecx
+; X32-NEXT: insertps $-64, 12(%eax,%ecx), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_from_vector_load_offset_2:
+; X64: ## BB#0:
+; X64-NEXT: shlq $4, %rsi
+; X64-NEXT: insertps $-64, 12(%rdi,%rsi), %xmm0
+; X64-NEXT: retq
%1 = getelementptr inbounds <4 x float>* %pb, i64 %index
%2 = load <4 x float>* %1, align 16
%3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
@@ -627,13 +868,21 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
}
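; The same decode covers this variant: i32 192 = 0xC0 means COUNT_S = 3
; and COUNT_D = 0, and 192 prints as the signed byte $-64. The index is
; scaled by the 16-byte vector size (shll/shlq $4), and element 3 adds the
; 12-byte displacement, giving the 12(%base,%index) address above.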
define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
-; CHECK-LABEL: insertps_from_broadcast_loadf32:
-; On X32, account for the arguments' move to registers
-; X32: movl 8(%esp), %eax
-; X32: movl 4(%esp), %ecx
-; CHECK-NOT: mov
-; CHECK: insertps $48
-; CHECK-NEXT: ret
+; X32-LABEL: insertps_from_broadcast_loadf32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movss (%ecx,%eax,4), %xmm1
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_from_broadcast_loadf32:
+; X64: ## BB#0:
+; X64-NEXT: movss (%rdi,%rsi,4), %xmm1
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT: retq
%1 = getelementptr inbounds float* %fb, i64 %index
%2 = load float* %1, align 4
%3 = insertelement <4 x float> undef, float %2, i32 0
@@ -645,12 +894,20 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
}
define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
-; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
-; On X32, account for the arguments' move to registers
-; X32: movl 4(%esp), %{{...}}
-; CHECK-NOT: mov
-; CHECK: insertps $48
-; CHECK-NEXT: ret
+; X32-LABEL: insertps_from_broadcast_loadv4f32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups (%eax), %xmm1
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_from_broadcast_loadv4f32:
+; X64: ## BB#0:
+; X64-NEXT: movups (%rdi), %xmm1
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT: retq
%1 = load <4 x float>* %b, align 4
%2 = extractelement <4 x float> %1, i32 0
%3 = insertelement <4 x float> undef, float %2, i32 0
@@ -663,20 +920,33 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
-; CHECK-LABEL: insertps_from_broadcast_multiple_use:
-; On X32, account for the arguments' move to registers
-; X32: movl 8(%esp), %eax
-; X32: movl 4(%esp), %ecx
-; CHECK: movss
-; CHECK-NOT: mov
-; CHECK: insertps $48
-; CHECK: insertps $48
-; CHECK: insertps $48
-; CHECK: insertps $48
-; CHECK: addps
-; CHECK: addps
-; CHECK: addps
-; CHECK-NEXT: ret
+; X32-LABEL: insertps_from_broadcast_multiple_use:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movss (%ecx,%eax,4), %xmm4
+; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
+; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; X32-NEXT: addps %xmm1, %xmm0
+; X32-NEXT: addps %xmm2, %xmm3
+; X32-NEXT: addps %xmm3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_from_broadcast_multiple_use:
+; X64: ## BB#0:
+; X64-NEXT: movss (%rdi,%rsi,4), %xmm4
+; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
+; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; X64-NEXT: addps %xmm1, %xmm0
+; X64-NEXT: addps %xmm2, %xmm3
+; X64-NEXT: addps %xmm3, %xmm0
+; X64-NEXT: retq
%1 = getelementptr inbounds float* %fb, i64 %index
%2 = load float* %1, align 4
%3 = insertelement <4 x float> undef, float %2, i32 0
@@ -694,10 +964,20 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
}
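; On the FIXME above (an observation rather than a checked property): all
; four insertps uses read only xmm4[0], which the movss load already
; provides, so the shufps splat of xmm4 is dead work; removing it is the
; improvement the FIXME asks for.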
define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
-; CHECK-LABEL: insertps_with_undefs:
-; CHECK-NOT: shufps
-; CHECK: insertps $32, %xmm0
-; CHECK: ret
+; X32-LABEL: insertps_with_undefs:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss (%eax), %xmm1
+; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3]
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_with_undefs:
+; X64: ## BB#0:
+; X64-NEXT: movss (%rdi), %xmm1
+; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3]
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
%1 = load float* %b, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
@@ -707,10 +987,162 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
; the destination index to change the load, instead of the source index.
define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
-; CHECK-LABEL: pr20087:
-; CHECK: insertps $48
-; CHECK: ret
+; X32-LABEL: pr20087:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: insertps $-78, 8(%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: pr20087:
+; X64: ## BB#0:
+; X64-NEXT: insertps $-78, 8(%rdi), %xmm0
+; X64-NEXT: retq
%load = load <4 x float> *%ptr
%ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
ret <4 x float> %ret
}
+
+; Edge case for insertps where we end up with a shuffle whose mask is <0, 7, -1, -1>.
+define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
+; X32-LABEL: insertps_pr20411:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; X32-NEXT: insertps $-36, LCPI49_1+12, %xmm0
+; X32-NEXT: movups %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_pr20411:
+; X64: ## BB#0:
+; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; X64-NEXT: insertps $-36, LCPI49_1+{{.*}}(%rip), %xmm0
+; X64-NEXT: movups %xmm0, (%rdi)
+; X64-NEXT: retq
+ %gather_load = shufflevector <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %shuffle109 = shufflevector <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; 4 5 6 7
+ %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; 3 x x x
+ %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 4, i32 3, i32 undef, i32 undef> ; 3 7 x x
+ %ptrcast = bitcast i32* %RET to <4 x i32>*
+ store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
+ ret void
+}
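+; Decoding the immediate (illustrative): $-36 is 0xDC = 0b11011100, i.e.
+; COUNT_D = 1 with a zero mask covering lanes 2-3, and the +12 displacement
+; selects lane 3 (the value 7) of the constant-pool vector. Lane 0 comes
+; from the pshufd, lane 1 receives the 7, and the zeroed upper lanes are
+; the undef elements of the <0, 7, -1, -1> mask.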
+
+define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_4:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_4:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; X64-NEXT: retq
+entry:
+ %vecext = extractelement <4 x float> %A, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+ %vecext2 = extractelement <4 x float> %B, i32 2
+ %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_5:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_5:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
+; X64-NEXT: retq
+entry:
+ %vecext = extractelement <4 x float> %A, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %B, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_6:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_6:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
+; X64-NEXT: retq
+entry:
+ %vecext = extractelement <4 x float> %A, i32 1
+ %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
+ %vecext1 = extractelement <4 x float> %B, i32 2
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
+ ret <4 x float> %vecinit3
+}
+
+define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_7:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_7:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
+; X64-NEXT: retq
+entry:
+ %vecext = extractelement <4 x float> %A, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+ %vecext2 = extractelement <4 x float> %B, i32 1
+ %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_8:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_8:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; X64-NEXT: retq
+entry:
+ %vecext = extractelement <4 x float> %A, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %B, i32 0
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_9:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_9:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+entry:
+ %vecext = extractelement <4 x float> %A, i32 0
+ %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
+ %vecext1 = extractelement <4 x float> %B, i32 2
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
+ ret <4 x float> %vecinit3
+}
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index cf88ade..cf0f999 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -10,88 +10,88 @@
; Function Attrs: nounwind sspreq
define i32 @_Z18read_response_sizev() #0 {
entry:
- tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23), !dbg !39
+ tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !39
%0 = load i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0), align 8, !dbg !40
- tail call void @llvm.dbg.value(metadata !63, i64 0, metadata !64), !dbg !71
+ tail call void @llvm.dbg.value(metadata !63, i64 0, metadata !64, metadata !{metadata !"0x102"}), !dbg !71
%1 = trunc i64 %0 to i32
ret i32 %1
}
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
attributes #0 = { sspreq }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!21, !72}
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \001\00\000\00\001", metadata !1, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus]
!1 = metadata !{metadata !"<unknown>", metadata !"/Users/matt/ryan_bug"}
!2 = metadata !{metadata !3}
-!3 = metadata !{i32 786436, metadata !1, metadata !4, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 19, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
+!3 = metadata !{metadata !"0x4\00\0020\0032\0032\000\000\000", metadata !1, metadata !4, null, metadata !6, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00C\0019\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
!5 = metadata !{}
!6 = metadata !{metadata !7}
-!7 = metadata !{i32 786472, metadata !"max_frame_size", i64 0} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
+!7 = metadata !{metadata !"0x28\00max_frame_size\000"} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
!8 = metadata !{metadata !9, metadata !24, metadata !41, metadata !65}
-!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"read_response_size", metadata !"read_response_size", metadata !"_Z18read_response_sizev", i32 27, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_Z18read_response_sizev, null, null, metadata !14, i32 27} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
-!10 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x2e\00read_response_size\00read_response_size\00_Z18read_response_sizev\0027\000\001\000\006\00256\001\0027", metadata !1, metadata !10, metadata !11, null, i32 ()* @_Z18read_response_sizev, null, null, metadata !14} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
+!10 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!12 = metadata !{metadata !13}
-!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
!14 = metadata !{metadata !15, metadata !19}
-!15 = metadata !{i32 786688, metadata !9, metadata !"b", metadata !10, i32 28, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 28]
-!16 = metadata !{i32 786451, metadata !1, null, metadata !"B", i32 16, i64 32, i64 32, i32 0, i32 0, null, metadata !17, i32 0, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ]
+!15 = metadata !{metadata !"0x100\00b\0028\000", metadata !9, metadata !10, metadata !16} ; [ DW_TAG_auto_variable ] [b] [line 28]
+!16 = metadata !{metadata !"0x13\00B\0016\0032\0032\000\000\000", metadata !1, null, null, metadata !17, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ]
!17 = metadata !{metadata !18}
-!18 = metadata !{i32 786445, metadata !1, metadata !16, metadata !"end_of_file", i32 17, i64 32, i64 32, i64 0, i32 0, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int]
-!19 = metadata !{i32 786688, metadata !9, metadata !"c", metadata !10, i32 29, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [c] [line 29]
+!18 = metadata !{metadata !"0xd\00end_of_file\0017\0032\0032\000\000", metadata !1, metadata !16, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int]
+!19 = metadata !{metadata !"0x100\00c\0029\000", metadata !9, metadata !10, metadata !13} ; [ DW_TAG_auto_variable ] [c] [line 29]
!20 = metadata !{}
!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
!22 = metadata !{i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0)}
-!23 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12]
-!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long>", metadata !"min<unsigned long long>", metadata !"_ZN3__13minIyEERKT_S3_RS1_", i32 12, metadata !27, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !33, null, metadata !35, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>]
-!25 = metadata !{i32 786489, metadata !26, null, metadata !"__1", i32 1} ; [ DW_TAG_namespace ] [__1] [line 1]
+!23 = metadata !{metadata !"0x101\00p2\0033554444\000", metadata !24, metadata !10, metadata !32, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!24 = metadata !{metadata !"0x2e\00min<unsigned long long>\00min<unsigned long long>\00_ZN3__13minIyEERKT_S3_RS1_\0012\000\001\000\006\00256\001\0012", metadata !1, metadata !25, metadata !27, null, null, metadata !33, null, metadata !35} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>]
+!25 = metadata !{metadata !"0x39\00__1\001", metadata !26, null} ; [ DW_TAG_namespace ] [__1] [line 1]
!26 = metadata !{metadata !"main.cpp", metadata !"/Users/matt/ryan_bug"}
-!27 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!28 = metadata !{metadata !29, metadata !29, metadata !32}
-!29 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!30 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
-!31 = metadata !{i32 786468, null, null, metadata !"long long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!32 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!29 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!31 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!32 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
!33 = metadata !{metadata !34}
-!34 = metadata !{i32 786479, null, metadata !"_Tp", metadata !31, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!34 = metadata !{metadata !"0x2f\00_Tp\000\000", null, metadata !31, null} ; [ DW_TAG_template_type_parameter ]
!35 = metadata !{metadata !36, metadata !37}
-!36 = metadata !{i32 786689, metadata !24, metadata !"p1", metadata !10, i32 16777228, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 12]
-!37 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!36 = metadata !{metadata !"0x101\00p1\0016777228\000", metadata !24, metadata !10, metadata !29} ; [ DW_TAG_arg_variable ] [p1] [line 12]
+!37 = metadata !{metadata !"0x101\00p2\0033554444\000", metadata !24, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ] [p2] [line 12]
!38 = metadata !{i32 33, i32 0, metadata !9, null}
!39 = metadata !{i32 12, i32 0, metadata !24, metadata !38}
!40 = metadata !{i32 9, i32 0, metadata !41, metadata !59}
-!41 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long, __1::A>", metadata !"min<unsigned long long, __1::A>", metadata !"_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_", i32 7, metadata !42, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !53, null, metadata !55, i32 8} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>]
-!42 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !43, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!41 = metadata !{metadata !"0x2e\00min<unsigned long long, __1::A>\00min<unsigned long long, __1::A>\00_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_\007\000\001\000\006\00256\001\008", metadata !1, metadata !25, metadata !42, null, null, metadata !53, null, metadata !55} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>]
+!42 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !43, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!43 = metadata !{metadata !29, metadata !29, metadata !32, metadata !44}
-!44 = metadata !{i32 786451, metadata !1, metadata !25, metadata !"A", i32 0, i64 8, i64 8, i32 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ]
+!44 = metadata !{metadata !"0x13\00A\000\008\008\000\000\000", metadata !1, metadata !25, null, metadata !45, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ]
!45 = metadata !{metadata !46}
-!46 = metadata !{i32 786478, metadata !1, metadata !44, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !52, i32 1} ; [ DW_TAG_subprogram ] [line 1] [operator()]
-!47 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !48, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!46 = metadata !{metadata !"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\000\000\006\00256\001\001", metadata !1, metadata !44, metadata !47, null, null, null, i32 0, metadata !52} ; [ DW_TAG_subprogram ] [line 1] [operator()]
+!47 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !48, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!48 = metadata !{metadata !13, metadata !49, metadata !50, metadata !50}
-!49 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
-!50 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!51 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
+!49 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!50 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!51 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
!52 = metadata !{i32 786468}
!53 = metadata !{metadata !34, metadata !54}
-!54 = metadata !{i32 786479, null, metadata !"_Compare", metadata !44, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!54 = metadata !{metadata !"0x2f\00_Compare\000\000", null, metadata !44, null} ; [ DW_TAG_template_type_parameter ]
!55 = metadata !{metadata !56, metadata !57, metadata !58}
-!56 = metadata !{i32 786689, metadata !41, metadata !"p1", metadata !10, i32 16777223, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 7]
-!57 = metadata !{i32 786689, metadata !41, metadata !"p2", metadata !10, i32 33554439, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 7]
-!58 = metadata !{i32 786689, metadata !41, metadata !"p3", metadata !10, i32 50331656, metadata !44, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p3] [line 8]
+!56 = metadata !{metadata !"0x101\00p1\0016777223\000", metadata !41, metadata !10, metadata !29} ; [ DW_TAG_arg_variable ] [p1] [line 7]
+!57 = metadata !{metadata !"0x101\00p2\0033554439\000", metadata !41, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ] [p2] [line 7]
+!58 = metadata !{metadata !"0x101\00p3\0050331656\000", metadata !41, metadata !10, metadata !44} ; [ DW_TAG_arg_variable ] [p3] [line 8]
!59 = metadata !{i32 13, i32 0, metadata !24, metadata !38}
!63 = metadata !{i32 undef}
-!64 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1]
-!65 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !46, metadata !66, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()]
+!64 = metadata !{metadata !"0x101\00p1\0033554433\000", metadata !65, metadata !10, metadata !50, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!65 = metadata !{metadata !"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\001\000\006\00256\001\002", metadata !1, metadata !25, metadata !47, null, null, null, metadata !46, metadata !66} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()]
!66 = metadata !{metadata !67, metadata !69, metadata !70}
-!67 = metadata !{i32 786689, metadata !65, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!68 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!69 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 1]
-!70 = metadata !{i32 786689, metadata !65, metadata !"", metadata !10, i32 50331650, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 2]
+!67 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !65, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!68 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!69 = metadata !{metadata !"0x101\00p1\0033554433\000", metadata !65, metadata !10, metadata !50} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!70 = metadata !{metadata !"0x101\00\0050331650\000", metadata !65, metadata !10, metadata !50} ; [ DW_TAG_arg_variable ] [line 2]
!71 = metadata !{i32 1, i32 0, metadata !65, metadata !40}
-!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
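+; (Aside, as a reading of the diff above: the debug-info metadata moves
+; from positional i32 fields to the string-encoded "0x<tag>\00..." form,
+; and llvm.dbg.value gains a fourth operand for the variable's expression,
+; hence the "Debug Info Version" bump from 1 to 2.)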
diff --git a/test/CodeGen/X86/stack_guard_remat.ll b/test/CodeGen/X86/stack_guard_remat.ll
new file mode 100644
index 0000000..dd639a7
--- /dev/null
+++ b/test/CodeGen/X86/stack_guard_remat.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s -check-prefix=CHECK
+
+;CHECK: foo2
+;CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), [[R0:%[a-z0-9]+]]
+;CHECK: movq ([[R0]]), {{%[a-z0-9]+}}
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @test_stack_guard_remat() #0 {
+entry:
+ %a1 = alloca [256 x i32], align 16
+ %0 = bitcast [256 x i32]* %a1 to i8*
+ call void @llvm.lifetime.start(i64 1024, i8* %0)
+ %arraydecay = getelementptr inbounds [256 x i32]* %a1, i64 0, i64 0
+ call void @foo3(i32* %arraydecay)
+ call void asm sideeffect "foo2", "~{r12},~{r13},~{r14},~{r15},~{ebx},~{esi},~{edi},~{dirflag},~{fpsr},~{flags}"()
+ call void @llvm.lifetime.end(i64 1024, i8* %0)
+ ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
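+; What the test exercises (a reading of the CHECK lines at the top): the
+; inline asm clobbers a broad set of GPRs, so the stack-guard value cannot
+; stay in a register across it. The CHECKs expect the guard to be
+; rematerialized, i.e. reloaded through ___stack_chk_guard@GOTPCREL,
+; rather than spilled to and reloaded from the local frame, which is the
+; memory the protector is trying to defend.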
diff --git a/test/CodeGen/X86/stackmap-fast-isel.ll b/test/CodeGen/X86/stackmap-fast-isel.ll
index 0b7e6db..dfb16ad 100644
--- a/test/CodeGen/X86/stackmap-fast-isel.ll
+++ b/test/CodeGen/X86/stackmap-fast-isel.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim -fast-isel -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort | FileCheck %s
; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
; CHECK-NEXT: __LLVM_StackMaps:
diff --git a/test/CodeGen/X86/stackmap-large-constants.ll b/test/CodeGen/X86/stackmap-large-constants.ll
new file mode 100644
index 0000000..73ee4f3
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-large-constants.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; version
+; CHECK-NEXT: .byte 1
+; reserved
+; CHECK-NEXT: .byte 0
+; reserved
+; CHECK-NEXT: .short 0
+; # functions
+; CHECK-NEXT: .long 2
+; # constants
+; CHECK-NEXT: .long 2
+; # records
+; CHECK-NEXT: .long 2
+; function address & stack size
+; CHECK-NEXT: .quad _foo
+; CHECK-NEXT: .quad 8
+; function address & stack size
+; CHECK-NEXT: .quad _bar
+; CHECK-NEXT: .quad 8
+
+; Constants Array:
+; CHECK-NEXT: .quad 9223372036854775807
+; CHECK-NEXT: .quad -9223372036854775808
+
+; Patchpoint ID
+; CHECK-NEXT: .quad 0
+; Instruction offset
+; CHECK-NEXT: .long L{{.*}}-_foo
+; reserved
+; CHECK-NEXT: .short 0
+; # locations
+; CHECK-NEXT: .short 1
+; ConstantIndex
+; CHECK-NEXT: .byte 5
+; reserved
+; CHECK-NEXT: .byte 8
+; Dwarf RegNum
+; CHECK-NEXT: .short 0
+; Offset
+; CHECK-NEXT: .long 0
+; padding
+; CHECK-NEXT: .short 0
+; NumLiveOuts
+; CHECK-NEXT: .short 0
+
+; CHECK-NEXT: .align 3
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+
+define void @foo() {
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 9223372036854775807)
+ ret void
+}
+
+; Patchpoint ID
+; CHECK-NEXT: .quad 0
+; Instruction Offset
+; CHECK-NEXT: .long L{{.*}}-_bar
+; reserved
+; CHECK-NEXT: .short 0
+; # locations
+; CHECK-NEXT: .short 1
+; ConstantIndex
+; CHECK-NEXT: .byte 5
+; reserved
+; CHECK-NEXT: .byte 8
+; Dwarf RegNum
+; CHECK-NEXT: .short 0
+; Offset
+; CHECK-NEXT: .long 1
+; padding
+; CHECK-NEXT: .short 0
+; NumLiveOuts
+; CHECK-NEXT: .short 0
+
+
+define void @bar() {
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 -9223372036854775808)
+ ret void
+}
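+; The two records above show the purpose of the constants array (a summary
+; of the CHECK lines): an i64 operand that does not fit in a location's
+; 32-bit payload is emitted into the Constants Array, and the record uses
+; location kind 5 (ConstantIndex) with the Offset field as an array index:
+; 0 selects INT64_MAX for @foo and 1 selects INT64_MIN for @bar.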
diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
index 897595d..31553c0 100644
--- a/test/CodeGen/X86/stackmap-liveness.ll
+++ b/test/CodeGen/X86/stackmap-liveness.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-patchpoint-liveness=false | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim | FileCheck -check-prefix=PATCH %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -enable-patchpoint-liveness=false | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck -check-prefix=PATCH %s
;
; Note: Print verbose stackmaps using -debug-only=stackmaps.
diff --git a/test/CodeGen/X86/stackmap-nops.ll b/test/CodeGen/X86/stackmap-nops.ll
index 5a78f24..7932c0d 100644
--- a/test/CodeGen/X86/stackmap-nops.ll
+++ b/test/CodeGen/X86/stackmap-nops.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
define void @nop_test() {
entry:
@@ -224,6 +224,10 @@ entry:
tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 28, i32 28)
tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 29, i32 29)
tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 30, i32 30)
+; Add an extra stackmap with a zero-length shadow to thwart the shadow
+; optimization. This will force all 15 bytes of the previous shadow to be
+; padded with nops.
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 31, i32 0)
ret void
}
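; Mechanically (restating the comment above): a stackmap shadow may be
; covered by whatever real instructions follow the call, so the emitter
; normally delays nop padding in the hope that later code fills it. A
; trailing stackmap with a zero-byte shadow closes that window immediately,
; forcing the remaining shadow bytes to be emitted as explicit nops.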
diff --git a/test/CodeGen/X86/stackmap-shadow-optimization.ll b/test/CodeGen/X86/stackmap-shadow-optimization.ll
new file mode 100644
index 0000000..a3725f2
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-shadow-optimization.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
+
+; Check that the X86 stackmap shadow optimization outputs only a 3-byte
+; nop here. 8 bytes are requested, but 5 are covered by the code for the
+; call to bar. However, the frame teardown and the return do not count
+; towards the stackmap shadow, as the call return counts as a branch
+; target and so must flush the shadow.
+; Note that in order for a thread not to return into the patched space,
+; the call must be at the end of the shadow, so the required nop must be
+; before the call, not after.
+define void @shadow_optimization_test() {
+entry:
+; CHECK-LABEL: shadow_optimization_test:
+; CHECK: callq _bar
+; CHECK: nop
+; CHECK: callq _bar
+; CHECK-NOT: nop
+; CHECK: callq _bar
+; CHECK-NOT: nop
+ call void @bar()
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 8)
+ call void @bar()
+ call void @bar()
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @bar()
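+; The arithmetic, spelled out: a callq with a rel32 target encodes in
+; 5 bytes (E8 + imm32), so of the 8 shadow bytes requested only 8 - 5 = 3
+; remain, and x86's multi-byte nop encodings let the backend emit them as
+; the single 3-byte nop the CHECK lines look for.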
diff --git a/test/CodeGen/X86/stackmap.ll b/test/CodeGen/X86/stackmap.ll
index 8567037..5e356f3 100644
--- a/test/CodeGen/X86/stackmap.ll
+++ b/test/CodeGen/X86/stackmap.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
;
; Note: Print verbose stackmaps using -debug-only=stackmaps.
@@ -9,11 +9,11 @@
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 0
; Num Functions
-; CHECK-NEXT: .long 15
+; CHECK-NEXT: .long 16
; Num LargeConstants
; CHECK-NEXT: .long 3
; Num Callsites
-; CHECK-NEXT: .long 19
+; CHECK-NEXT: .long 20
; Functions and stack size
; CHECK-NEXT: .quad _constantargs
@@ -46,6 +46,8 @@
; CHECK-NEXT: .quad 8
; CHECK-NEXT: .quad _clobberScratch
; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _needsStackRealignment
+; CHECK-NEXT: .quad -1
; Large Constants
; CHECK-NEXT: .quad 2147483648
@@ -464,6 +466,23 @@ define void @clobberScratch(i32 %a) {
ret void
}
+; A stack frame which needs to be realigned at runtime (to meet alignment
+; criteria for values on the stack) does not have a fixed frame size.
+; CHECK-LABEL: .long L{{.*}}-_needsStackRealignment
+; CHECK-NEXT: .short 0
+; 0 locations
+; CHECK-NEXT: .short 0
+define void @needsStackRealignment() {
+ %val = alloca i64, i32 3, align 128
+ tail call void (...)* @escape_values(i64* %val)
+; Note: Adding any non-constant to the stackmap would fail because we
+; expect to be able to address off the frame pointer. In a realigned
+; frame, we must use the stack pointer instead. This is a separate bug.
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0)
+ ret void
+}
+declare void @escape_values(...)
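+; Why the recorded stack size is -1 (a reading of the test): the alloca's
+; align 128 exceeds the ABI stack alignment, so the prologue must realign
+; the stack pointer at runtime and the frame size is no longer a
+; compile-time constant; the stackmap emits -1 to mean "unknown".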
+
declare void @llvm.experimental.stackmap(i64, i32, ...)
declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
index 7557f25..e3cc2fa 100644
--- a/test/CodeGen/X86/store-narrow.ll
+++ b/test/CodeGen/X86/store-narrow.ll
@@ -34,8 +34,8 @@ entry:
; X64: movb %sil, 1(%rdi)
; X32-LABEL: test2:
-; X32: movb 8(%esp), %[[REG:[abcd]l]]
-; X32: movb %[[REG]], 1(%{{.*}})
+; X32: movb 8(%esp), %[[REG:[abcd]]]l
+; X32: movb %[[REG]]l, 1(%{{.*}})
}
define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@@ -67,8 +67,8 @@ entry:
; X64: movw %si, 2(%rdi)
; X32-LABEL: test4:
-; X32: movl 8(%esp), %e[[REG:[abcd]x]]
-; X32: movw %[[REG]], 2(%{{.*}})
+; X32: movw 8(%esp), %[[REG:[abcd]]]x
+; X32: movw %[[REG]]x, 2(%{{.*}})
}
define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@@ -84,8 +84,8 @@ entry:
; X64: movw %si, 2(%rdi)
; X32-LABEL: test5:
-; X32: movzwl 8(%esp), %e[[REG:[abcd]x]]
-; X32: movw %[[REG]], 2(%{{.*}})
+; X32: movw 8(%esp), %[[REG:[abcd]]]x
+; X32: movw %[[REG]]x, 2(%{{.*}})
}
define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {
diff --git a/test/CodeGen/X86/swizzle-2.ll b/test/CodeGen/X86/swizzle-2.ll
index 4b1f903..697af84 100644
--- a/test/CodeGen/X86/swizzle-2.ll
+++ b/test/CodeGen/X86/swizzle-2.ll
@@ -8,508 +8,433 @@
; illegal shuffle that is expanded into a sub-optimal sequence of instructions
; during lowering stage.
-
define <4 x i32> @swizzle_1(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_1:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
ret <4 x i32> %2
}
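; Worked example of the folding (illustrative): composing the mask
; <3, 2, 0, 1> with itself takes element i from v[m[m[i]]], which is
; [1, 0, 3, 2]; pshufd packs those four 2-bit indices low-to-high as
; 0b10_11_00_01 = 0xB1, the $-79 the old check matched below.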
-; CHECK-LABEL: swizzle_1
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_2(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,0]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_2
-; Mask: [2,1,3,0]
-; CHECK: pshufd $54
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_3(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_3:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_3
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_4(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_4:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,2]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_4
-; Mask: [3,1,0,2]
-; CHECK: pshufd $-121
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_5(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_5:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_5
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_6(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_6:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_6
-; Mask: [2,0,1,3]
-; CHECK: pshufd $-46
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_7(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_7:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_7
-; Mask: [0,2,3,1]
-; CHECK: pshufd $120
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_8(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_8:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,0]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_8
-; Mask: [1,3,2,0]
-; CHECK: pshufd $45
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_9(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_9:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_9
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_10(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_10:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,0,3]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_10
-; Mask: [1,2,0,3]
-; CHECK: pshufd $-55
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_11(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_11:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_11
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_12(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_12:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_12
-; Mask: [0,3,1,2]
-; CHECK: pshufd $-100
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_13(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_13:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_13
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x i32> @swizzle_14(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_14:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,2,1]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
ret <4 x i32> %2
}
-; CHECK-LABEL: swizzle_14
-; Mask: [3,0,2,1]
-; CHECK: pshufd $99
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_15(<4 x float> %v) {
+; CHECK-LABEL: swizzle_15:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_15
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_16(<4 x float> %v) {
+; CHECK-LABEL: swizzle_16:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,0]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_16
-; Mask: [2,1,3,0]
-; CHECK: pshufd $54
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_17(<4 x float> %v) {
+; CHECK-LABEL: swizzle_17:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_17
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_18(<4 x float> %v) {
+; CHECK-LABEL: swizzle_18:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,0,2]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_18
-; Mask: [3,1,0,2]
-; CHECK: pshufd $-121
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_19(<4 x float> %v) {
+; CHECK-LABEL: swizzle_19:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_19
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_20(<4 x float> %v) {
+; CHECK-LABEL: swizzle_20:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_20
-; Mask: [2,0,1,3]
-; CHECK: pshufd $-46
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_21(<4 x float> %v) {
+; CHECK-LABEL: swizzle_21:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_21
-; Mask: [0,2,3,1]
-; CHECK: pshufd $120
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_22(<4 x float> %v) {
+; CHECK-LABEL: swizzle_22:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,0]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_22
-; Mask: [1,3,2,0]
-; CHECK: pshufd $45
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_23(<4 x float> %v) {
+; CHECK-LABEL: swizzle_23:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_23
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_24(<4 x float> %v) {
+; CHECK-LABEL: swizzle_24:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2,0,3]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_24
-; Mask: [1,2,0,3]
-; CHECK: pshufd $-55
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_25(<4 x float> %v) {
+; CHECK-LABEL: swizzle_25:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_25
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_26(<4 x float> %v) {
+; CHECK-LABEL: swizzle_26:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_26
-; Mask: [0,3,1,2]
-; CHECK: pshufd $-100
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_27(<4 x float> %v) {
+; CHECK-LABEL: swizzle_27:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_27
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_28(<4 x float> %v) {
+; CHECK-LABEL: swizzle_28:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,2,1]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_28
-; Mask: [3,0,2,1]
-; CHECK: pshufd $99
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
define <4 x float> @swizzle_29(<4 x float> %v) {
+; CHECK-LABEL: swizzle_29:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,0]
+; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
%2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
ret <4 x float> %2
}
-; CHECK-LABEL: swizzle_29
-; Mask: [1,3,2,0]
-; CHECK: pshufd $45
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
; Make sure that we combine the shuffles from each function below into a single
; legal shuffle (either pshuflw or pshufb depending on the masks).
define <8 x i16> @swizzle_30(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_30:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_30
-; Mask: [1,3,2,0,5,7,6,4]
-; CHECK: pshuflw $45
-; CHECK-NOT: pshufb
-; CHECK-NEXT: ret
-
define <8 x i16> @swizzle_31(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_31:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_31
-; Mask: [1,3,2,0,4,5,6,7]
-; CHECK: pshuflw $45
-; CHECK-NOT: pshufb
-; CHECK: ret
-
define <8 x i16> @swizzle_32(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_32:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_32
-; Mask: [2,3,0,1,4,5,6,7] --> equivalent to pshufd mask [1,0,2,3]
-; CHECK: pshufd $-31
-; CHECK-NOT: pshufb
-; CHECK: ret
define <8 x i16> @swizzle_33(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_33:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
+; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_33
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
define <8 x i16> @swizzle_34(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_34:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,0,2,4,5,6,7]
+; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_34
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
define <8 x i16> @swizzle_35(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_35:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
+; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_35
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK: ret
-
define <8 x i16> @swizzle_36(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_36:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_36
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
define <8 x i16> @swizzle_37(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_37:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 5, i32 6, i32 4>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 4, i32 6, i32 5>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_37
-; Mask: [0,1,2,3,4,7,6,5]
-; CHECK: pshufhw $108
-; CHECK-NOT: pshufb
-; CHECK: ret
-
define <8 x i16> @swizzle_38(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_38:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
+; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_38
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
define <8 x i16> @swizzle_39(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_39:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,1,0,4,5,6,7]
+; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_39
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
define <8 x i16> @swizzle_40(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_40:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_40
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
define <8 x i16> @swizzle_41(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_41:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_41
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
define <8 x i16> @swizzle_42(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_42:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5>
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5>
ret <8 x i16> %2
}
-; CHECK-LABEL: swizzle_42
-; Mask: [0,1,2,3,5,4,7,6]
-; CHECK: pshufhw $-79
-; CHECK-NOT: pshufb
-; CHECK: ret
-
-
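The regenerated checks above all follow from one fact: applying the same four-lane
mask m twice is the same as applying the composed mask m∘m once, so the two
shufflevectors fold into a single pshufd/shufps. A worked sketch for the swizzle_1
mask, restated as a standalone function:

  define <4 x i32> @compose(<4 x i32> %v) {
    ; m = [3,2,0,1]; m[m[i]] for i = 0..3 gives [1,0,3,2],
    ; which is exactly the pshufd mask checked in swizzle_1.
    %once  = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
    %twice = shufflevector <4 x i32> %once, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
    ret <4 x i32> %twice
  }

For the <8 x i16> cases the same per-lane composition applies; the result is legal
as a single pshuflw/pshufhw when the composed mask only permutes within the low or
high half, and otherwise needs the pshuflw+pshufhw pair seen in swizzle_33 onward.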
diff --git a/test/CodeGen/X86/swizzle.ll b/test/CodeGen/X86/swizzle.ll
deleted file mode 100644
index 23e0c24..0000000
--- a/test/CodeGen/X86/swizzle.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movlps
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movsd
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep movups
-; rdar://6523650
-
- %struct.vector4_t = type { <4 x float> }
-
-define void @swizzle(i8* nocapture %a, %struct.vector4_t* nocapture %b, %struct.vector4_t* nocapture %c) nounwind {
-entry:
- %0 = getelementptr %struct.vector4_t* %b, i32 0, i32 0 ; <<4 x float>*> [#uses=2]
- %1 = load <4 x float>* %0, align 4 ; <<4 x float>> [#uses=1]
- %tmp.i = bitcast i8* %a to double* ; <double*> [#uses=1]
- %tmp1.i = load double* %tmp.i ; <double> [#uses=1]
- %2 = insertelement <2 x double> undef, double %tmp1.i, i32 0 ; <<2 x double>> [#uses=1]
- %tmp2.i = bitcast <2 x double> %2 to <4 x float> ; <<4 x float>> [#uses=1]
- %3 = shufflevector <4 x float> %1, <4 x float> %tmp2.i, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1]
- store <4 x float> %3, <4 x float>* %0, align 4
- ret void
-}
diff --git a/test/CodeGen/X86/tailcall-multiret.ll b/test/CodeGen/X86/tailcall-multiret.ll
new file mode 100644
index 0000000..a77a59c
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-multiret.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core2 | FileCheck %s
+; See PR19530
+declare double @llvm.powi.f64(double %Val, i32 %power)
+define <3 x double> @julia_foo17589(i32 %arg) {
+ %tmp1 = call double @llvm.powi.f64(double 1.000000e+00, i32 %arg)
+; CHECK: callq __powidf2
+ %tmp2 = insertelement <3 x double> undef, double %tmp1, i32 0
+ %tmp3 = call double @llvm.powi.f64(double 2.000000e+00, i32 %arg)
+; CHECK: callq __powidf2
+ %tmp4 = insertelement <3 x double> %tmp2, double %tmp3, i32 1
+ %tmp5 = call double @llvm.powi.f64(double 3.000000e+00, i32 %arg)
+; CHECK: callq __powidf2
+ %tmp6 = insertelement <3 x double> %tmp4, double %tmp5, i32 2
+; CHECK-NOT: TAILCALL
+ ret <3 x double> %tmp6
+}
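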
diff --git a/test/CodeGen/X86/tls-addr-non-leaf-function.ll b/test/CodeGen/X86/tls-addr-non-leaf-function.ll
new file mode 100644
index 0000000..ec47232
--- /dev/null
+++ b/test/CodeGen/X86/tls-addr-non-leaf-function.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -relocation-model=pic -O2 -disable-fp-elim -o - | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -O2 -o - | FileCheck %s
+
+; This test runs twice with different options regarding the frame pointer:
+; first the elimination is disabled, then it is enabled. The disabled case is
+; the "control group".
+; The function 'foo' below is marked with the "no-frame-pointer-elim-non-leaf"
+; attribute, which dictates that the frame pointer should not be eliminated
+; unless the function is a leaf (i.e. it doesn't call any other function).
+; Now, 'foo' is not a leaf function, because it performs a TLS access which on
+; X86 ELF in PIC mode is expanded as a library call.
+; This call is represented with a pseudo-instruction which doesn't appear to be
+; a call when inspected by the analysis passes (it doesn't have the "isCall"
+; flag), and the ISel lowering code creating the pseudo was not informing the
+; MachineFrameInfo that the function contained calls. This affected the decision
+; whether to eliminate the frame pointer.
+; With the fix, the "hasCalls" flag is set in the MFI for the function whenever
+; a TLS access pseudo-instruction is created, so 'foo' appears to be a non-leaf
+; function, and the difference in the options does not affect codegen: both
+; versions will have a frame pointer.
+
+; Test that there's some frame pointer usage in 'foo'...
+; CHECK: foo:
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; ... and the TLS library call is also present.
+; CHECK: leaq x@TLSGD(%rip), %rdi
+; CHECK: callq __tls_get_addr@PLT
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = thread_local global i32 0
+define i32 @foo() "no-frame-pointer-elim-non-leaf" {
+ %a = load i32* @x, align 4
+ ret i32 %a
+}
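A hypothetical counterpart makes the attribute's semantics concrete: a true leaf
function carrying the same attribute remains eligible for frame-pointer
elimination, since the attribute only forces a frame pointer for non-leaf
functions (a sketch, not part of the test above):

  define i32 @leaf_foo(i32 %v) "no-frame-pointer-elim-non-leaf" {
    %r = add i32 %v, 1
    ret i32 %r
  }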
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index d230f1f..8de6297 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -20,7 +20,7 @@ define void @load_2_i8(<2 x i8>* %A) {
; Read 32-bits
;CHECK: pmovzxwq
;CHECK: paddq
-;CHECK: pshufb
+;CHECK: pshufd
;CHECK: movd
;CHECK: ret
define void @load_2_i16(<2 x i16>* %A) {
@@ -32,7 +32,7 @@ define void @load_2_i16(<2 x i16>* %A) {
;CHECK-LABEL: load_2_i32:
;CHECK: pmovzxdq
-;CHECK: paddq
+;CHECK: paddd
;CHECK: pshufd
;CHECK: ret
define void @load_2_i32(<2 x i32>* %A) {
@@ -56,7 +56,7 @@ define void @load_4_i8(<4 x i8>* %A) {
;CHECK-LABEL: load_4_i16:
;CHECK: pmovzxwd
-;CHECK: paddd
+;CHECK: paddw
;CHECK: pshufb
;CHECK: ret
define void @load_4_i16(<4 x i16>* %A) {
@@ -68,7 +68,7 @@ define void @load_4_i16(<4 x i16>* %A) {
;CHECK-LABEL: load_8_i8:
;CHECK: pmovzxbw
-;CHECK: paddw
+;CHECK: paddb
;CHECK: pshufb
;CHECK: ret
define void @load_8_i8(<8 x i8>* %A) {
diff --git a/test/CodeGen/X86/uint_to_fp-2.ll b/test/CodeGen/X86/uint_to_fp-2.ll
index c5a61c3..e47f154 100644
--- a/test/CodeGen/X86/uint_to_fp-2.ll
+++ b/test/CodeGen/X86/uint_to_fp-2.ll
@@ -1,15 +1,20 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mattr=+sse2 | FileCheck %s
; rdar://6504833
define float @test1(i32 %x) nounwind readnone {
-; CHECK: test1
-; CHECK: movd
-; CHECK: orps
-; CHECK: subsd
-; CHECK: cvtsd2ss
-; CHECK: movss
-; CHECK: flds
-; CHECK: ret
+; CHECK-LABEL: test1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushl %eax
+; CHECK-NEXT: movsd .LCPI0_0, %xmm0
+; CHECK-NEXT: movd {{[0-9]+}}(%esp), %xmm1
+; CHECK-NEXT: orps %xmm0, %xmm1
+; CHECK-NEXT: subsd %xmm0, %xmm1
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtsd2ss %xmm1, %xmm0
+; CHECK-NEXT: movss %xmm0, (%esp)
+; CHECK-NEXT: flds (%esp)
+; CHECK-NEXT: popl %eax
+; CHECK-NEXT: retl
entry:
%0 = uitofp i32 %x to float
ret float %0
@@ -17,15 +22,20 @@ entry:
; PR10802
define float @test2(<4 x i32> %x) nounwind readnone ssp {
-; CHECK: test2
-; CHECK: xorps [[ZERO:%xmm[0-9]+]]
-; CHECK: movss {{.*}}, [[ZERO]]
-; CHECK: orps
-; CHECK: subsd
-; CHECK: cvtsd2ss
-; CHECK: movss
-; CHECK: flds
-; CHECK: ret
+; CHECK-LABEL: test2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushl %eax
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: movss %xmm0, %xmm1
+; CHECK-NEXT: movsd .LCPI1_0, %xmm0
+; CHECK-NEXT: orps %xmm0, %xmm1
+; CHECK-NEXT: subsd %xmm0, %xmm1
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtsd2ss %xmm1, %xmm0
+; CHECK-NEXT: movss %xmm0, (%esp)
+; CHECK-NEXT: flds (%esp)
+; CHECK-NEXT: popl %eax
+; CHECK-NEXT: retl
entry:
%vecext = extractelement <4 x i32> %x, i32 0
%conv = uitofp i32 %vecext to float
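The expanded checks for test1 spell out the classic unsigned-to-double conversion
trick rather than matching isolated opcodes. A sketch of what the movd/orps/subsd
sequence computes, assuming (the constant pool is not shown in the diff) that
.LCPI0_0 is the double with bit pattern 0x4330000000000000, i.e. 2^52:

  define float @uitofp_by_hand(i32 %x) {
    ; OR the 32-bit value into the low mantissa bits of 2^52; the result is
    ; the double 2^52 + x, exactly, so subtracting 2^52 recovers (double)x.
    %wide = zext i32 %x to i64
    %bits = or i64 %wide, 4841369599423283200   ; 0x4330000000000000
    %d    = bitcast i64 %bits to double
    %dx   = fsub double %d, 0x4330000000000000  ; subtract 2^52
    %f    = fptrunc double %dx to float
    ret float %f
  }

The surrounding pushl/movss/flds/popl in the i386 checks are only the calling
convention at work: the float result is spilled to the stack and returned on the
x87 stack.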
diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll
index d7ae469..ca9ea4a 100644
--- a/test/CodeGen/X86/unknown-location.ll
+++ b/test/CodeGen/X86/unknown-location.ll
@@ -21,16 +21,16 @@ entry:
!llvm.dbg.cu = !{!3}
!llvm.module.flags = !{!12}
-!0 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !2, i32 1, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !10, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 1, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, i32, i32, i32)* @foo, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !10, i32 12, metadata !"producer", i1 false, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00x\001\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\001\000\001\000\006\000\000\001", metadata !10, metadata !2, metadata !4, null, i32 (i32, i32, i32, i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\0012\00producer\000\00\000\00\000", metadata !10, metadata !11, metadata !11, metadata !9, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!5 = metadata !{metadata !6}
-!6 = metadata !{i32 786468, metadata !10, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786443, metadata !2, metadata !1, i32 1, i32 30, i32 0} ; [ DW_TAG_lexical_block ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !10, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0xb\001\0030\000", metadata !2, metadata !1} ; [ DW_TAG_lexical_block ]
!8 = metadata !{i32 4, i32 3, metadata !7, null}
!9 = metadata !{metadata !1}
!10 = metadata !{metadata !"test.c", metadata !"/dir"}
!11 = metadata !{i32 0}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/v-binop-widen.ll b/test/CodeGen/X86/v-binop-widen.ll
deleted file mode 100644
index fca4da6..0000000
--- a/test/CodeGen/X86/v-binop-widen.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc -mcpu=generic -march=x86 -mattr=+sse < %s | FileCheck %s
-; CHECK: divps
-; CHECK: divps
-; CHECK: divss
-
-%vec = type <9 x float>
-define %vec @vecdiv( %vec %p1, %vec %p2)
-{
- %result = fdiv %vec %p1, %p2
- ret %vec %result
-}
diff --git a/test/CodeGen/X86/v-binop-widen2.ll b/test/CodeGen/X86/v-binop-widen2.ll
deleted file mode 100644
index 3342111..0000000
--- a/test/CodeGen/X86/v-binop-widen2.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc -march=x86 -mcpu=generic -mattr=+sse < %s | FileCheck %s
-; RUN: llc -march=x86 -mcpu=atom -mattr=+sse < %s | FileCheck -check-prefix=ATOM %s
-
-%vec = type <6 x float>
-; CHECK: divps
-; CHECK: divss
-; CHECK: divss
-
-; Scheduler causes a different instruction order to be produced on Intel Atom
-; ATOM: divps
-; ATOM: divss
-; ATOM: divss
-
-define %vec @vecdiv( %vec %p1, %vec %p2)
-{
- %result = fdiv %vec %p1, %p2
- ret %vec %result
-}
-
-@a = constant %vec < float 2.0, float 4.0, float 8.0, float 16.0, float 32.0, float 64.0 >
-@b = constant %vec < float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0 >
-
-; Expected result: < 1.0, 2.0, 4.0, ..., 2.0^(n-1) >
-; main() returns 0 if the result is expected and 1 otherwise
-; to execute, use llvm-as < %s | lli
-define i32 @main() nounwind {
-entry:
- %avec = load %vec* @a
- %bvec = load %vec* @b
-
- %res = call %vec @vecdiv(%vec %avec, %vec %bvec)
- br label %loop
-loop:
- %idx = phi i32 [0, %entry], [%nextInd, %looptail]
- %expected = phi float [1.0, %entry], [%nextExpected, %looptail]
- %elem = extractelement %vec %res, i32 %idx
- %expcmp = fcmp oeq float %elem, %expected
- br i1 %expcmp, label %looptail, label %return
-looptail:
- %nextExpected = fmul float %expected, 2.0
- %nextInd = add i32 %idx, 1
- %cmp = icmp slt i32 %nextInd, 6
- br i1 %cmp, label %loop, label %return
-return:
- %retval = phi i32 [0, %looptail], [1, %loop]
- ret i32 %retval
-}
diff --git a/test/CodeGen/X86/v2f32.ll b/test/CodeGen/X86/v2f32.ll
index dab5e7b..b9bd80f9 100644
--- a/test/CodeGen/X86/v2f32.ll
+++ b/test/CodeGen/X86/v2f32.ll
@@ -1,115 +1,94 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -asm-verbose=0 -o - | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -asm-verbose=0 -o - | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -mcpu=yonah -march=x86 -mtriple=i386-linux-gnu -asm-verbose=0 -o - | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -o - | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mcpu=yonah -march=x86 -mtriple=i386-linux-gnu -o - | FileCheck %s --check-prefix=X32
; PR7518
define void @test1(<2 x float> %Q, float *%P2) nounwind {
+; X64-LABEL: test1:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, %xmm1
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT: addss %xmm0, %xmm1
+; X64-NEXT: movss %xmm1, (%rdi)
+; X64-NEXT: retq
+;
+; X32-LABEL: test1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps %xmm0, %xmm1
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X32-NEXT: addss %xmm0, %xmm1
+; X32-NEXT: movss %xmm1, (%eax)
+; X32-NEXT: retl
%a = extractelement <2 x float> %Q, i32 0
%b = extractelement <2 x float> %Q, i32 1
%c = fadd float %a, %b
-
store float %c, float* %P2
ret void
-; X64-LABEL: test1:
-; X64-NEXT: pshufd $1, %xmm0, %xmm1
-; X64-NEXT: addss %xmm0, %xmm1
-; X64-NEXT: movss %xmm1, (%rdi)
-; X64-NEXT: ret
-
-; W64-LABEL: test1:
-; W64-NEXT: movdqa (%rcx), %xmm0
-; W64-NEXT: pshufd $1, %xmm0, %xmm1
-; W64-NEXT: addss %xmm0, %xmm1
-; W64-NEXT: movss %xmm1, (%rdx)
-; W64-NEXT: ret
-
-; X32-LABEL: test1:
-; X32-NEXT: movl 4(%esp), %eax
-; X32-NEXT: pshufd $1, %xmm0, %xmm1
-; X32-NEXT: addss %xmm0, %xmm1
-; X32-NEXT: movss %xmm1, (%eax)
-; X32-NEXT: ret
}
-
define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, <2 x float> *%P) nounwind {
- %Z = fadd <2 x float> %Q, %R
- ret <2 x float> %Z
-
; X64-LABEL: test2:
-; X64-NEXT: addps %xmm1, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test2:
-; W64-NEXT: movaps (%rcx), %xmm0
-; W64-NEXT: addps (%rdx), %xmm0
-; W64-NEXT: ret
-
+; X64: # BB#0:
+; X64-NEXT: addps %xmm1, %xmm0
+; X64-NEXT: retq
+;
; X32-LABEL: test2:
-; X32: addps %xmm1, %xmm0
+; X32: # BB#0:
+; X32-NEXT: addps %xmm1, %xmm0
+; X32-NEXT: retl
+ %Z = fadd <2 x float> %Q, %R
+ ret <2 x float> %Z
}
-
define <2 x float> @test3(<4 x float> %A) nounwind {
+; X64-LABEL: test3:
+; X64: # BB#0:
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: retq
+;
+; X32-LABEL: test3:
+; X32: # BB#0:
+; X32-NEXT: addps %xmm0, %xmm0
+; X32-NEXT: retl
%B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%C = fadd <2 x float> %B, %B
ret <2 x float> %C
-; X64-LABEL: test3:
-; X64-NEXT: addps %xmm0, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test3:
-; W64-NEXT: movaps (%rcx), %xmm0
-; W64-NEXT: addps %xmm0, %xmm0
-; W64-NEXT: ret
-
-; X32-LABEL: test3:
-; X32-NEXT: addps %xmm0, %xmm0
-; X32-NEXT: ret
}
define <2 x float> @test4(<2 x float> %A) nounwind {
- %C = fadd <2 x float> %A, %A
- ret <2 x float> %C
; X64-LABEL: test4:
-; X64-NEXT: addps %xmm0, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test4:
-; W64-NEXT: movaps (%rcx), %xmm0
-; W64-NEXT: addps %xmm0, %xmm0
-; W64-NEXT: ret
-
+; X64: # BB#0:
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: retq
+;
; X32-LABEL: test4:
-; X32-NEXT: addps %xmm0, %xmm0
-; X32-NEXT: ret
+; X32: # BB#0:
+; X32-NEXT: addps %xmm0, %xmm0
+; X32-NEXT: retl
+ %C = fadd <2 x float> %A, %A
+ ret <2 x float> %C
}
define <4 x float> @test5(<4 x float> %A) nounwind {
+; X64-LABEL: test5:
+; X64: # BB#0:
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: retq
+;
+; X32-LABEL: test5:
+; X32: # BB#0:
+; X32-NEXT: addps %xmm0, %xmm0
+; X32-NEXT: addps %xmm0, %xmm0
+; X32-NEXT: retl
%B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%C = fadd <2 x float> %B, %B
- br label %BB
-
+ br label %BB
+
BB:
- %D = fadd <2 x float> %C, %C
+ %D = fadd <2 x float> %C, %C
%E = shufflevector <2 x float> %D, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
ret <4 x float> %E
-
-; X64-LABEL: test5:
-; X64-NEXT: addps %xmm0, %xmm0
-; X64-NEXT: addps %xmm0, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test5:
-; W64-NEXT: movaps (%rcx), %xmm0
-; W64-NEXT: addps %xmm0, %xmm0
-; W64-NEXT: addps %xmm0, %xmm0
-; W64-NEXT: ret
-
-; X32-LABEL: test5:
-; X32-NEXT: addps %xmm0, %xmm0
-; X32-NEXT: addps %xmm0, %xmm0
-; X32-NEXT: ret
}
diff --git a/test/CodeGen/X86/vararg-callee-cleanup.ll b/test/CodeGen/X86/vararg-callee-cleanup.ll
new file mode 100644
index 0000000..2dcf319
--- /dev/null
+++ b/test/CodeGen/X86/vararg-callee-cleanup.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=i686-pc-windows < %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+
+declare x86_thiscallcc void @thiscall_thunk(i8* %this, ...)
+define i32 @call_varargs_thiscall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
+ call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2)
+ call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2)
+ %t1 = add i32 %b, %c
+ %r = add i32 %t1, %d
+ ret i32 %r
+}
+
+; CHECK: _call_varargs_thiscall_thunk:
+; CHECK: calll _thiscall_thunk
+; CHECK-NEXT: subl $8, %esp
+
+; We don't mangle the argument size into variadic callee cleanup functions.
+
+declare x86_stdcallcc void @stdcall_thunk(i8* %this, ...)
+define i32 @call_varargs_stdcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
+ call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2)
+ call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2)
+ %t1 = add i32 %b, %c
+ %r = add i32 %t1, %d
+ ret i32 %r
+}
+
+; CHECK: _call_varargs_stdcall_thunk:
+; CHECK: calll _stdcall_thunk{{$}}
+; CHECK-NEXT: subl $12, %esp
+
+declare x86_fastcallcc void @fastcall_thunk(i8* %this, ...)
+define i32 @call_varargs_fastcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
+ call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
+ call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
+ %t1 = add i32 %b, %c
+ %r = add i32 %t1, %d
+ ret i32 %r
+}
+
+; CHECK: _call_varargs_fastcall_thunk:
+; CHECK: calll @fastcall_thunk{{$}}
+; CHECK-NEXT: subl $4, %esp
+
+; If you actually return from such a thunk, it will only pop the non-variadic
+; portion of the arguments, which is different from what the caller passes.
+
+define x86_stdcallcc void @varargs_stdcall_return(i32, i32, ...) {
+ ret void
+}
+
+; CHECK: _varargs_stdcall_return:
+; CHECK: retl $8
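For contrast, a non-variadic callee-cleanup function does get the argument byte
count mangled into its symbol and pops its own arguments (a hypothetical sketch,
not part of the test):

  define x86_stdcallcc void @plain_stdcall(i32, i32) {
    ret void
  }
  ; expected symbol: _plain_stdcall@8, returning with retl $8

The variadic thunks above therefore keep unmangled names (_stdcall_thunk), and
the caller performs the cleanup, which is what the subl-after-calll checks verify.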
diff --git a/test/CodeGen/X86/vararg_no_start.ll b/test/CodeGen/X86/vararg_no_start.ll
new file mode 100644
index 0000000..ab5c6fc
--- /dev/null
+++ b/test/CodeGen/X86/vararg_no_start.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s
+
+define void @foo(i8*, ...) {
+ ret void
+}
+; CHECK-LABEL: {{^_?}}foo:
+; CHECK-NOT: movq
+; CHECK: retq
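The CHECK-NOT passes because a variadic function that never calls va_start has no
use for the register save area, so no argument registers are spilled. For contrast
(a sketch for the x86-64 SysV target): once va_start appears, the lowering must
store the unnamed-argument registers, and movq spills show up in the prologue:

  declare void @llvm.va_start(i8*)

  define void @bar(i8*, ...) {
    %ap = alloca [24 x i8]
    %p = getelementptr inbounds [24 x i8]* %ap, i32 0, i32 0
    call void @llvm.va_start(i8* %p)
    ret void
  }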
diff --git a/test/CodeGen/X86/vastart-defs-eflags.ll b/test/CodeGen/X86/vastart-defs-eflags.ll
index 6017753..d0c5150 100644
--- a/test/CodeGen/X86/vastart-defs-eflags.ll
+++ b/test/CodeGen/X86/vastart-defs-eflags.ll
@@ -14,6 +14,7 @@ entry:
br i1 %tobool, label %if.end, label %if.then
if.then: ; preds = %entry
+ call void @llvm.va_start(i8* null)
br label %if.end
if.end: ; preds = %entry, %if.then
@@ -21,3 +22,4 @@ if.end: ; preds = %entry, %if.then
ret i32 %hasflag
}
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index 1a6c05d..8600c48 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -1,75 +1,177 @@
; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
-;CHECK-LABEL: foo1_8:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo1_8:
-;CHECK-WIDE: vpmovzxbd %xmm0, %xmm1
-;CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1
-;CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1
-;CHECK-WIDE-NEXT: vpshufb {{.*}}, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-;CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
-;CHECK-WIDE-NEXT: ret
define <8 x float> @foo1_8(<8 x i8> %src) {
+; CHECK-LABEL: foo1_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0
+; CHECK-NEXT: vpslld $24, %xmm0, %xmm0
+; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0
+; CHECK-NEXT: vpslld $24, %xmm1, %xmm1
+; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+; CHECK-WIDE-LABEL: foo1_8:
+; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm1
+; CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-WIDE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-WIDE-NEXT: retl
%res = sitofp <8 x i8> %src to <8 x float>
ret <8 x float> %res
}
-;CHECK-LABEL: foo1_4:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo1_4:
-;CHECK-WIDE: vpmovzxbd %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-;CHECK-WIDE-NEXT: ret
define <4 x float> @foo1_4(<4 x i8> %src) {
+; CHECK-LABEL: foo1_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpslld $24, %xmm0, %xmm0
+; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0
+; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
+; CHECK-NEXT: retl
+;
+; CHECK-WIDE-LABEL: foo1_4:
+; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm0
+; CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
+; CHECK-WIDE-NEXT: retl
%res = sitofp <4 x i8> %src to <4 x float>
ret <4 x float> %res
}
-;CHECK-LABEL: foo2_8:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo2_8:
-;CHECK-WIDE: vcvtdq2ps %ymm{{.*}}, %ymm{{.*}}
-;CHECK-WIDE: ret
define <8 x float> @foo2_8(<8 x i8> %src) {
+; CHECK-LABEL: foo2_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: vandps LCPI2_0, %ymm0, %ymm0
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+; CHECK-WIDE-LABEL: foo2_8:
+; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-WIDE-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; CHECK-WIDE-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; CHECK-WIDE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; CHECK-WIDE-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; CHECK-WIDE-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; CHECK-WIDE-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-WIDE-NEXT: retl
%res = uitofp <8 x i8> %src to <8 x float>
ret <8 x float> %res
}
-;CHECK-LABEL: foo2_4:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo2_4:
-;CHECK-WIDE: vcvtdq2ps %xmm{{.*}}, %xmm{{.*}}
-;CHECK-WIDE: ret
define <4 x float> @foo2_4(<4 x i8> %src) {
+; CHECK-LABEL: foo2_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps LCPI3_0, %xmm0, %xmm0
+; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
+; CHECK-NEXT: retl
+;
+; CHECK-WIDE-LABEL: foo2_4:
+; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm0
+; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
+; CHECK-WIDE-NEXT: retl
%res = uitofp <4 x i8> %src to <4 x float>
ret <4 x float> %res
}
-;CHECK-LABEL: foo3_8:
-;CHECK: vcvttps2dq
-;CHECK: ret
define <8 x i8> @foo3_8(<8 x float> %src) {
+; CHECK-LABEL: foo3_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
+;
+; CHECK-WIDE-LABEL: foo3_8:
+; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT: shll $8, %eax
+; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx
+; CHECK-WIDE-NEXT: movzbl %cl, %ecx
+; CHECK-WIDE-NEXT: orl %eax, %ecx
+; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT: shll $8, %eax
+; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx
+; CHECK-WIDE-NEXT: movzbl %dl, %edx
+; CHECK-WIDE-NEXT: orl %eax, %edx
+; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
+; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT: shll $8, %eax
+; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
+; CHECK-WIDE-NEXT: movzbl %cl, %ecx
+; CHECK-WIDE-NEXT: orl %eax, %ecx
+; CHECK-WIDE-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT: shll $8, %eax
+; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
+; CHECK-WIDE-NEXT: movzbl %cl, %ecx
+; CHECK-WIDE-NEXT: orl %eax, %ecx
+; CHECK-WIDE-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; CHECK-WIDE-NEXT: vzeroupper
+; CHECK-WIDE-NEXT: retl
%res = fptosi <8 x float> %src to <8 x i8>
ret <8 x i8> %res
}
-;CHECK-LABEL: foo3_4:
-;CHECK: vcvttps2dq
-;CHECK: ret
+
define <4 x i8> @foo3_4(<4 x float> %src) {
+; CHECK-LABEL: foo3_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: retl
+;
+; CHECK-WIDE-LABEL: foo3_4:
+; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT: shll $8, %eax
+; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx
+; CHECK-WIDE-NEXT: movzbl %cl, %ecx
+; CHECK-WIDE-NEXT: orl %eax, %ecx
+; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT: shll $8, %eax
+; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx
+; CHECK-WIDE-NEXT: movzbl %dl, %edx
+; CHECK-WIDE-NEXT: orl %eax, %edx
+; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
+; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; CHECK-WIDE-NEXT: retl
%res = fptosi <4 x float> %src to <4 x i8>
ret <4 x i8> %res
}
diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll
deleted file mode 100644
index 4da7953..0000000
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc < %s -mtriple=i686-linux -mcpu=penryn | FileCheck %s
-
-declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
-
-declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
-entry:
-; CHECK: cfi_def_cfa_offset
-; CHECK-NOT: set
-; CHECK: pmovzxwq
-; CHECK: pshufb
- %shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1]
- %cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
- %sub322.i = sub <4 x i32> %shr.i, zeroinitializer ; <<4 x i32>> [#uses=1]
- %cmp323.x = icmp slt <4 x i32> zeroinitializer, %sub322.i ; <<4 x i1>> [#uses=1]
- %cmp323.i = sext <4 x i1> %cmp323.x to <4 x i32> ; <<4 x i32>> [#uses=1]
- %or.i = or <4 x i32> %cmp318.i, %cmp323.i ; <<4 x i32>> [#uses=1]
- %tmp10.i83.i = bitcast <4 x i32> %or.i to <4 x float> ; <<4 x float>> [#uses=1]
- %0 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> undef, <4 x float> undef, <4 x float> %tmp10.i83.i) nounwind ; <<4 x float>> [#uses=1]
- %conv.i.i15.i = bitcast <4 x float> %0 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %swz.i.i28.i = shufflevector <4 x i32> %conv.i.i15.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1> ; <<2 x i32>> [#uses=1]
- %tmp6.i29.i = bitcast <2 x i32> %swz.i.i28.i to <4 x i16> ; <<4 x i16>> [#uses=1]
- %swz.i30.i = shufflevector <4 x i16> %tmp6.i29.i, <4 x i16> undef, <2 x i32> <i32 0, i32 1> ; <<2 x i16>> [#uses=1]
- store <2 x i16> %swz.i30.i, <2 x i16>* undef
- unreachable
- ret void
-}
diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll
index 0aa72b1..318aca1 100644
--- a/test/CodeGen/X86/vec_ctbits.ll
+++ b/test/CodeGen/X86/vec_ctbits.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)
diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll
index 3cb519a..530911a 100644
--- a/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/test/CodeGen/X86/vec_extract-sse4.ll
@@ -1,10 +1,14 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 -o %t
-; RUN: not grep extractps %t
-; RUN: not grep pextrd %t
-; RUN: not grep pshufd %t
-; RUN: not grep movss %t
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 | FileCheck %s
define void @t1(float* %R, <4 x float>* %P1) nounwind {
+; CHECK-LABEL: t1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movss 12(%ecx), %xmm0
+; CHECK-NEXT: movss %xmm0, (%eax)
+; CHECK-NEXT: retl
+
%X = load <4 x float>* %P1
%tmp = extractelement <4 x float> %X, i32 3
store float %tmp, float* %R
@@ -12,12 +16,31 @@ define void @t1(float* %R, <4 x float>* %P1) nounwind {
}
define float @t2(<4 x float>* %P1) nounwind {
+; CHECK-LABEL: t2:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movapd (%eax), %xmm0
+; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: movss %xmm0, (%esp)
+; CHECK-NEXT: flds (%esp)
+; CHECK-NEXT: popl %eax
+; CHECK-NEXT: retl
+
%X = load <4 x float>* %P1
%tmp = extractelement <4 x float> %X, i32 2
ret float %tmp
}
define void @t3(i32* %R, <4 x i32>* %P1) nounwind {
+; CHECK-LABEL: t3:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl 12(%ecx), %ecx
+; CHECK-NEXT: movl %ecx, (%eax)
+; CHECK-NEXT: retl
+
%X = load <4 x i32>* %P1
%tmp = extractelement <4 x i32> %X, i32 3
store i32 %tmp, i32* %R
@@ -25,6 +48,12 @@ define void @t3(i32* %R, <4 x i32>* %P1) nounwind {
}
define i32 @t4(<4 x i32>* %P1) nounwind {
+; CHECK-LABEL: t4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl 12(%eax), %eax
+; CHECK-NEXT: retl
+
%X = load <4 x i32>* %P1
%tmp = extractelement <4 x i32> %X, i32 3
ret i32 %tmp
diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll
index 88f5a58..6df7be7 100644
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll
@@ -1,10 +1,17 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 -o %t
-; RUN: grep movss %t | count 4
-; RUN: grep movhlps %t | count 1
-; RUN: not grep pshufd %t
-; RUN: grep unpckhpd %t | count 1
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
define void @test1(<4 x float>* %F, float* %f) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movaps (%ecx), %xmm0
+; CHECK-NEXT: addps %xmm0, %xmm0
+; CHECK-NEXT: movss %xmm0, (%eax)
+; CHECK-NEXT: retl
+entry:
%tmp = load <4 x float>* %F ; <<4 x float>> [#uses=2]
%tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1]
%tmp2 = extractelement <4 x float> %tmp7, i32 0 ; <float> [#uses=1]
@@ -13,6 +20,18 @@ define void @test1(<4 x float>* %F, float* %f) nounwind {
}
define float @test2(<4 x float>* %F, float* %f) nounwind {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushl %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movaps (%eax), %xmm0
+; CHECK-NEXT: addps %xmm0, %xmm0
+; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: movss %xmm0, (%esp)
+; CHECK-NEXT: flds (%esp)
+; CHECK-NEXT: popl %eax
+; CHECK-NEXT: retl
+entry:
%tmp = load <4 x float>* %F ; <<4 x float>> [#uses=2]
%tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1]
%tmp2 = extractelement <4 x float> %tmp7, i32 2 ; <float> [#uses=1]
@@ -20,6 +39,14 @@ define float @test2(<4 x float>* %F, float* %f) nounwind {
}
define void @test3(float* %R, <4 x float>* %P1) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movss 12(%ecx), %xmm0
+; CHECK-NEXT: movss %xmm0, (%eax)
+; CHECK-NEXT: retl
+entry:
%X = load <4 x float>* %P1 ; <<4 x float>> [#uses=1]
%tmp = extractelement <4 x float> %X, i32 3 ; <float> [#uses=1]
store float %tmp, float* %R
@@ -27,6 +54,17 @@ define void @test3(float* %R, <4 x float>* %P1) nounwind {
}
define double @test4(double %A) nounwind {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: calll foo
+; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: addsd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: movsd %xmm0, (%esp)
+; CHECK-NEXT: fldl (%esp)
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
+entry:
%tmp1 = call <2 x double> @foo( ) ; <<2 x double>> [#uses=1]
%tmp2 = extractelement <2 x double> %tmp1, i32 1 ; <double> [#uses=1]
%tmp3 = fadd double %tmp2, %A ; <double> [#uses=1]
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
index 82517cb..ac02acf 100644
--- a/test/CodeGen/X86/vec_fabs.ll
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s
define <2 x double> @fabs_v2f64(<2 x double> %p)
{
- ; CHECK: fabs_v2f64
+ ; CHECK-LABEL: fabs_v2f64
; CHECK: vandps
%t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
ret <2 x double> %t
@@ -12,7 +12,7 @@ declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
define <4 x float> @fabs_v4f32(<4 x float> %p)
{
- ; CHECK: fabs_v4f32
+ ; CHECK-LABEL: fabs_v4f32
; CHECK: vandps
%t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
ret <4 x float> %t
@@ -21,7 +21,7 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
define <4 x double> @fabs_v4f64(<4 x double> %p)
{
- ; CHECK: fabs_v4f64
+ ; CHECK-LABEL: fabs_v4f64
; CHECK: vandps
%t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
ret <4 x double> %t
@@ -30,9 +30,46 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
define <8 x float> @fabs_v8f32(<8 x float> %p)
{
- ; CHECK: fabs_v8f32
+ ; CHECK-LABEL: fabs_v8f32
; CHECK: vandps
%t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+
+; PR20354: when generating code for a vector fabs op,
+; make sure that we're only turning off the sign bit of each float value.
+; No constant pool loads or vector ops are needed for the fabs of a
+; bitcasted integer constant; we should just return an integer constant
+; that has the sign bits turned off.
+;
+; So instead of something like this:
+; movabsq (constant pool load of mask for sign bits)
+; vmovq (move from integer register to vector/fp register)
+; vandps (mask off sign bits)
+; vmovq (move vector/fp register back to integer return register)
+;
+; We should generate:
+; mov (put constant value in return register)
+
+define i64 @fabs_v2f32_1() {
+; CHECK-LABEL: fabs_v2f32_1:
+; CHECK: movabsq $9223372032559808512, %rax # imm = 0x7FFFFFFF00000000
+; CHECK-NEXT: retq
+ %bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+}
+
+define i64 @fabs_v2f32_2() {
+; CHECK-LABEL: fabs_v2f32_2:
+; CHECK: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK-NEXT: retq
+ %bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+}
+
+declare <2 x float> @llvm.fabs.v2f32(<2 x float> %p)
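Worked through by hand, the two new folds are plain mask arithmetic on the i64 bit
patterns, clearing the sign bit of each 32-bit half:

  0xFFFFFFFF00000000 & 0x7FFFFFFF7FFFFFFF = 0x7FFFFFFF00000000  (9223372032559808512)
  0x00000000FFFFFFFF & 0x7FFFFFFF7FFFFFFF = 0x000000007FFFFFFF  (2147483647)

which are exactly the immediates the movabsq and movl checks expect.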
diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll
index d49c70e..9743f71 100644
--- a/test/CodeGen/X86/vec_fneg.ll
+++ b/test/CodeGen/X86/vec_fneg.ll
@@ -1,11 +1,45 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
+; FNEG is defined as subtraction from -0.0.
+
+; This test verifies that we use an xor with a constant to flip the sign bits; no subtraction needed.
define <4 x float> @t1(<4 x float> %Q) {
- %tmp15 = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q
- ret <4 x float> %tmp15
+; CHECK-LABEL: t1:
+; CHECK: xorps {{.*}}LCPI0_0{{.*}}, %xmm0
+; CHECK-NEXT: retq
+ %tmp = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q
+ ret <4 x float> %tmp
}
+; This test verifies that we generate an FP subtraction because "0.0 - x" is not an fneg.
define <4 x float> @t2(<4 x float> %Q) {
- %tmp15 = fsub <4 x float> zeroinitializer, %Q
- ret <4 x float> %tmp15
+; CHECK-LABEL: t2:
+; CHECK: xorps %[[X:xmm[0-9]+]], %[[X]]
+; CHECK-NEXT: subps %xmm0, %[[X]]
+; CHECK-NEXT: movaps %[[X]], %xmm0
+; CHECK-NEXT: retq
+ %tmp = fsub <4 x float> zeroinitializer, %Q
+ ret <4 x float> %tmp
+}
+
+; If we're bitcasting an integer to an FP vector, we should avoid the FPU/vector unit entirely.
+; Make sure that we're flipping the sign bit and only the sign bit of each float.
+; So instead of something like this:
+; movd %rdi, %xmm0
+; xorps .LCPI2_0(%rip), %xmm0
+;
+; We should generate:
+; movabsq (put sign bit mask in integer register)
+; xorq (flip sign bits)
+; movd (move to xmm return register)
+
+define <2 x float> @fneg_bitcast(i64 %i) {
+; CHECK-LABEL: fneg_bitcast:
+; CHECK: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000
+; CHECK-NEXT: xorq %rdi, %rax
+; CHECK-NEXT: movd %rax, %xmm0
+; CHECK-NEXT: retq
+ %bitcast = bitcast i64 %i to <2 x float>
+ %fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast
+ ret <2 x float> %fneg
}
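
A similar hedged cross-check (again mine, not part of the patch; @fneg_mask_sketch is a made-up name): the mask that flips only the two per-lane sign bits is 0x8000000080000000, which as a signed i64 immediate prints as -9223372034707292160, matching the movabsq value expected in fneg_bitcast:

; Scalar form of the per-lane sign-bit flip used by fneg_bitcast.
define i64 @fneg_mask_sketch(i64 %i) {
  %flipped = xor i64 %i, -9223372034707292160 ; xor with 0x8000000080000000
  ret i64 %flipped
}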
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 7ec07ae..b882a5e 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -3,6 +3,8 @@
; PR11674
define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
+; CHECK-LABEL: fpext_frommem:
+; AVX-LABEL: fpext_frommem:
entry:
; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
@@ -13,6 +15,8 @@ entry:
}
define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
+; CHECK-LABEL: fpext_frommem4:
+; AVX-LABEL: fpext_frommem4:
entry:
; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
@@ -24,6 +28,8 @@ entry:
}
define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
+; CHECK-LABEL: fpext_frommem8:
+; AVX-LABEL: fpext_frommem8:
entry:
; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index 5cb9f69..b72044a 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -2,66 +2,87 @@
; There are no MMX operations in @t1
define void @t1(i32 %a, x86_mmx* %P) nounwind {
- %tmp12 = shl i32 %a, 12
- %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
- %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
- %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
- store x86_mmx %tmp23, x86_mmx* %P
- ret void
-
; CHECK-LABEL: t1:
-; CHECK-NOT: %mm
-; CHECK: shll $12
-; CHECK-NOT: %mm
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: shll $12, %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
+; CHECK-NEXT: movlpd %xmm0, (%eax)
+; CHECK-NEXT: retl
+ %tmp12 = shl i32 %a, 12
+ %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
+ %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
+ %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
+ store x86_mmx %tmp23, x86_mmx* %P
+ ret void
}
define <4 x float> @t2(<4 x float>* %P) nounwind {
- %tmp1 = load <4 x float>* %P
- %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
- ret <4 x float> %tmp2
-
; CHECK-LABEL: t2:
-; CHECK: pslldq $12
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movaps (%eax), %xmm1
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; CHECK-NEXT: retl
+ %tmp1 = load <4 x float>* %P
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
+ ret <4 x float> %tmp2
}
define <4 x float> @t3(<4 x float>* %P) nounwind {
- %tmp1 = load <4 x float>* %P
- %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
- ret <4 x float> %tmp2
-
; CHECK-LABEL: t3:
-; CHECK: psrldq $8
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movaps (%eax), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,0]
+; CHECK-NEXT: retl
+ %tmp1 = load <4 x float>* %P
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
+ ret <4 x float> %tmp2
}
define <4 x float> @t4(<4 x float>* %P) nounwind {
- %tmp1 = load <4 x float>* %P
- %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
- ret <4 x float> %tmp2
-
; CHECK-LABEL: t4:
-; CHECK: psrldq $12
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movaps (%eax), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; CHECK-NEXT: retl
+ %tmp1 = load <4 x float>* %P
+ %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
+ ret <4 x float> %tmp2
}
define <16 x i8> @t5(<16 x i8> %x) nounwind {
- %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
- ret <16 x i8> %s
-
; CHECK-LABEL: t5:
-; CHECK: psrldq $1
+; CHECK: # BB#0:
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: retl
+ %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
+ ret <16 x i8> %s
}
define <16 x i8> @t6(<16 x i8> %x) nounwind {
- %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i8> %s
-
; CHECK-LABEL: t6:
-; CHECK: palignr $1
+; CHECK: # BB#0:
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: retl
+ %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
}
define <16 x i8> @t7(<16 x i8> %x) nounwind {
- %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
- ret <16 x i8> %s
-
; CHECK-LABEL: t7:
-; CHECK: pslldq $13
+; CHECK: # BB#0:
+; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; CHECK-NEXT: retl
+ %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
+ ret <16 x i8> %s
}
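
The rewritten CHECK lines for t5, t6, and t7 spell out the byte-shift semantics lane by lane. On my reading (an aside, not text from the patch): psrldq shifts the full 128-bit register right by N bytes and fills with zeros, so a shuffle that takes bytes 1..15 of %x followed by one zero byte is exactly psrldq $1; that is why t5 (zero-vector source for the last byte) and t6 (undef source) now lower to the same instruction. The canonical IR pattern, under a hypothetical name:

; One-byte right shift as a shuffle; index 16 reads lane 0 of the zero vector.
define <16 x i8> @psrldq1_sketch(<16 x i8> %x) {
  %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  ret <16 x i8> %s
}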
diff --git a/test/CodeGen/X86/vec_insert-6.ll b/test/CodeGen/X86/vec_insert-6.ll
deleted file mode 100644
index 4583e19..0000000
--- a/test/CodeGen/X86/vec_insert-6.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | grep pslldq
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -mtriple=i686-apple-darwin9 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 6
-
-define <4 x float> @t3(<4 x float>* %P) nounwind {
- %tmp1 = load <4 x float>* %P
- %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
- ret <4 x float> %tmp2
-}
diff --git a/test/CodeGen/X86/vec_insert.ll b/test/CodeGen/X86/vec_insert.ll
deleted file mode 100644
index 0ed8f10..0000000
--- a/test/CodeGen/X86/vec_insert.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep movss | count 1
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | not grep pinsrw
-
-define void @test(<4 x float>* %F, i32 %I) nounwind {
- %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=1]
- %f = sitofp i32 %I to float ; <float> [#uses=1]
- %tmp1 = insertelement <4 x float> %tmp, float %f, i32 0 ; <<4 x float>> [#uses=2]
- %tmp18 = fadd <4 x float> %tmp1, %tmp1 ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp18, <4 x float>* %F
- ret void
-}
-
-define void @test2(<4 x float>* %F, i32 %I, float %g) nounwind {
- %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=1]
- %f = sitofp i32 %I to float ; <float> [#uses=1]
- %tmp1 = insertelement <4 x float> %tmp, float %f, i32 2 ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp1, <4 x float>* %F
- ret void
-}
diff --git a/test/CodeGen/X86/vec_return.ll b/test/CodeGen/X86/vec_return.ll
index 2cf5dc6..f7fcd03 100644
--- a/test/CodeGen/X86/vec_return.ll
+++ b/test/CodeGen/X86/vec_return.ll
@@ -10,7 +10,7 @@ define <2 x double> @test() {
; Prefer a constant pool load here.
; CHECK: test2
; CHECK-NOT: shuf
-; CHECK: movaps {{.*}}CPI
+; CHECK: movaps {{.*}}{{CPI|__xmm@}}
define <4 x i32> @test2() nounwind {
ret <4 x i32> < i32 0, i32 0, i32 1, i32 0 >
}
diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll
index d1d7608..a13c813 100644
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll
@@ -1,17 +1,37 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -o %t
-; RUN: grep pshufd %t | count 2
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s
-define <4 x float> @test(float %a) nounwind {
- %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 ; <<4 x float>> [#uses=1]
- %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
- %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp6
+define <4 x float> @test(float %a) {
+; CHECK-LABEL: test:
+; CHECK: insertps $29, {{.*}}, %xmm0
+; CHECK-NEXT: retl
+
+entry:
+ %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
+ %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2
+ %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3
+ ret <4 x float> %tmp6
}
-define <2 x i64> @test2(i32 %a) nounwind {
- %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2 ; <<4 x i32>> [#uses=1]
- %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3 ; <<4 x i32>> [#uses=1]
- %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp10
+define <2 x i64> @test2(i32 %a) {
+; CHECK-LABEL: test2:
+; CHECK: movd {{.*}}, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; CHECK-NEXT: retl
+
+entry:
+ %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2
+ %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3
+ %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>
+ ret <2 x i64> %tmp10
}
+define <4 x float> @test3(<4 x float> %A) {
+; CHECK-LABEL: test3:
+; CHECK: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; CHECK-NEXT: retl
+
+ %tmp0 = extractelement <4 x float> %A, i32 0
+ %tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1
+ %tmp2 = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 2
+ ret <4 x float> %tmp2
+}
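
A hedged decoding note for the insertps immediates above (my reading of the SSE4.1 encoding, not text from the patch): imm8 bits [7:6] pick the source lane, bits [5:4] the destination lane, and bits [3:0] form a zero mask. So:

;   $29 = 0b00_01_1101
;           src lane 0, dest lane 1, zero lanes 0, 2, 3
;   ->  xmm0 = zero,xmm0[0],zero,zero

which matches both the $29 check in @test and the lane-by-lane form in @test3.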
diff --git a/test/CodeGen/X86/vec_set-5.ll b/test/CodeGen/X86/vec_set-5.ll
deleted file mode 100644
index f811a74..0000000
--- a/test/CodeGen/X86/vec_set-5.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
-; RUN: grep movlhps %t | count 1
-; RUN: grep movq %t | count 2
-
-define <4 x float> @test1(float %a, float %b) nounwind {
- %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1]
- %tmp6 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
- %tmp8 = insertelement <4 x float> %tmp6, float %b, i32 2 ; <<4 x float>> [#uses=1]
- %tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp9
-}
-
-define <4 x float> @test2(float %a, float %b) nounwind {
- %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1]
- %tmp7 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
- %tmp8 = insertelement <4 x float> %tmp7, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
- %tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp9
-}
-
-define <2 x i64> @test3(i32 %a, i32 %b) nounwind {
- %tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0 ; <<4 x i32>> [#uses=1]
- %tmp6 = insertelement <4 x i32> %tmp, i32 %b, i32 1 ; <<4 x i32>> [#uses=1]
- %tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2 ; <<4 x i32>> [#uses=1]
- %tmp10 = insertelement <4 x i32> %tmp8, i32 0, i32 3 ; <<4 x i32>> [#uses=1]
- %tmp11 = bitcast <4 x i32> %tmp10 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp11
-}
diff --git a/test/CodeGen/X86/vec_set-9.ll b/test/CodeGen/X86/vec_set-9.ll
deleted file mode 100644
index a739090..0000000
--- a/test/CodeGen/X86/vec_set-9.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mattr=-avx,-pad-short-functions | FileCheck %s
-
-; CHECK: test3
-; CHECK: movd
-; CHECK-NOT: movd
-; CHECK: {{movlhps.*%xmm0, %xmm0}}
-; CHECK-NEXT: ret
-
-define <2 x i64> @test3(i64 %A) nounwind {
-entry:
- %B = insertelement <2 x i64> undef, i64 %A, i32 1
- ret <2 x i64> %B
-}
-
diff --git a/test/CodeGen/X86/vec_set-E.ll b/test/CodeGen/X86/vec_set-E.ll
deleted file mode 100644
index d78be66..0000000
--- a/test/CodeGen/X86/vec_set-E.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movq
-
-define <4 x float> @t(float %X) nounwind {
- %tmp11 = insertelement <4 x float> undef, float %X, i32 0
- %tmp12 = insertelement <4 x float> %tmp11, float %X, i32 1
- %tmp27 = insertelement <4 x float> %tmp12, float 0.000000e+00, i32 2
- %tmp28 = insertelement <4 x float> %tmp27, float 0.000000e+00, i32 3
- ret <4 x float> %tmp28
-}
diff --git a/test/CodeGen/X86/vec_set-G.ll b/test/CodeGen/X86/vec_set-G.ll
deleted file mode 100644
index 4a542fe..0000000
--- a/test/CodeGen/X86/vec_set-G.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss
-
-define fastcc void @t(<4 x float> %A) nounwind {
- %tmp41896 = extractelement <4 x float> %A, i32 0 ; <float> [#uses=1]
- %tmp14082 = insertelement <4 x float> < float 0.000000e+00, float undef, float undef, float undef >, float %tmp41896, i32 1 ; <<4 x float>> [#uses=1]
- %tmp14083 = insertelement <4 x float> %tmp14082, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp14083, <4 x float>* null, align 16
- ret void
-}
diff --git a/test/CodeGen/X86/vec_set-I.ll b/test/CodeGen/X86/vec_set-I.ll
deleted file mode 100644
index c5d6ab8..0000000
--- a/test/CodeGen/X86/vec_set-I.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-
-; CHECK-NOT: xorp
-; CHECK: movd
-; CHECK-NOT: xorp
-
-define void @t1() nounwind {
- %tmp298.i.i = load <4 x float>* null, align 16
- %tmp304.i.i = bitcast <4 x float> %tmp298.i.i to <4 x i32>
- %tmp305.i.i = and <4 x i32> %tmp304.i.i, < i32 -1, i32 0, i32 0, i32 0 >
- store <4 x i32> %tmp305.i.i, <4 x i32>* null, align 16
- unreachable
-}
diff --git a/test/CodeGen/X86/vec_set-J.ll b/test/CodeGen/X86/vec_set-J.ll
deleted file mode 100644
index d90ab85..0000000
--- a/test/CodeGen/X86/vec_set-J.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss
-; PR2472
-
-define <4 x i32> @a(<4 x i32> %a) nounwind {
-entry:
- %vecext = extractelement <4 x i32> %a, i32 0
- insertelement <4 x i32> zeroinitializer, i32 %vecext, i32 0
- %add = add <4 x i32> %a, %0
- ret <4 x i32> %add
-}
diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll
index 322dbae..b69f90c 100644
--- a/test/CodeGen/X86/vec_setcc.ll
+++ b/test/CodeGen/X86/vec_setcc.ll
@@ -62,8 +62,7 @@ define <8 x i16> @v8i16_icmp_ule(<8 x i16> %a, <8 x i16> %b) nounwind readnone s
; SSE2-LABEL: v8i16_icmp_ule:
; SSE2: psubusw %xmm1, %xmm0
; SSE2: pxor %xmm1, %xmm1
-; SSE2: pcmpeqw %xmm0, %xmm1
-; SSE2: movdqa %xmm1, %xmm0
+; SSE2: pcmpeqw %xmm1, %xmm0
; SSE41-LABEL: v8i16_icmp_ule:
; SSE41: pminuw %xmm0, %xmm1
@@ -106,8 +105,7 @@ define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone s
; SSE2: pxor %xmm2, %xmm0
; SSE2: pcmpgtd %xmm1, %xmm0
; SSE2: pcmpeqd %xmm1, %xmm1
-; SSE2: pxor %xmm0, %xmm1
-; SSE2: movdqa %xmm1, %xmm0
+; SSE2: pxor %xmm1, %xmm0
; SSE41-LABEL: v4i32_icmp_ule:
; SSE41: pminud %xmm0, %xmm1
diff --git a/test/CodeGen/X86/vec_sext.ll b/test/CodeGen/X86/vec_sext.ll
deleted file mode 100644
index 776ddec..0000000
--- a/test/CodeGen/X86/vec_sext.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: llc < %s -march=x86-64
-; PR 9267
-
-define<4 x i32> @func_16_32() {
- %F = load <4 x i16>* undef
- %G = sext <4 x i16> %F to <4 x i32>
- %H = load <4 x i16>* undef
- %Y = sext <4 x i16> %H to <4 x i32>
- %T = add <4 x i32> %Y, %G
- store <4 x i32>%T , <4 x i32>* undef
- ret <4 x i32> %T
-}
-
-define<4 x i64> @func_16_64() {
- %F = load <4 x i16>* undef
- %G = sext <4 x i16> %F to <4 x i64>
- %H = load <4 x i16>* undef
- %Y = sext <4 x i16> %H to <4 x i64>
- %T = xor <4 x i64> %Y, %G
- store <4 x i64>%T , <4 x i64>* undef
- ret <4 x i64> %T
-}
-
-define<4 x i64> @func_32_64() {
- %F = load <4 x i32>* undef
- %G = sext <4 x i32> %F to <4 x i64>
- %H = load <4 x i32>* undef
- %Y = sext <4 x i32> %H to <4 x i64>
- %T = or <4 x i64> %Y, %G
- ret <4 x i64> %T
-}
-
-define<4 x i16> @func_8_16() {
- %F = load <4 x i8>* undef
- %G = sext <4 x i8> %F to <4 x i16>
- %H = load <4 x i8>* undef
- %Y = sext <4 x i8> %H to <4 x i16>
- %T = add <4 x i16> %Y, %G
- ret <4 x i16> %T
-}
-
-define<4 x i32> @func_8_32() {
- %F = load <4 x i8>* undef
- %G = sext <4 x i8> %F to <4 x i32>
- %H = load <4 x i8>* undef
- %Y = sext <4 x i8> %H to <4 x i32>
- %T = sub <4 x i32> %Y, %G
- ret <4 x i32> %T
-}
-
-define<4 x i64> @func_8_64() {
- %F = load <4 x i8>* undef
- %G = sext <4 x i8> %F to <4 x i64>
- %H = load <4 x i8>* undef
- %Y = sext <4 x i8> %H to <4 x i64>
- %T = add <4 x i64> %Y, %G
- ret <4 x i64> %T
-}
-
-define<4 x i32> @const_16_32() {
- %G = sext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i32>
- ret <4 x i32> %G
-}
-
-define<4 x i64> @const_16_64() {
- %G = sext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i64>
- ret <4 x i64> %G
-}
-
diff --git a/test/CodeGen/X86/vec_shuffle-11.ll b/test/CodeGen/X86/vec_shuffle-11.ll
deleted file mode 100644
index 640745a..0000000
--- a/test/CodeGen/X86/vec_shuffle-11.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | not grep mov
-
-define <4 x i32> @test() nounwind {
- %tmp131 = call <2 x i64> @llvm.x86.sse2.psrl.dq( <2 x i64> < i64 -1, i64 -1 >, i32 96 ) ; <<2 x i64>> [#uses=1]
- %tmp137 = bitcast <2 x i64> %tmp131 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp138 = and <4 x i32> %tmp137, bitcast (<2 x i64> < i64 -1, i64 -1 > to <4 x i32>) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %tmp138
-}
-
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32)
diff --git a/test/CodeGen/X86/vec_shuffle-14.ll b/test/CodeGen/X86/vec_shuffle-14.ll
deleted file mode 100644
index 8f25197..0000000
--- a/test/CodeGen/X86/vec_shuffle-14.ll
+++ /dev/null
@@ -1,70 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-32
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-64
-
-define <4 x i32> @t1(i32 %a) nounwind {
-entry:
- %tmp = insertelement <4 x i32> undef, i32 %a, i32 0
- %tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp, <4 x i32> < i32 4, i32 1, i32 2, i32 3 > ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %tmp6
-
-; X86-32-LABEL: t1:
-; X86-32: movd 4(%esp), %xmm0
-
-; X86-64-LABEL: t1:
-; X86-64: movd %e{{..}}, %xmm0
-}
-
-define <2 x i64> @t2(i64 %a) nounwind {
-entry:
- %tmp = insertelement <2 x i64> undef, i64 %a, i32 0
- %tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %tmp, <2 x i32> < i32 2, i32 1 > ; <<4 x i32>> [#uses=1]
- ret <2 x i64> %tmp6
-
-; X86-32-LABEL: t2:
-; X86-32: movq 4(%esp), %xmm0
-
-; X86-64-LABEL: t2:
-; X86-64: movd %r{{..}}, %xmm0
-}
-
-define <2 x i64> @t3(<2 x i64>* %a) nounwind {
-entry:
- %tmp4 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
- %tmp6 = bitcast <2 x i64> %tmp4 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp7 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp6, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x i32>> [#uses=1]
- %tmp8 = bitcast <4 x i32> %tmp7 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp8
-
-; X86-32-LABEL: t3:
-; X86-32: movl 4(%esp)
-; X86-32: movq
-
-; X86-64-LABEL: t3:
-; X86-64: movq ({{.*}}), %xmm0
-}
-
-define <2 x i64> @t4(<2 x i64> %a) nounwind {
-entry:
- %tmp5 = bitcast <2 x i64> %a to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp5, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x i32>> [#uses=1]
- %tmp7 = bitcast <4 x i32> %tmp6 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp7
-
-; X86-32-LABEL: t4:
-; X86-32: movq %xmm0, %xmm0
-
-; X86-64-LABEL: t4:
-; X86-64: movq {{.*}}, %xmm0
-}
-
-define <2 x i64> @t5(<2 x i64> %a) nounwind {
-entry:
- %tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <2 x i32> < i32 2, i32 1 > ; <<4 x i32>> [#uses=1]
- ret <2 x i64> %tmp6
-
-; X86-32-LABEL: t5:
-; X86-32: movq %xmm0, %xmm0
-
-; X86-64-LABEL: t5:
-; X86-64: movq {{.*}}, %xmm0
-}
diff --git a/test/CodeGen/X86/vec_shuffle-15.ll b/test/CodeGen/X86/vec_shuffle-15.ll
deleted file mode 100644
index 5a9b8fd..0000000
--- a/test/CodeGen/X86/vec_shuffle-15.ll
+++ /dev/null
@@ -1,81 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
-
-define <2 x i64> @t00(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 0 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t01(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 1 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t02(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 2 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t03(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 3 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t10(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 0 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t11(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 1 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t12(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 2 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t13(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 3 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t20(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 0 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t21(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 1 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t22(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 2 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t23(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 3 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t30(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 0 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t31(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 1 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t32(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 2 >
- ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t33(<2 x i64> %a, <2 x i64> %b) nounwind {
- %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 3 >
- ret <2 x i64> %tmp
-}
diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll
deleted file mode 100644
index 9aeb942..0000000
--- a/test/CodeGen/X86/vec_shuffle-16.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse
-; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2
-
-; sse-LABEL: t1:
-; sse2-LABEL: t1:
-define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind {
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
- %tmp1 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
- ret <4 x float> %tmp1
-}
-
-; sse-LABEL: t2:
-; sse2-LABEL: t2:
-define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind {
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
- %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 3, i32 3, i32 3, i32 3 >
- ret <4 x float> %tmp
-}
-
-; sse-LABEL: t3:
-; sse2-LABEL: t3:
-define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind {
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
- %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 4, i32 4, i32 4, i32 4 >
- ret <4 x float> %tmp
-}
-
-; sse-LABEL: t4:
-; sse2-LABEL: t4:
-define <4 x float> @t4(<4 x float> %A, <4 x float> %B) nounwind {
-
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
- %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 1, i32 3, i32 2, i32 0 >
- ret <4 x float> %tmp
-}
diff --git a/test/CodeGen/X86/vec_shuffle-17.ll b/test/CodeGen/X86/vec_shuffle-17.ll
deleted file mode 100644
index f2f96ba..0000000
--- a/test/CodeGen/X86/vec_shuffle-17.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s
-; CHECK-NOT: xor
-; CHECK: movd {{%rdi|%rcx}}, %xmm0
-; CHECK-NOT: xor
-; PR2108
-
-define <2 x i64> @doload64(i64 %x) nounwind {
-entry:
- %tmp717 = bitcast i64 %x to double ; <double> [#uses=1]
- %tmp8 = insertelement <2 x double> undef, double %tmp717, i32 0 ; <<2 x double>> [#uses=1]
- %tmp9 = insertelement <2 x double> %tmp8, double 0.000000e+00, i32 1 ; <<2 x double>> [#uses=1]
- %tmp11 = bitcast <2 x double> %tmp9 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp11
-}
-
diff --git a/test/CodeGen/X86/vec_shuffle-18.ll b/test/CodeGen/X86/vec_shuffle-18.ll
deleted file mode 100644
index 1104a4a..0000000
--- a/test/CodeGen/X86/vec_shuffle-18.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8.8.0 | grep mov | count 7
-
- %struct.vector4_t = type { <4 x float> }
-
-define void @swizzle(i8* %a, %struct.vector4_t* %b, %struct.vector4_t* %c) nounwind {
-entry:
- %tmp9 = getelementptr %struct.vector4_t* %b, i32 0, i32 0 ; <<4 x float>*> [#uses=2]
- %tmp10 = load <4 x float>* %tmp9, align 16 ; <<4 x float>> [#uses=1]
- %tmp14 = bitcast i8* %a to double* ; <double*> [#uses=1]
- %tmp15 = load double* %tmp14 ; <double> [#uses=1]
- %tmp16 = insertelement <2 x double> undef, double %tmp15, i32 0 ; <<2 x double>> [#uses=1]
- %tmp18 = bitcast <2 x double> %tmp16 to <4 x float> ; <<4 x float>> [#uses=1]
- %tmp19 = shufflevector <4 x float> %tmp10, <4 x float> %tmp18, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp19, <4 x float>* %tmp9, align 16
- %tmp28 = getelementptr %struct.vector4_t* %c, i32 0, i32 0 ; <<4 x float>*> [#uses=2]
- %tmp29 = load <4 x float>* %tmp28, align 16 ; <<4 x float>> [#uses=1]
- %tmp26 = getelementptr i8* %a, i32 8 ; <i8*> [#uses=1]
- %tmp33 = bitcast i8* %tmp26 to double* ; <double*> [#uses=1]
- %tmp34 = load double* %tmp33 ; <double> [#uses=1]
- %tmp35 = insertelement <2 x double> undef, double %tmp34, i32 0 ; <<2 x double>> [#uses=1]
- %tmp37 = bitcast <2 x double> %tmp35 to <4 x float> ; <<4 x float>> [#uses=1]
- %tmp38 = shufflevector <4 x float> %tmp29, <4 x float> %tmp37, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp38, <4 x float>* %tmp28, align 16
- ret void
-}
diff --git a/test/CodeGen/X86/vec_shuffle-19.ll b/test/CodeGen/X86/vec_shuffle-19.ll
deleted file mode 100644
index 48db8de..0000000
--- a/test/CodeGen/X86/vec_shuffle-19.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; REQUIRES: asserts
-; RUN: llc < %s -o /dev/null -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4
-; PR2485
-
-define <4 x i32> @t(<4 x i32> %a, <4 x i32> %b) nounwind {
-entry:
- %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> < i32 4, i32 0, i32 0, i32 0 > ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %shuffle
-}
diff --git a/test/CodeGen/X86/vec_shuffle-20.ll b/test/CodeGen/X86/vec_shuffle-20.ll
deleted file mode 100644
index 5a2c444..0000000
--- a/test/CodeGen/X86/vec_shuffle-20.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; REQUIRES: asserts
-; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2
-
-define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind {
-entry:
- shufflevector <4 x float> %fp0, <4 x float> %fp1, <4 x i32> < i32 0, i32 1, i32 2, i32 7 > ; <<4 x float>>:0 [#uses=1]
- ret <4 x float> %0
-}
diff --git a/test/CodeGen/X86/vec_shuffle-22.ll b/test/CodeGen/X86/vec_shuffle-22.ll
deleted file mode 100644
index 6807e4d..0000000
--- a/test/CodeGen/X86/vec_shuffle-22.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium-m | FileCheck %s
-
-define <4 x float> @t1(<4 x float> %a) nounwind {
-; CHECK: movlhps
- %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 > ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp1
-}
-
-define <4 x i32> @t2(<4 x i32>* %a) nounwind {
-; CHECK: pshufd
-; CHECK: ret
- %tmp1 = load <4 x i32>* %a
- %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 > ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %tmp2
-}
diff --git a/test/CodeGen/X86/vec_shuffle-23.ll b/test/CodeGen/X86/vec_shuffle-23.ll
deleted file mode 100644
index 2468735..0000000
--- a/test/CodeGen/X86/vec_shuffle-23.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep punpck
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pshufd
-
-define i32 @t() nounwind {
-entry:
- %a = alloca <4 x i32> ; <<4 x i32>*> [#uses=2]
- %b = alloca <4 x i32> ; <<4 x i32>*> [#uses=5]
- store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a
- %tmp = load <4 x i32>* %a ; <<4 x i32>> [#uses=1]
- store <4 x i32> %tmp, <4 x i32>* %b
- %tmp1 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1]
- %tmp2 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1]
- %punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x i32>> [#uses=1]
- store <4 x i32> %punpckldq, <4 x i32>* %b
- %tmp3 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1]
- %result = extractelement <4 x i32> %tmp3, i32 0 ; <i32> [#uses=1]
- ret i32 %result
-}
diff --git a/test/CodeGen/X86/vec_shuffle-24.ll b/test/CodeGen/X86/vec_shuffle-24.ll
deleted file mode 100644
index d038daf..0000000
--- a/test/CodeGen/X86/vec_shuffle-24.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-
-define i32 @t() nounwind optsize {
-entry:
-; CHECK: punpckldq
- %a = alloca <4 x i32> ; <<4 x i32>*> [#uses=2]
- %b = alloca <4 x i32> ; <<4 x i32>*> [#uses=5]
- store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a
- %tmp = load <4 x i32>* %a ; <<4 x i32>> [#uses=1]
- store <4 x i32> %tmp, <4 x i32>* %b
- %tmp1 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1]
- %tmp2 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1]
- %punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x i32>> [#uses=1]
- store <4 x i32> %punpckldq, <4 x i32>* %b
- %tmp3 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1]
- %result = extractelement <4 x i32> %tmp3, i32 0 ; <i32> [#uses=1]
- ret i32 %result
-}
diff --git a/test/CodeGen/X86/vec_shuffle-25.ll b/test/CodeGen/X86/vec_shuffle-25.ll
deleted file mode 100644
index 3f42a13..0000000
--- a/test/CodeGen/X86/vec_shuffle-25.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=sse4.1 -o %t
-; RUN: grep unpcklps %t | count 3
-; RUN: grep unpckhps %t | count 1
-
-; Transpose example using the more generic vector shuffle. We return
-; float8 instead of float16 since x86 can return that in register.
-; ModuleID = 'transpose2_opt.bc'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i386-apple-cl.1.0"
-@r0 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1]
-@r1 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1]
-@r2 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1]
-@r3 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1]
-
-define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
-entry:
- %unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
- %unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2]
- %unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
- %unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2]
- %unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
- %unpcklps14a = shufflevector <4 x float> %unpcklps14, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
- %unpckhps17a = shufflevector <4 x float> %unpckhps17, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %r1 = shufflevector <16 x float> %unpcklps14a, <16 x float> %unpckhps17a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
- %unpcklps20a = shufflevector <4 x float> %unpcklps20, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %r2 = shufflevector <16 x float> %r1, <16 x float> %unpcklps20a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
- %unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
- %unpckhps23a = shufflevector <4 x float> %unpckhps23, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %r3 = shufflevector <16 x float> %r2, <16 x float> %unpckhps23a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
- %r4 = shufflevector <16 x float> %r3, <16 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x float> %r4
-}
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
deleted file mode 100644
index 00e8e73..0000000
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse4.1 | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
-
-; Transpose example using the more generic vector shuffle. Return float8
-; instead of float16
-; ModuleID = 'transpose2_opt.bc'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i386-apple-cl.1.0"
-@r0 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1]
-@r1 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1]
-@r2 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1]
-@r3 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1]
-
-define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
-entry:
-; CHECK: transpose2
-; CHECK: unpckhps
-; CHECK: unpckhps
-; CHECK: unpcklps
-; CHECK: unpckhps
-; Different instruction order for Atom.
-; ATOM: transpose2
-; ATOM: unpckhps
-; ATOM: unpckhps
-; ATOM: unpckhps
-; ATOM: unpcklps
- %unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
- %unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2]
- %unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
- %unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2]
- %unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
- %unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
- %r1 = shufflevector <4 x float> %unpcklps14, <4 x float> %unpckhps17, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
- %unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
- %unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
- %r2 = shufflevector <4 x float> %unpcklps20, <4 x float> %unpckhps23, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
-; %r3 = shufflevector <8 x float> %r1, <8 x float> %r2, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >;
- ret <8 x float> %r2
-}
-
-define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind {
-entry:
-; movhps should happen before extractps to assure it gets the correct value.
-; CHECK: lo_hi_shift
-; CHECK: movhps ([[BASEREG:%[a-z]+]]),
-; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-; ATOM: lo_hi_shift
-; ATOM: movhps ([[BASEREG:%[a-z]+]]),
-; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
- %v.i = bitcast float* %y to <4 x float>*
- %0 = load <4 x float>* %v.i, align 1
- %1 = bitcast float* %x to <1 x i64>*
- %.val = load <1 x i64>* %1, align 1
- %2 = bitcast <1 x i64> %.val to <2 x float>
- %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
- %cast.i = bitcast <4 x float> %0 to <2 x i64>
- %extract.i = extractelement <2 x i64> %cast.i, i32 1
- %3 = bitcast float* %x to i64*
- store i64 %extract.i, i64* %3, align 4
- %4 = bitcast <4 x float> %0 to <16 x i8>
- %5 = bitcast <4 x float> %shuffle1.i to <16 x i8>
- %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
- %6 = bitcast <16 x i8> %palignr to <2 x i64>
- ret <2 x i64> %6
-}
diff --git a/test/CodeGen/X86/vec_shuffle-27.ll b/test/CodeGen/X86/vec_shuffle-27.ll
deleted file mode 100644
index c9b2fb5..0000000
--- a/test/CodeGen/X86/vec_shuffle-27.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
-
-; ModuleID = 'vec_shuffle-27.bc'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i686-apple-cl.1.0"
-
-define <8 x float> @my2filter4_1d(<4 x float> %a, <8 x float> %T0, <8 x float> %T1) nounwind readnone {
-entry:
-; CHECK: subps
-; CHECK: subps
-; CHECK: mulps
-; CHECK: mulps
-; CHECK: addps
-; CHECK: addps
- %tmp7 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3 > ; <<8 x float>> [#uses=1]
- %sub = fsub <8 x float> %T1, %T0 ; <<8 x float>> [#uses=1]
- %mul = fmul <8 x float> %sub, %tmp7 ; <<8 x float>> [#uses=1]
- %add = fadd <8 x float> %mul, %T0 ; <<8 x float>> [#uses=1]
- ret <8 x float> %add
-}
-
-; Test case for r122206
-define void @test2(<4 x i64>* %ap, <4 x i64>* %bp) nounwind {
-entry:
-; CHECK: movdqa
- %a = load <4 x i64> * %ap
- %b = load <4 x i64> * %bp
- %mulaa = mul <4 x i64> %a, %a
- %mulbb = mul <4 x i64> %b, %b
- %mulab = mul <4 x i64> %a, %b
- %vect1271 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
- %vect1272 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
- %vect1487 = shufflevector <4 x i64> %vect1271, <4 x i64> %mulab, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
- %vect1488 = shufflevector <4 x i64> %vect1272, <4 x i64> %mulab, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
- store <4 x i64> %vect1487, <4 x i64>* %ap
- store <4 x i64> %vect1488, <4 x i64>* %bp
- ret void;
-}
diff --git a/test/CodeGen/X86/vec_shuffle-28.ll b/test/CodeGen/X86/vec_shuffle-28.ll
deleted file mode 100644
index ebf5577..0000000
--- a/test/CodeGen/X86/vec_shuffle-28.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
-
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-
-; FIXME: this test has a superfluous punpcklqdq pre-pshufb currently.
-; Don't XFAIL it because it's still better than the previous code.
-
-; Pack various elements via shuffles.
-define <8 x i16> @shuf1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
- %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
- ret <8 x i16> %tmp7
-}
diff --git a/test/CodeGen/X86/vec_shuffle-30.ll b/test/CodeGen/X86/vec_shuffle-30.ll
deleted file mode 100644
index f5f8842..0000000
--- a/test/CodeGen/X86/vec_shuffle-30.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
-
-; CHECK: test
-; Test case when creating pshufhw, we incorrectly set the higher order bit
-; for an undef,
-define void @test(<8 x i16>* %dest, <8 x i16> %in) nounwind {
-entry:
-; CHECK-NOT: vmovaps
-; CHECK: vmovlpd
-; CHECK: vpshufhw $-95
- %0 = load <8 x i16>* %dest
- %1 = shufflevector <8 x i16> %0, <8 x i16> %in, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 13, i32 undef, i32 14, i32 14>
- store <8 x i16> %1, <8 x i16>* %dest
- ret void
-}
-
-; CHECK: test2
-; A test case where we shouldn't generate a punpckldq but a pshufd and a pslldq
-define void @test2(<4 x i32>* %dest, <4 x i32> %in) nounwind {
-entry:
-; CHECK-NOT: pslldq
-; CHECK: shufps
- %0 = shufflevector <4 x i32> %in, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> < i32 undef, i32 5, i32 undef, i32 2>
- store <4 x i32> %0, <4 x i32>* %dest
- ret void
-}
diff --git a/test/CodeGen/X86/vec_shuffle-31.ll b/test/CodeGen/X86/vec_shuffle-31.ll
deleted file mode 100644
index bb06e15..0000000
--- a/test/CodeGen/X86/vec_shuffle-31.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 -o %t
-; RUN: grep pshufb %t | count 1
-
-define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
- %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
- ret <8 x i16> %tmp9
-}
diff --git a/test/CodeGen/X86/vec_shuffle-34.ll b/test/CodeGen/X86/vec_shuffle-34.ll
deleted file mode 100644
index d057b3f..0000000
--- a/test/CodeGen/X86/vec_shuffle-34.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 | grep pshufb | count 2
-
-define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
- %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
- ret <8 x i16> %tmp8
-}
diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll
deleted file mode 100644
index f5083b4..0000000
--- a/test/CodeGen/X86/vec_shuffle-35.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -stack-alignment=16 -o %t
-; RUN: grep pextrw %t | count 12
-; RUN: grep pinsrw %t | count 13
-; RUN: grep rolw %t | count 13
-; RUN: not grep esp %t
-; RUN: not grep ebp %t
-; RUN: llc < %s -march=x86 -mcpu=core2 -stack-alignment=16 -o %t
-; RUN: grep pshufb %t | count 3
-
-define <16 x i8> @shuf1(<16 x i8> %T0) nounwind readnone {
-entry:
- %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> undef, <16 x i32> < i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
- ret <16 x i8> %tmp8
-}
-
-define <16 x i8> @shuf2(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-entry:
- %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> < i32 undef, i32 undef, i32 3, i32 2, i32 17, i32 16, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
- ret <16 x i8> %tmp8
-}
diff --git a/test/CodeGen/X86/vec_shuffle-36.ll b/test/CodeGen/X86/vec_shuffle-36.ll
deleted file mode 100644
index f1d0f93..0000000
--- a/test/CodeGen/X86/vec_shuffle-36.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
-
-define <8 x i16> @shuf6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK: ret
-entry:
- %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 3, i32 2, i32 0, i32 2, i32 1, i32 5, i32 6 , i32 undef >
- ret <8 x i16> %tmp9
-}
-
-define <8 x i16> @shuf7(<8 x i16> %t0) {
-; CHECK: pshufd
- %tmp10 = shufflevector <8 x i16> %t0, <8 x i16> undef, <8 x i32> < i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef >
- ret <8 x i16> %tmp10
-}
diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll
deleted file mode 100644
index ed285f9..0000000
--- a/test/CodeGen/X86/vec_shuffle-37.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=core2 | FileCheck %s
-; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0
-
-define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp {
-entry:
-; CHECK: movaps ({{%rdi|%rcx}}), %[[XMM0:xmm[0-9]+]]
-; CHECK: movaps %[[XMM0]], %[[XMM1:xmm[0-9]+]]
-; CHECK-NEXT: movss %xmm{{[0-9]+}}, %[[XMM1]]
-; CHECK-NEXT: shufps $36, %[[XMM1]], %[[XMM0]]
- %0 = load <4 x i32>* undef, align 16
- %1 = load <4 x i32>* %a0, align 16
- %2 = shufflevector <4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
- ret <4 x i32> %2
-}
-
-define void @t01(double* %a0) nounwind ssp {
-entry:
-; CHECK_O0: movsd (%eax), %xmm0
-; CHECK_O0: unpcklpd %xmm0, %xmm0
- %tmp93 = load double* %a0, align 8
- %vecinit94 = insertelement <2 x double> undef, double %tmp93, i32 1
- store <2 x double> %vecinit94, <2 x double>* undef
- ret void
-}
-
-define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
-entry:
-; CHECK: t02
-; CHECK: movaps
-; CHECK: shufps
-; CHECK: pshufd
-; CHECK: movq
-; CHECK: ret
- %0 = bitcast <8 x i32>* %source to <4 x i32>*
- %arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
- %tmp2 = load <4 x i32>* %arrayidx, align 16
- %tmp3 = extractelement <4 x i32> %tmp2, i32 0
- %tmp5 = insertelement <2 x i32> <i32 undef, i32 0>, i32 %tmp3, i32 0
- %arrayidx7 = getelementptr inbounds <8 x i32>* %source, i64 1
- %1 = bitcast <8 x i32>* %arrayidx7 to <4 x i32>*
- %tmp8 = load <4 x i32>* %1, align 16
- %tmp9 = extractelement <4 x i32> %tmp8, i32 1
- %tmp11 = insertelement <2 x i32> %tmp5, i32 %tmp9, i32 1
- store <2 x i32> %tmp11, <2 x i32>* %dest, align 8
- ret void
-}
diff --git a/test/CodeGen/X86/vec_shuffle-38.ll b/test/CodeGen/X86/vec_shuffle-38.ll
deleted file mode 100644
index ec196df..0000000
--- a/test/CodeGen/X86/vec_shuffle-38.ll
+++ /dev/null
@@ -1,77 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
-
-define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp {
-; CHECK: unpcklpd
- %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> zeroinitializer
- ret <2 x double> %shuffle
-}
-
-define <2 x double> @hd(<2 x double> %p) nounwind optsize ssp {
-; CHECK: unpckhpd
- %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> <i32 1, i32 1>
- ret <2 x double> %shuffle
-}
-
-define <2 x i64> @ldi(<2 x i64> %p) nounwind optsize ssp {
-; CHECK: punpcklqdq
- %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> zeroinitializer
- ret <2 x i64> %shuffle
-}
-
-define <2 x i64> @hdi(<2 x i64> %p) nounwind optsize ssp {
-; CHECK: punpckhqdq
- %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
- ret <2 x i64> %shuffle
-}
-
-; rdar://10050549
-%struct.Float2 = type { float, float }
-
-define <4 x float> @loadhpi(%struct.Float2* %vPtr, <4 x float> %vecin1) nounwind readonly ssp {
-entry:
-; CHECK: loadhpi
-; CHECK-NOT: movq
-; CHECK: movhps (
- %tmp1 = bitcast %struct.Float2* %vPtr to <1 x i64>*
- %addptr7 = getelementptr inbounds <1 x i64>* %tmp1, i64 0
- %tmp2 = bitcast <1 x i64>* %addptr7 to float*
- %tmp3 = load float* %tmp2, align 4
- %vec = insertelement <4 x float> undef, float %tmp3, i32 0
- %addptr.i12 = getelementptr inbounds float* %tmp2, i64 1
- %tmp4 = load float* %addptr.i12, align 4
- %vecin2 = insertelement <4 x float> %vec, float %tmp4, i32 1
- %shuffle = shufflevector <4 x float> %vecin1, <4 x float> %vecin2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
- ret <4 x float> %shuffle
-}
-
-; rdar://10119696
-; CHECK: f
-define <4 x float> @f(<4 x float> %x, double* nocapture %y) nounwind readonly ssp {
-entry:
- ; CHECK: movlps (%{{rdi|rdx}}), %xmm0
- %u110.i = load double* %y, align 1
- %tmp8.i = insertelement <2 x double> undef, double %u110.i, i32 0
- %tmp9.i = bitcast <2 x double> %tmp8.i to <4 x float>
- %shuffle.i = shufflevector <4 x float> %x, <4 x float> %tmp9.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
- ret <4 x float> %shuffle.i
-}
-
-define <4 x float> @loadhpi2(%struct.Float2* nocapture %vHiCoefPtr_0, %struct.Float2* nocapture %vLoCoefPtr_0, i32 %s) nounwind readonly ssp {
-entry:
-; CHECK: loadhpi2
-; CHECK: movhps (
-; CHECK-NOT: movlhps
- %0 = bitcast %struct.Float2* %vHiCoefPtr_0 to <1 x i64>*
- %idx.ext = sext i32 %s to i64
- %add.ptr = getelementptr inbounds <1 x i64>* %0, i64 %idx.ext
- %add.ptr.val = load <1 x i64>* %add.ptr, align 1
- %1 = bitcast <1 x i64> %add.ptr.val to <2 x float>
- %shuffle.i = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- %2 = bitcast %struct.Float2* %vLoCoefPtr_0 to <1 x i64>*
- %add.ptr2 = getelementptr inbounds <1 x i64>* %2, i64 %idx.ext
- %add.ptr2.val = load <1 x i64>* %add.ptr2, align 1
- %3 = bitcast <1 x i64> %add.ptr2.val to <2 x float>
- %shuffle.i4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- %shuffle1.i5 = shufflevector <4 x float> %shuffle.i, <4 x float> %shuffle.i4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
- ret <4 x float> %shuffle1.i5
-}
diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll
deleted file mode 100644
index 8fd9a5c..0000000
--- a/test/CodeGen/X86/vec_shuffle-39.ll
+++ /dev/null
@@ -1,86 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn | FileCheck %s
-; rdar://10050222, rdar://10134392
-
-define <4 x float> @t1(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
-entry:
-; CHECK-LABEL: t1:
-; CHECK: movlps (%rdi), %xmm0
-; CHECK: ret
- %p.val = load <1 x i64>* %p, align 1
- %0 = bitcast <1 x i64> %p.val to <2 x float>
- %shuffle.i = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- %shuffle1.i = shufflevector <4 x float> %a, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
- ret <4 x float> %shuffle1.i
-}
-
-define <4 x float> @t1a(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
-entry:
-; CHECK-LABEL: t1a:
-; CHECK: movlps (%rdi), %xmm0
-; CHECK: ret
- %0 = bitcast <1 x i64>* %p to double*
- %1 = load double* %0
- %2 = insertelement <2 x double> undef, double %1, i32 0
- %3 = bitcast <2 x double> %2 to <4 x float>
- %4 = shufflevector <4 x float> %a, <4 x float> %3, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
- ret <4 x float> %4
-}
-
-define void @t2(<1 x i64>* nocapture %p, <4 x float> %a) nounwind {
-entry:
-; CHECK-LABEL: t2:
-; CHECK: movlps %xmm0, (%rdi)
-; CHECK: ret
- %cast.i = bitcast <4 x float> %a to <2 x i64>
- %extract.i = extractelement <2 x i64> %cast.i, i32 0
- %0 = getelementptr inbounds <1 x i64>* %p, i64 0, i64 0
- store i64 %extract.i, i64* %0, align 8
- ret void
-}
-
-define void @t2a(<1 x i64>* nocapture %p, <4 x float> %a) nounwind {
-entry:
-; CHECK-LABEL: t2a:
-; CHECK: movlps %xmm0, (%rdi)
-; CHECK: ret
- %0 = bitcast <1 x i64>* %p to double*
- %1 = bitcast <4 x float> %a to <2 x double>
- %2 = extractelement <2 x double> %1, i32 0
- store double %2, double* %0
- ret void
-}
-
-; rdar://10436044
-define <2 x double> @t3() nounwind readonly {
-bb:
-; CHECK-LABEL: t3:
-; CHECK: movq (%rax), %xmm1
-; CHECK: punpcklqdq %xmm2, %xmm0
-; CHECK: movsd %xmm1, %xmm0
- %tmp0 = load i128* null, align 1
- %tmp1 = load <2 x i32>* undef, align 8
- %tmp2 = bitcast i128 %tmp0 to <16 x i8>
- %tmp3 = bitcast <2 x i32> %tmp1 to i64
- %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0
- %tmp5 = bitcast <16 x i8> %tmp2 to <2 x double>
- %tmp6 = bitcast <2 x i64> %tmp4 to <2 x double>
- %tmp7 = shufflevector <2 x double> %tmp5, <2 x double> %tmp6, <2 x i32> <i32 2, i32 1>
- ret <2 x double> %tmp7
-}
-
-; rdar://10450317
-define <2 x i64> @t4() nounwind readonly {
-bb:
-; CHECK-LABEL: t4:
-; CHECK: movq (%rax), %xmm0
-; CHECK: punpcklqdq %{{xmm.}}, %[[XMM:xmm[0-9]]]
-; CHECK: movsd %[[XMM]], %xmm0
- %tmp0 = load i128* null, align 1
- %tmp1 = load <2 x i32>* undef, align 8
- %tmp2 = bitcast i128 %tmp0 to <16 x i8>
- %tmp3 = bitcast <2 x i32> %tmp1 to i64
- %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0
- %tmp5 = bitcast <16 x i8> %tmp2 to <2 x i64>
- %tmp6 = shufflevector <2 x i64> %tmp4, <2 x i64> %tmp5, <2 x i32> <i32 2, i32 1>
- ret <2 x i64> %tmp6
-}
diff --git a/test/CodeGen/X86/vec_shuffle-40.ll b/test/CodeGen/X86/vec_shuffle-40.ll
deleted file mode 100644
index 75b45e3..0000000
--- a/test/CodeGen/X86/vec_shuffle-40.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
-
-define void @shuffle_v16i16(<16 x i16>* %a) {
-; CHECK-LABEL: shuffle_v16i16:
-; CHECK: vpshufb {{.*}}%ymm
-; CHECK-NOT: vpshufb {{.*}}%xmm
-entry:
- %0 = load <16 x i16>* %a, align 32
- %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
- store <16 x i16> %shuffle, <16 x i16>* %a, align 32
- ret void
-}
-
-define void @shuffle_v16i16_lanecrossing(<16 x i16>* %a) {
-; CHECK-LABEL: shuffle_v16i16_lanecrossing:
-; CHECK-NOT: vpshufb {{.*}}%ymm
-entry:
- %0 = load <16 x i16>* %a, align 32
- %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 13, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
- store <16 x i16> %shuffle, <16 x i16>* %a, align 32
- ret void
-}
diff --git a/test/CodeGen/X86/vec_shuffle-41.ll b/test/CodeGen/X86/vec_shuffle-41.ll
deleted file mode 100644
index 28fdd2f..0000000
--- a/test/CodeGen/X86/vec_shuffle-41.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
-
-; Use buildFromShuffleMostly which allows this to be generated as two 128-bit
-; shuffles and an insert.
-
-; This is the (somewhat questionable) LLVM IR that is generated for:
-; x8.s0123456 = x8.s1234567; // x8 is a <8 x float> type
-; x8.s7 = f; // f is float
-
-
-define <8 x float> @test1(<8 x float> %a, float %b) {
-; CHECK-LABEL: test1:
-; CHECK: vinsertps
-; CHECK-NOT: vinsertps
-entry:
- %shift = shufflevector <8 x float> %a, <8 x float> undef, <7 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %extend = shufflevector <7 x float> %shift, <7 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
- %insert = insertelement <8 x float> %extend, float %b, i32 7
-
- ret <8 x float> %insert
-}
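
For reference, the "somewhat questionable" IR in the test removed above is just a lane shift with an insert into the top lane. A minimal scalar sketch of what the two shufflevectors plus the insertelement compute (hypothetical names, not part of the patch):

    #include <stddef.h>

    /* Scalar model of test1 above: shift the eight lanes of 'a' down by
       one and insert 'b' into the now-vacant top lane. */
    void shift_in_top(const float a[8], float b, float out[8]) {
        for (size_t i = 0; i < 7; ++i)
            out[i] = a[i + 1];   /* the <7 x i32> shift + <8 x i32> extend */
        out[7] = b;              /* the final insertelement */
    }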
diff --git a/test/CodeGen/X86/vec_shuffle.ll b/test/CodeGen/X86/vec_shuffle.ll
deleted file mode 100644
index 6599598..0000000
--- a/test/CodeGen/X86/vec_shuffle.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc < %s -mtriple=i686-linux -mcpu=core2 | FileCheck %s
-
-; CHECK: test_v4sf
-; CHECK: movq 8(%esp)
-; CHECK: pshufd $80
-define void @test_v4sf(<4 x float>* %P, float %X, float %Y) nounwind {
- %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1]
- %tmp2 = insertelement <4 x float> %tmp, float %X, i32 1 ; <<4 x float>> [#uses=1]
- %tmp4 = insertelement <4 x float> %tmp2, float %Y, i32 2 ; <<4 x float>> [#uses=1]
- %tmp6 = insertelement <4 x float> %tmp4, float %Y, i32 3 ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp6, <4 x float>* %P
- ret void
-}
-
-; CHECK: test_v2sd
-; CHECK: movups 8(%esp)
-; CHECK: movaps
-define void @test_v2sd(<2 x double>* %P, double %X, double %Y) nounwind {
- %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0 ; <<2 x double>> [#uses=1]
- %tmp2 = insertelement <2 x double> %tmp, double %Y, i32 1 ; <<2 x double>> [#uses=1]
- store <2 x double> %tmp2, <2 x double>* %P
- ret void
-}
-
-; CHECK: test_v8i16
-; CHECK: pshufhw $-58
-; CHECK: movdqa
-define void @test_v8i16(<2 x i64>* %res, <2 x i64>* %A) nounwind {
- %tmp = load <2 x i64>* %A ; <<2 x i64>> [#uses=1]
- %tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16> ; <<8 x i16>> [#uses=8]
- %tmp.upgrd.2 = extractelement <8 x i16> %tmp.upgrd.1, i32 0 ; <i16> [#uses=1]
- %tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1 ; <i16> [#uses=1]
- %tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2 ; <i16> [#uses=1]
- %tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3 ; <i16> [#uses=1]
- %tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 6 ; <i16> [#uses=1]
- %tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5 ; <i16> [#uses=1]
- %tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 4 ; <i16> [#uses=1]
- %tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7 ; <i16> [#uses=1]
- %tmp8 = insertelement <8 x i16> undef, i16 %tmp.upgrd.2, i32 0 ; <<8 x i16>> [#uses=1]
- %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1 ; <<8 x i16>> [#uses=1]
- %tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp2, i32 2 ; <<8 x i16>> [#uses=1]
- %tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3 ; <<8 x i16>> [#uses=1]
- %tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp4, i32 4 ; <<8 x i16>> [#uses=1]
- %tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5 ; <<8 x i16>> [#uses=1]
- %tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp6, i32 6 ; <<8 x i16>> [#uses=1]
- %tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7 ; <<8 x i16>> [#uses=1]
- %tmp15.upgrd.3 = bitcast <8 x i16> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
- store <2 x i64> %tmp15.upgrd.3, <2 x i64>* %res
- ret void
-}
diff --git a/test/CodeGen/X86/vec_splat-2.ll b/test/CodeGen/X86/vec_splat-2.ll
deleted file mode 100644
index 9d82f97..0000000
--- a/test/CodeGen/X86/vec_splat-2.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s
-
-define void @test(<2 x i64>* %P, i8 %x) nounwind {
- %tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0 ; <<16 x i8>> [#uses=1]
- %tmp36 = insertelement <16 x i8> %tmp, i8 %x, i32 1 ; <<16 x i8>> [#uses=1]
- %tmp38 = insertelement <16 x i8> %tmp36, i8 %x, i32 2 ; <<16 x i8>> [#uses=1]
- %tmp40 = insertelement <16 x i8> %tmp38, i8 %x, i32 3 ; <<16 x i8>> [#uses=1]
- %tmp42 = insertelement <16 x i8> %tmp40, i8 %x, i32 4 ; <<16 x i8>> [#uses=1]
- %tmp44 = insertelement <16 x i8> %tmp42, i8 %x, i32 5 ; <<16 x i8>> [#uses=1]
- %tmp46 = insertelement <16 x i8> %tmp44, i8 %x, i32 6 ; <<16 x i8>> [#uses=1]
- %tmp48 = insertelement <16 x i8> %tmp46, i8 %x, i32 7 ; <<16 x i8>> [#uses=1]
- %tmp50 = insertelement <16 x i8> %tmp48, i8 %x, i32 8 ; <<16 x i8>> [#uses=1]
- %tmp52 = insertelement <16 x i8> %tmp50, i8 %x, i32 9 ; <<16 x i8>> [#uses=1]
- %tmp54 = insertelement <16 x i8> %tmp52, i8 %x, i32 10 ; <<16 x i8>> [#uses=1]
- %tmp56 = insertelement <16 x i8> %tmp54, i8 %x, i32 11 ; <<16 x i8>> [#uses=1]
- %tmp58 = insertelement <16 x i8> %tmp56, i8 %x, i32 12 ; <<16 x i8>> [#uses=1]
- %tmp60 = insertelement <16 x i8> %tmp58, i8 %x, i32 13 ; <<16 x i8>> [#uses=1]
- %tmp62 = insertelement <16 x i8> %tmp60, i8 %x, i32 14 ; <<16 x i8>> [#uses=1]
- %tmp64 = insertelement <16 x i8> %tmp62, i8 %x, i32 15 ; <<16 x i8>> [#uses=1]
- %tmp68 = load <2 x i64>* %P ; <<2 x i64>> [#uses=1]
- %tmp71 = bitcast <2 x i64> %tmp68 to <16 x i8> ; <<16 x i8>> [#uses=1]
- %tmp73 = add <16 x i8> %tmp71, %tmp64 ; <<16 x i8>> [#uses=1]
- %tmp73.upgrd.1 = bitcast <16 x i8> %tmp73 to <2 x i64> ; <<2 x i64>> [#uses=1]
- store <2 x i64> %tmp73.upgrd.1, <2 x i64>* %P
- ret void
-
-; CHECK-LABEL: test:
-; CHECK-NOT: pshufd
-; CHECK: punpcklbw
-; CHECK: punpcklbw
-; CHECK: pshufd $0
-; CHECK-NOT: pshufd
-}
diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll
deleted file mode 100644
index 754cbf4..0000000
--- a/test/CodeGen/X86/vec_splat-3.ll
+++ /dev/null
@@ -1,230 +0,0 @@
-; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
-
-; Splat test for v8i16
-define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
- %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_0:
-; CHECK: pshuflw $0
-}
-
-define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
- %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_1:
-; CHECK: pshuflw $5
-}
-
-define <8 x i16> @shuf_8i16_2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
- %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef, i32 undef>
- ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_2:
-; CHECK: punpcklwd
-; CHECK-NEXT: pshufd $-86
-}
-
-define <8 x i16> @shuf_8i16_3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
- %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_3:
-; CHECK: pshuflw $15
-}
-
-define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
- %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef>
- ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_4:
-; CHECK: movhlps
-}
-
-define <8 x i16> @shuf_8i16_5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
- %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_5:
-; CHECK: punpckhwd
-; CHECK-NEXT: pshufd $85
-}
-
-define <8 x i16> @shuf_8i16_6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
- %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 6, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_6:
-; CHECK: punpckhwd
-; CHECK-NEXT: pshufd $-86
-}
-
-define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
- %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_7:
-; CHECK: punpckhwd
-; CHECK-NEXT: pshufd $-1
-}
-
-; Splat test for v16i8
-define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_8:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $0
-}
-
-define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_9:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_10:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 3, i32 undef, i32 undef, i32 3, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_11:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-1
-}
-
-
-define <16 x i8> @shuf_16i8_12(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_12:
-; CHECK: pshufd $5
-}
-
-define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_13:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 6, i32 undef, i32 undef, i32 6, i32 undef, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_14:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_15:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-1
-}
-
-define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 8, i32 undef, i32 undef, i32 8, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_16:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $0
-}
-
-define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 9, i32 undef, i32 undef, i32 9, i32 undef, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_17:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 10, i32 undef, i32 undef, i32 10, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_18:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 11, i32 undef, i32 undef, i32 11, i32 undef, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_19:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-1
-}
-
-define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 12, i32 undef, i32 undef, i32 12, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_20:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $0
-}
-
-define <16 x i8> @shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 13, i32 undef, i32 undef, i32 13, i32 undef, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_21:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 14, i32 undef, i32 undef, i32 14, i32 undef, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_22:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
- %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 15, i32 undef, i32 undef, i32 15, i32 undef, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
- ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_23:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-1
-}
diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll
deleted file mode 100644
index 28f2a90..0000000
--- a/test/CodeGen/X86/vec_splat.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=AVX
-
-define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {
- %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1]
- %tmp2 = insertelement <4 x float> %tmp, float %X, i32 1 ; <<4 x float>> [#uses=1]
- %tmp4 = insertelement <4 x float> %tmp2, float %X, i32 2 ; <<4 x float>> [#uses=1]
- %tmp6 = insertelement <4 x float> %tmp4, float %X, i32 3 ; <<4 x float>> [#uses=1]
- %tmp8 = load <4 x float>* %Q ; <<4 x float>> [#uses=1]
- %tmp10 = fmul <4 x float> %tmp8, %tmp6 ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp10, <4 x float>* %P
- ret void
-
-; SSE2-LABEL: test_v4sf:
-; SSE2: pshufd $0
-
-; SSE3-LABEL: test_v4sf:
-; SSE3: pshufd $0
-}
-
-define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {
- %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0 ; <<2 x double>> [#uses=1]
- %tmp2 = insertelement <2 x double> %tmp, double %X, i32 1 ; <<2 x double>> [#uses=1]
- %tmp4 = load <2 x double>* %Q ; <<2 x double>> [#uses=1]
- %tmp6 = fmul <2 x double> %tmp4, %tmp2 ; <<2 x double>> [#uses=1]
- store <2 x double> %tmp6, <2 x double>* %P
- ret void
-
-; SSE2-LABEL: test_v2sd:
-; SSE2: shufpd $0
-
-; SSE3-LABEL: test_v2sd:
-; SSE3: movddup
-}
-
-; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
-define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
- %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
- %2 = load <4 x float>* %1, align 16
- %3 = trunc i64 %j to i32
- %4 = extractelement <4 x float> %2, i32 %3
- %5 = insertelement <4 x float> undef, float %4, i32 0
- %6 = insertelement <4 x float> %5, float %4, i32 1
- %7 = insertelement <4 x float> %6, float %4, i32 2
- %8 = insertelement <4 x float> %7, float %4, i32 3
- ret <4 x float> %8
-
-; AVX-LABEL: load_extract_splat
-; AVX-NOT: rsp
-; AVX: vbroadcastss
-}
-
-; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
-define <4 x float> @load_extract_splat1(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
- %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
- %2 = load <4 x float>* %1, align 16
- %3 = extractelement <4 x float> %2, i64 %j
- %4 = insertelement <4 x float> undef, float %3, i32 0
- %5 = insertelement <4 x float> %4, float %3, i32 1
- %6 = insertelement <4 x float> %5, float %3, i32 2
- %7 = insertelement <4 x float> %6, float %3, i32 3
- ret <4 x float> %7
-
-; AVX-LABEL: load_extract_splat1
-; AVX-NOT: movs
-; AVX: vbroadcastss
-}
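
The two load_extract_splat tests in the file removed above exercise folding a variable-index extractelement into the load's address: instead of loading the whole vector and spilling it to the stack to index element j, the backend can load the one scalar directly, which vbroadcastss can then splat. A scalar sketch of the folded address computation (hypothetical names):

    #include <stdint.h>

    /* Scalar model of load_extract_splat: fold the extract of lane j
       into the address so only one float is loaded, rather than
       loading the <4 x float> and spilling it. */
    float load_extract(const float *ptr, int64_t i, int64_t j) {
        return ptr[4 * i + j];   /* element j of the <4 x float> at ptr[i] */
    }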
diff --git a/test/CodeGen/X86/vec_trunc_sext.ll b/test/CodeGen/X86/vec_trunc_sext.ll
new file mode 100644
index 0000000..3c446bb
--- /dev/null
+++ b/test/CodeGen/X86/vec_trunc_sext.ll
@@ -0,0 +1,30 @@
+; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='-sse4.1' -o - | FileCheck %s -check-prefix=NO_SSE_41
+; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='+sse4.1' -o - | FileCheck %s -check-prefix=SSE_41
+
+; PR20472 ( http://llvm.org/bugs/show_bug.cgi?id=20472 )
+; When sexting a trunc'd vector value, we can't eliminate the zext.
+; If we don't have SSE4.1, use punpck.
+; If we have SSE4.1, use pmovzx because it combines the load op.
+; There may be a better way to do this using pshufb + pmovsx,
+; but that is beyond our current codegen capabilities.
+
+define <4 x i32> @trunc_sext(<4 x i16>* %in) {
+ %load = load <4 x i16>* %in
+ %trunc = trunc <4 x i16> %load to <4 x i8>
+ %sext = sext <4 x i8> %trunc to <4 x i32>
+ ret <4 x i32> %sext
+
+; NO_SSE_41-LABEL: trunc_sext:
+; NO_SSE_41: movq (%rdi), %xmm0
+; NO_SSE_41-NEXT: punpcklwd %xmm0, %xmm0
+; NO_SSE_41-NEXT: pslld $24, %xmm0
+; NO_SSE_41-NEXT: psrad $24, %xmm0
+; NO_SSE_41-NEXT: retq
+
+; SSE_41-LABEL: trunc_sext:
+; SSE_41: pmovzxwd (%rdi), %xmm0
+; SSE_41-NEXT: pslld $24, %xmm0
+; SSE_41-NEXT: psrad $24, %xmm0
+; SSE_41-NEXT: retq
+}
+
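
The pmovzx/punpck-plus-shift sequence checked above is the standard way to sign-extend from a narrower width inside a wider lane. A minimal scalar sketch of what each 32-bit lane computes, assuming an arithmetic right shift on signed values (what x86 compilers provide):

    #include <stdint.h>

    /* Scalar model of one 32-bit lane of trunc_sext: after the
       zero-extending load (punpcklwd or pmovzxwd), the shift pair
       sign-extends bit 7 of the element. */
    int32_t sext_i8_in_i32(uint16_t elt) {
        uint32_t widened = elt;                    /* pmovzxwd / punpcklwd */
        return (int32_t)(widened << 24) >> 24;     /* pslld $24; psrad $24 */
    }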
diff --git a/test/CodeGen/X86/vec_uint_to_fp.ll b/test/CodeGen/X86/vec_uint_to_fp.ll
index ee20f1f..46cfcd9 100644
--- a/test/CodeGen/X86/vec_uint_to_fp.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp.ll
@@ -1,11 +1,167 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck --check-prefix=CHECK --check-prefix=SSE --check-prefix=CST %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck --check-prefix=CHECK --check-prefix=SSE41 --check-prefix=CST %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck --check-prefix=CHECK --check-prefix=AVX --check-prefix=CST %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx2 | FileCheck --check-prefix=CHECK --check-prefix=AVX2 %s
+
+; Check that the constants used in the vectors are the right ones.
+; SSE: [[MASKCSTADDR:LCPI0_[0-9]+]]:
+; SSE-NEXT: .long 65535 ## 0xffff
+; SSE-NEXT: .long 65535 ## 0xffff
+; SSE-NEXT: .long 65535 ## 0xffff
+; SSE-NEXT: .long 65535 ## 0xffff
+
+; CST: [[LOWCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long 1258291200 ## 0x4b000000
+; CST-NEXT: .long 1258291200 ## 0x4b000000
+; CST-NEXT: .long 1258291200 ## 0x4b000000
+; CST-NEXT: .long 1258291200 ## 0x4b000000
+
+; CST: [[HIGHCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long 1392508928 ## 0x53000000
+; CST-NEXT: .long 1392508928 ## 0x53000000
+; CST-NEXT: .long 1392508928 ## 0x53000000
+; CST-NEXT: .long 1392508928 ## 0x53000000
+
+; CST: [[MAGICCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long 3539992704 ## float -5.497642e+11
+; CST-NEXT: .long 3539992704 ## float -5.497642e+11
+; CST-NEXT: .long 3539992704 ## float -5.497642e+11
+; CST-NEXT: .long 3539992704 ## float -5.497642e+11
+
+; AVX2: [[LOWCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long 1258291200 ## 0x4b000000
+
+; AVX2: [[HIGHCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long 1392508928 ## 0x53000000
+
+; AVX2: [[MAGICCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long 3539992704 ## float -5.49764202E+11
-; Test that we are not lowering uinttofp to scalars
define <4 x float> @test1(<4 x i32> %A) nounwind {
; CHECK-LABEL: test1:
-; CHECK-NOT: cvtsd2ss
-; CHECK: ret
+;
+; SSE: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]]
+; SSE-NEXT: pand %xmm0, [[MASK]]
+; After this instruction, MASK will have the value of the low parts
+; of the vector.
+; SSE-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]]
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0
+; SSE-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0
+; SSE-NEXT: addps [[MASK]], %xmm0
+; SSE-NEXT: retq
+;
+; Currently we commute the arguments of the first blend, but this could be
+; improved to match the lowering of the second blend.
+; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]]
+; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]]
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0
+; SSE41-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0
+; SSE41-NEXT: addps [[LOWVEC]], %xmm0
+; SSE41-NEXT: retq
+;
+; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]]
+; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
+; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
+; AVX-NEXT: vaddps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
+; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
+; AVX-NEXT: retq
+;
+; The lowering for AVX2 is a bit messy because we select broadcast
+; instructions instead of folding the constant loads.
+; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]]
+; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
+; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
+; AVX2-NEXT: retq
%C = uitofp <4 x i32> %A to <4 x float>
ret <4 x float> %C
}
+; Match the AVX2 constants used in the next function
+; AVX2: [[LOWCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long 1258291200 ## 0x4b000000
+
+; AVX2: [[HIGHCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long 1392508928 ## 0x53000000
+
+; AVX2: [[MAGICCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long 3539992704 ## float -5.49764202E+11
+
+define <8 x float> @test2(<8 x i32> %A) nounwind {
+; CHECK-LABEL: test2:
+; Legalization will break this into 2 x <4 x i32> on anything prior to AVX.
+; The constants used in the vector instructions are shared between the
+; two sequences of instructions.
+;
+; SSE: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
+; SSE-NEXT: pand %[[MASK]], [[VECLOW]]
+; SSE-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
+; SSE-NEXT: por %[[LOWCST]], [[VECLOW]]
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
+; SSE-NEXT: por %[[HIGHCST]], %xmm0
+; SSE-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
+; SSE-NEXT: addps %[[MAGICCST]], %xmm0
+; SSE-NEXT: addps [[VECLOW]], %xmm0
+; MASK is the low vector of the second part after this point.
+; SSE-NEXT: pand %xmm1, %[[MASK]]
+; SSE-NEXT: por %[[LOWCST]], %[[MASK]]
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: por %[[HIGHCST]], %xmm1
+; SSE-NEXT: addps %[[MAGICCST]], %xmm1
+; SSE-NEXT: addps %[[MASK]], %xmm1
+; SSE-NEXT: retq
+;
+; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
+; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
+; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]]
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
+; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0
+; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
+; SSE41-NEXT: addps %[[MAGICCST]], %xmm0
+; SSE41-NEXT: addps [[VECLOW]], %xmm0
+; LOWCST is the low vector of the second part after this point.
+; The operands of the blend are inverted because we reuse xmm1
+; in the next shift.
+; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]]
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1
+; SSE41-NEXT: addps %[[MAGICCST]], %xmm1
+; SSE41-NEXT: addps %[[LOWCST]], %xmm1
+; SSE41-NEXT: retq
+;
+; Test that we are not lowering uitofp to scalars
+; AVX-NOT: cvtsd2ss
+; AVX: retq
+;
+; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]]
+; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]]
+; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0
+; AVX2-NEXT: retq
+ %C = uitofp <8 x i32> %A to <8 x float>
+ ret <8 x float> %C
+}
+
+define <4 x double> @test3(<4 x i32> %arg) {
+; CHECK-LABEL: test3:
+; This test used to crash because we were custom lowering it as if it was
+; a conversion between <4 x i32> and <4 x float>.
+; AVX: vcvtdq2pd
+; AVX2: vcvtdq2pd
+; CHECK: retq
+ %tmp = uitofp <4 x i32> %arg to <4 x double>
+ ret <4 x double> %tmp
+}
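
The constants checked at the top of this file implement the classic split-and-bias trick for unsigned-to-float conversion: 0x4b000000 is the bit pattern of 2^23 and 0x53000000 of 2^39, so OR-ing them into the low and high 16-bit halves yields the exact floats 2^23 + lo and 2^39 + hi*2^16, and the magic addend -(2^39 + 2^23) cancels both biases. A scalar sketch of the lowering (hypothetical helper name, assuming IEEE-754 floats and round-to-nearest):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helper: reinterpret IEEE-754 bits as a float. */
    static float bits_to_float(uint32_t b) {
        float f;
        memcpy(&f, &b, sizeof f);
        return f;
    }

    /* Scalar model of the lowering checked above. */
    float u32_to_f32(uint32_t x) {
        float lo    = bits_to_float(0x4b000000u | (x & 0xffffu)); /* 2^23 + lo16        */
        float hi    = bits_to_float(0x53000000u | (x >> 16));     /* 2^39 + hi16 * 2^16 */
        float magic = bits_to_float(0xd3000080u);                 /* -(2^39 + 2^23)     */
        return (hi + magic) + lo;                                 /* rounds to (float)x */
    }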
diff --git a/test/CodeGen/X86/vec_unsafe-fp-math.ll b/test/CodeGen/X86/vec_unsafe-fp-math.ll
new file mode 100644
index 0000000..827d418
--- /dev/null
+++ b/test/CodeGen/X86/vec_unsafe-fp-math.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -enable-unsafe-fp-math -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
+
+; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math.
+
+; Subtracting zero is free.
+define <4 x float> @vec_fsub_zero(<4 x float> %x) {
+; CHECK-LABEL: vec_fsub_zero:
+; CHECK-NOT: subps
+; CHECK-NOT: xorps
+; CHECK: retq
+ %sub = fsub <4 x float> %x, zeroinitializer
+ ret <4 x float> %sub
+}
+
+; Negating doesn't require subtraction.
+define <4 x float> @vec_fneg(<4 x float> %x) {
+; CHECK-LABEL: vec_fneg:
+; CHECK: xorps {{.*}}LCP{{.*}}, %xmm0
+; CHECK-NOT: subps
+; CHECK-NEXT: retq
+ %sub = fsub <4 x float> zeroinitializer, %x
+ ret <4 x float> %sub
+}
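
The xorps that the second test checks works because IEEE-754 keeps the sign in a single bit, so under unsafe-fp-math 0.0 - x can be lowered to an XOR with the sign-bit mask. A scalar sketch of the same operation (hypothetical name, standard C):

    #include <stdint.h>
    #include <string.h>

    /* Scalar model of vec_fneg: flip the IEEE-754 sign bit, which is
       all the xorps constant does per lane. */
    float fneg_by_xor(float x) {
        uint32_t bits;
        memcpy(&bits, &x, sizeof bits);
        bits ^= 0x80000000u;            /* the sign-bit mask xorps loads */
        memcpy(&x, &bits, sizeof bits);
        return x;
    }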
diff --git a/test/CodeGen/X86/vec_zext.ll b/test/CodeGen/X86/vec_zext.ll
deleted file mode 100644
index 615a50b..0000000
--- a/test/CodeGen/X86/vec_zext.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: llc < %s -march=x86-64
-; PR 9267
-
-define<4 x i32> @func_16_32() {
- %F = load <4 x i16>* undef
- %G = zext <4 x i16> %F to <4 x i32>
- %H = load <4 x i16>* undef
- %Y = zext <4 x i16> %H to <4 x i32>
- %T = add <4 x i32> %Y, %G
- store <4 x i32>%T , <4 x i32>* undef
- ret <4 x i32> %T
-}
-
-define<4 x i64> @func_16_64() {
- %F = load <4 x i16>* undef
- %G = zext <4 x i16> %F to <4 x i64>
- %H = load <4 x i16>* undef
- %Y = zext <4 x i16> %H to <4 x i64>
- %T = xor <4 x i64> %Y, %G
- store <4 x i64>%T , <4 x i64>* undef
- ret <4 x i64> %T
-}
-
-define<4 x i64> @func_32_64() {
- %F = load <4 x i32>* undef
- %G = zext <4 x i32> %F to <4 x i64>
- %H = load <4 x i32>* undef
- %Y = zext <4 x i32> %H to <4 x i64>
- %T = or <4 x i64> %Y, %G
- ret <4 x i64> %T
-}
-
-define<4 x i16> @func_8_16() {
- %F = load <4 x i8>* undef
- %G = zext <4 x i8> %F to <4 x i16>
- %H = load <4 x i8>* undef
- %Y = zext <4 x i8> %H to <4 x i16>
- %T = add <4 x i16> %Y, %G
- ret <4 x i16> %T
-}
-
-define<4 x i32> @func_8_32() {
- %F = load <4 x i8>* undef
- %G = zext <4 x i8> %F to <4 x i32>
- %H = load <4 x i8>* undef
- %Y = zext <4 x i8> %H to <4 x i32>
- %T = sub <4 x i32> %Y, %G
- ret <4 x i32> %T
-}
-
-define<4 x i64> @func_8_64() {
- %F = load <4 x i8>* undef
- %G = zext <4 x i8> %F to <4 x i64>
- %H = load <4 x i8>* undef
- %Y = zext <4 x i8> %H to <4 x i64>
- %T = add <4 x i64> %Y, %G
- ret <4 x i64> %T
-}
-
-define<4 x i32> @const_16_32() {
- %G = zext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i32>
- ret <4 x i32> %G
-}
-
-define<4 x i64> @const_16_64() {
- %G = zext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i64>
- ret <4 x i64> %G
-}
-
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
new file mode 100644
index 0000000..0a3ed7e
--- /dev/null
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -0,0 +1,708 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; AVX128 tests:
+
+define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
+; SSE2-LABEL: vsel_float:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: vsel_float:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT: orps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: vsel_float:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: vsel_float:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX-NEXT: retq
+entry:
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
+ ret <4 x float> %vsel
+}
+
+define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) {
+; SSE-LABEL: vsel_float2:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vsel_float2:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+entry:
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
+ ret <4 x float> %vsel
+}
+
+define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
+; SSE2-LABEL: vsel_4xi8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: vsel_4xi8:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT: orps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: vsel_4xi8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: vsel_4xi8:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vsel_4xi8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-NEXT: retq
+entry:
+ %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
+ ret <4 x i8> %vsel
+}
+
+define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
+; SSE2-LABEL: vsel_4xi16:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: vsel_4xi16:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT: orps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: vsel_4xi16:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: vsel_4xi16:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vsel_4xi16:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: retq
+entry:
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
+ ret <4 x i16> %vsel
+}
+
+define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
+; SSE2-LABEL: vsel_i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: vsel_i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT: orps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: vsel_i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: vsel_i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vsel_i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: retq
+entry:
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
+ ret <4 x i32> %vsel
+}
+
+define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
+; SSE-LABEL: vsel_double:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vsel_double:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+entry:
+ %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
+ ret <2 x double> %vsel
+}
+
+define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
+; SSE-LABEL: vsel_i64:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vsel_i64:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+entry:
+ %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
+ ret <2 x i64> %vsel
+}
+
+define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
+; SSE2-LABEL: vsel_8xi16:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: vsel_8xi16:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT: orps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: vsel_8xi16:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: vsel_8xi16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX-NEXT: retq
+entry:
+ %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
+ ret <8 x i16> %vsel
+}
+
+define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
+; SSE2-LABEL: vsel_i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: vsel_i8:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT: orps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: vsel_i8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: vsel_i8:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+entry:
+ %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
+ ret <16 x i8> %vsel
+}
+
+
+; AVX256 tests:
+
+define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
+; SSE-LABEL: vsel_float8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movss %xmm0, %xmm2
+; SSE-NEXT: movss %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vsel_float8:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX-NEXT: retq
+entry:
+ %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
+ ret <8 x float> %vsel
+}
+
+define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
+; SSE-LABEL: vsel_i328:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movss %xmm0, %xmm2
+; SSE-NEXT: movss %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vsel_i328:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vsel_i328:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+entry:
+ %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
+ ret <8 x i32> %vsel
+}
+
+define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
+; SSE2-LABEL: vsel_double8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movsd %xmm0, %xmm4
+; SSE2-NEXT: movsd %xmm2, %xmm6
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm5, %xmm1
+; SSE2-NEXT: movaps %xmm6, %xmm2
+; SSE2-NEXT: movaps %xmm7, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: vsel_double8:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movsd %xmm0, %xmm4
+; SSSE3-NEXT: movsd %xmm2, %xmm6
+; SSSE3-NEXT: movaps %xmm4, %xmm0
+; SSSE3-NEXT: movaps %xmm5, %xmm1
+; SSSE3-NEXT: movaps %xmm6, %xmm2
+; SSSE3-NEXT: movaps %xmm7, %xmm3
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: vsel_double8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
+; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
+; SSE41-NEXT: movaps %xmm5, %xmm1
+; SSE41-NEXT: movaps %xmm7, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: vsel_double8:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
+; AVX-NEXT: retq
+entry:
+ %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
+ ret <8 x double> %vsel
+}
+
+define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
+; SSE2-LABEL: vsel_i648:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movsd %xmm0, %xmm4
+; SSE2-NEXT: movsd %xmm2, %xmm6
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm5, %xmm1
+; SSE2-NEXT: movaps %xmm6, %xmm2
+; SSE2-NEXT: movaps %xmm7, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: vsel_i648:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movsd %xmm0, %xmm4
+; SSSE3-NEXT: movsd %xmm2, %xmm6
+; SSSE3-NEXT: movaps %xmm4, %xmm0
+; SSSE3-NEXT: movaps %xmm5, %xmm1
+; SSSE3-NEXT: movaps %xmm6, %xmm2
+; SSSE3-NEXT: movaps %xmm7, %xmm3
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: vsel_i648:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: movaps %xmm5, %xmm1
+; SSE41-NEXT: movaps %xmm7, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: vsel_i648:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vsel_i648:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-NEXT: retq
+entry:
+ %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
+ ret <8 x i64> %vsel
+}
+
+define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
+; SSE-LABEL: vsel_double4:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movsd %xmm0, %xmm2
+; SSE-NEXT: movsd %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vsel_double4:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; AVX-NEXT: retq
+entry:
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
+ ret <4 x double> %vsel
+}
+
+define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
+; SSE2-LABEL: testa:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movapd %xmm1, %xmm2
+; SSE2-NEXT: cmplepd %xmm0, %xmm2
+; SSE2-NEXT: andpd %xmm2, %xmm0
+; SSE2-NEXT: andnpd %xmm1, %xmm2
+; SSE2-NEXT: orpd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: testa:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movapd %xmm1, %xmm2
+; SSSE3-NEXT: cmplepd %xmm0, %xmm2
+; SSSE3-NEXT: andpd %xmm2, %xmm0
+; SSSE3-NEXT: andnpd %xmm1, %xmm2
+; SSSE3-NEXT: orpd %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: testa:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movapd %xmm0, %xmm2
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: cmplepd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: testa:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+entry:
+ %max_is_x = fcmp oge <2 x double> %x, %y
+ %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y
+ ret <2 x double> %max
+}
+
+define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
+; SSE2-LABEL: testb:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movapd %xmm1, %xmm2
+; SSE2-NEXT: cmpnlepd %xmm0, %xmm2
+; SSE2-NEXT: andpd %xmm2, %xmm0
+; SSE2-NEXT: andnpd %xmm1, %xmm2
+; SSE2-NEXT: orpd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: testb:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movapd %xmm1, %xmm2
+; SSSE3-NEXT: cmpnlepd %xmm0, %xmm2
+; SSSE3-NEXT: andpd %xmm2, %xmm0
+; SSSE3-NEXT: andnpd %xmm1, %xmm2
+; SSSE3-NEXT: orpd %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: testb:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movapd %xmm0, %xmm2
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: cmpnlepd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: testb:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vcmpnlepd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+entry:
+ %min_is_x = fcmp ult <2 x double> %x, %y
+ %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
+ ret <2 x double> %min
+}
+
+; If we can figure out that a blend has a constant mask, we should emit the
+; blend instruction with an immediate mask.
+define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
+; SSE-LABEL: constant_blendvpd_avx:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movsd %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: constant_blendvpd_avx:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX-NEXT: retq
+entry:
+ %select = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
+ ret <4 x double> %select
+}
+
+define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
+; SSE2-LABEL: constant_blendvps_avx:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0]
+; SSE2-NEXT: andps %xmm4, %xmm2
+; SSE2-NEXT: movaps {{.*#+}} xmm5 = [0,0,0,4294967295]
+; SSE2-NEXT: andps %xmm5, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm4, %xmm3
+; SSE2-NEXT: andps %xmm5, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: constant_blendvps_avx:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0]
+; SSSE3-NEXT: andps %xmm4, %xmm2
+; SSSE3-NEXT: movaps {{.*#+}} xmm5 = [0,0,0,4294967295]
+; SSSE3-NEXT: andps %xmm5, %xmm0
+; SSSE3-NEXT: orps %xmm2, %xmm0
+; SSSE3-NEXT: andps %xmm4, %xmm3
+; SSSE3-NEXT: andps %xmm5, %xmm1
+; SSSE3-NEXT: orps %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: constant_blendvps_avx:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: constant_blendvps_avx:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
+; AVX-NEXT: retq
+entry:
+ %select = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd
+ ret <8 x float> %select
+}
+
+define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
+; SSE2-LABEL: constant_pblendvb_avx2:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSE2-NEXT: andps %xmm4, %xmm2
+; SSE2-NEXT: movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE2-NEXT: andps %xmm5, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm4, %xmm3
+; SSE2-NEXT: andps %xmm5, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: constant_pblendvb_avx2:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSSE3-NEXT: andps %xmm4, %xmm2
+; SSSE3-NEXT: movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSSE3-NEXT: andps %xmm5, %xmm0
+; SSSE3-NEXT: orps %xmm2, %xmm0
+; SSSE3-NEXT: andps %xmm4, %xmm3
+; SSSE3-NEXT: andps %xmm5, %xmm1
+; SSSE3-NEXT: orps %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: constant_pblendvb_avx2:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: pblendvb %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: constant_pblendvb_avx2:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_pblendvb_avx2:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %select = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd
+ ret <32 x i8> %select
+}
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+;; 4 tests for shufflevectors that optimize to blend + immediate
+define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: blend_shufflevector_4xfloat:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_shufflevector_4xfloat:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_shufflevector_4xfloat:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_shufflevector_4xfloat:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX-NEXT: retq
+entry:
+ %select = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x float> %select
+}
+
+define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: blend_shufflevector_8xfloat:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movss %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_shufflevector_8xfloat:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movss %xmm0, %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: movaps %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_shufflevector_8xfloat:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_shufflevector_8xfloat:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7]
+; AVX-NEXT: retq
+entry:
+ %select = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15>
+ ret <8 x float> %select
+}
+
+define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: blend_shufflevector_4xdouble:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movsd %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_shufflevector_4xdouble:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movsd %xmm0, %xmm2
+; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_shufflevector_4xdouble:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_shufflevector_4xdouble:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX-NEXT: retq
+entry:
+ %select = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ ret <4 x double> %select
+}
+
+define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: blend_shufflevector_4xi64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movsd %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_shufflevector_4xi64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movsd %xmm2, %xmm0
+; SSSE3-NEXT: movaps %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_shufflevector_4xi64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: movaps %xmm3, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: blend_shufflevector_4xi64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_shufflevector_4xi64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
+entry:
+ %select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+ ret <4 x i64> %select
+}
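
The SSE2/SSSE3 patterns above all lower a constant-mask select the same way: with no variable blend instruction, each lane is computed as (v1 & mask) | (v2 & ~mask) with AND/OR masking (the constant-mask tests simply materialize both mask and ~mask; testa/testb use andpd/andnpd on a mask computed by cmppd), while SSE4.1 and AVX replace this with a single immediate blend. A one-lane scalar sketch:

    #include <stdint.h>

    /* Scalar model of one lane of the SSE2 select lowering: mask is
       all-ones where the <N x i1> condition is true, all-zeros
       elsewhere. */
    uint32_t blend_lane(uint32_t v1, uint32_t v2, uint32_t mask) {
        return (v1 & mask) | (v2 & ~mask);
    }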
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index b6d43e9..4b269dc 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -1,221 +1,1255 @@
-; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=SSE41
-; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE
-; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX
+; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=SSE41
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX
-define <4 x i32> @test1(<4 x i32> %a) {
- %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
- ret <4 x i32> %div
+target triple = "x86_64-unknown-unknown"
+define <4 x i32> @test1(<4 x i32> %a) {
; SSE41-LABEL: test1:
-; SSE41: pmuludq
-; SSE41: pshufd $49
-; SSE41: pmuludq
-; SSE41: shufps $-35
-; SSE41: psubd
-; SSE41: psrld $1
-; SSE41: padd
-; SSE41: psrld $2
-
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pmuludq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm1, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT: psubd %xmm2, %xmm0
+; SSE41-NEXT: psrld $1, %xmm0
+; SSE41-NEXT: paddd %xmm2, %xmm0
+; SSE41-NEXT: psrld $2, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test1:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm1, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: psubd %xmm2, %xmm0
+; SSE-NEXT: psrld $1, %xmm0
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: psrld $2, %xmm0
+; SSE-NEXT: retq
+;
; AVX-LABEL: test1:
-; AVX: vpmuludq
-; AVX: vpshufd $49
-; AVX: vpmuludq
-; AVX: vshufps $-35
-; AVX: vpsubd
-; AVX: vpsrld $1
-; AVX: vpadd
-; AVX: vpsrld $2
+; AVX: # BB#0:
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %div
}
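+; Note: 613566757 = 0x24924925 satisfies 7 * M = 2^32 + 3, so with
+; q = mulhu(n, M) the sequence (((n - q) >> 1) + q) >> 2 computes n / 7
+; for every unsigned 32-bit n (the round-up magic-number trick).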
define <8 x i32> @test2(<8 x i32> %a) {
+; SSE41-LABEL: test2:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm4, %xmm5
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE41-NEXT: psubd %xmm3, %xmm0
+; SSE41-NEXT: psrld $1, %xmm0
+; SSE41-NEXT: paddd %xmm3, %xmm0
+; SSE41-NEXT: psrld $2, %xmm0
+; SSE41-NEXT: pmuludq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm4, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT: psubd %xmm2, %xmm1
+; SSE41-NEXT: psrld $1, %xmm1
+; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: psrld $2, %xmm1
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test2:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pmuludq %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT: psubd %xmm3, %xmm0
+; SSE-NEXT: psrld $1, %xmm0
+; SSE-NEXT: paddd %xmm3, %xmm0
+; SSE-NEXT: psrld $2, %xmm0
+; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: psubd %xmm2, %xmm1
+; SSE-NEXT: psrld $1, %xmm1
+; SSE-NEXT: paddd %xmm2, %xmm1
+; SSE-NEXT: psrld $2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2:
+; AVX: # BB#0:
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsrld $1, %ymm0, %ymm0
+; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsrld $2, %ymm0, %ymm0
+; AVX-NEXT: retq
%div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
ret <8 x i32> %div
-
-; AVX-LABEL: test2:
-; AVX: vpbroadcastd
-; AVX: vpalignr $4
-; AVX: vpmuludq
-; AVX: vpmuludq
-; AVX: vpblendd $170
-; AVX: vpsubd
-; AVX: vpsrld $1
-; AVX: vpadd
-; AVX: vpsrld $2
}
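+; The same trick in 16 bits: 9363 = 0x2493 satisfies 7 * M = 2^16 + 5, and
+; pmulhuw yields the high halves of all eight products in one instruction.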
define <8 x i16> @test3(<8 x i16> %a) {
- %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
- ret <8 x i16> %div
-
; SSE41-LABEL: test3:
-; SSE41: pmulhuw
-; SSE41: psubw
-; SSE41: psrlw $1
-; SSE41: paddw
-; SSE41: psrlw $2
-
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE41-NEXT: pmulhuw %xmm0, %xmm1
+; SSE41-NEXT: psubw %xmm1, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: paddw %xmm1, %xmm0
+; SSE41-NEXT: psrlw $2, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test3:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE-NEXT: pmulhuw %xmm0, %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: psrlw $1, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: psrlw $2, %xmm0
+; SSE-NEXT: retq
+;
; AVX-LABEL: test3:
-; AVX: vpmulhuw
-; AVX: vpsubw
-; AVX: vpsrlw $1
-; AVX: vpaddw
-; AVX: vpsrlw $2
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %div
}
define <16 x i16> @test4(<16 x i16> %a) {
+; SSE41-LABEL: test4:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pmulhuw %xmm2, %xmm3
+; SSE41-NEXT: psubw %xmm3, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: paddw %xmm3, %xmm0
+; SSE41-NEXT: psrlw $2, %xmm0
+; SSE41-NEXT: pmulhuw %xmm1, %xmm2
+; SSE41-NEXT: psubw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: paddw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $2, %xmm1
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test4:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pmulhuw %xmm2, %xmm3
+; SSE-NEXT: psubw %xmm3, %xmm0
+; SSE-NEXT: psrlw $1, %xmm0
+; SSE-NEXT: paddw %xmm3, %xmm0
+; SSE-NEXT: psrlw $2, %xmm0
+; SSE-NEXT: pmulhuw %xmm1, %xmm2
+; SSE-NEXT: psubw %xmm2, %xmm1
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: paddw %xmm2, %xmm1
+; SSE-NEXT: psrlw $2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test4:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX-NEXT: retq
%div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
ret <16 x i16> %div
-
-; AVX-LABEL: test4:
-; AVX: vpmulhuw
-; AVX: vpsubw
-; AVX: vpsrlw $1
-; AVX: vpaddw
-; AVX: vpsrlw $2
-; AVX-NOT: vpmulhuw
}
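+; Signed 16-bit division switches to 18725 = 0x4925 (7 * M = 2^17 + 3):
+; after q = mulhs(n, M), the quotient is (q >> 1) + (q >>u 15), the final
+; add folding the sign bit back in so the result rounds towards zero.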
define <8 x i16> @test5(<8 x i16> %a) {
- %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
- ret <8 x i16> %div
-
; SSE41-LABEL: test5:
-; SSE41: pmulhw
-; SSE41: psrlw $15
-; SSE41: psraw $1
-; SSE41: paddw
-
+; SSE41: # BB#0:
+; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $15, %xmm1
+; SSE41-NEXT: psraw $1, %xmm0
+; SSE41-NEXT: paddw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test5:
+; SSE: # BB#0:
+; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $15, %xmm1
+; SSE-NEXT: psraw $1, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
; AVX-LABEL: test5:
-; AVX: vpmulhw
-; AVX: vpsrlw $15
-; AVX: vpsraw $1
-; AVX: vpaddw
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %div
}
define <16 x i16> @test6(<16 x i16> %a) {
+; SSE41-LABEL: test6:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; SSE41-NEXT: pmulhw %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrlw $15, %xmm3
+; SSE41-NEXT: psraw $1, %xmm0
+; SSE41-NEXT: paddw %xmm3, %xmm0
+; SSE41-NEXT: pmulhw %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $15, %xmm2
+; SSE41-NEXT: psraw $1, %xmm1
+; SSE41-NEXT: paddw %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test6:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; SSE-NEXT: pmulhw %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrlw $15, %xmm3
+; SSE-NEXT: psraw $1, %xmm0
+; SSE-NEXT: paddw %xmm3, %xmm0
+; SSE-NEXT: pmulhw %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrlw $15, %xmm2
+; SSE-NEXT: psraw $1, %xmm1
+; SSE-NEXT: paddw %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test6:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vpsrlw $15, %ymm0, %ymm1
+; AVX-NEXT: vpsraw $1, %ymm0, %ymm0
+; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
ret <16 x i16> %div
-
-; AVX-LABEL: test6:
-; AVX: vpmulhw
-; AVX: vpsrlw $15
-; AVX: vpsraw $1
-; AVX: vpaddw
-; AVX-NOT: vpmulhw
}
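+; There is no vector multiply-high for i8, so this sdiv is scalarized:
+; each lane is extracted, run through the 8-bit magic sequence
+; (imull $-109, add the dividend back, shifts), and reinserted.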
define <16 x i8> @test7(<16 x i8> %a) {
- %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
- ret <16 x i8> %div
-
-; FIXME: scalarized
; SSE41-LABEL: test7:
-; SSE41: pext
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrb $1, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pextrb $0, %xmm0, %ecx
+; SSE41-NEXT: movsbl %cl, %ecx
+; SSE41-NEXT: imull $-109, %ecx, %edx
+; SSE41-NEXT: shrl $8, %edx
+; SSE41-NEXT: addb %dl, %cl
+; SSE41-NEXT: movb %cl, %dl
+; SSE41-NEXT: shrb $7, %dl
+; SSE41-NEXT: sarb $2, %cl
+; SSE41-NEXT: addb %dl, %cl
+; SSE41-NEXT: movzbl %cl, %ecx
+; SSE41-NEXT: movd %ecx, %xmm1
+; SSE41-NEXT: pinsrb $1, %eax, %xmm1
+; SSE41-NEXT: pextrb $2, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $2, %eax, %xmm1
+; SSE41-NEXT: pextrb $3, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $3, %eax, %xmm1
+; SSE41-NEXT: pextrb $4, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $4, %eax, %xmm1
+; SSE41-NEXT: pextrb $5, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $5, %eax, %xmm1
+; SSE41-NEXT: pextrb $6, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $6, %eax, %xmm1
+; SSE41-NEXT: pextrb $7, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $7, %eax, %xmm1
+; SSE41-NEXT: pextrb $8, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $8, %eax, %xmm1
+; SSE41-NEXT: pextrb $9, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $9, %eax, %xmm1
+; SSE41-NEXT: pextrb $10, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $10, %eax, %xmm1
+; SSE41-NEXT: pextrb $11, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $11, %eax, %xmm1
+; SSE41-NEXT: pextrb $12, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $12, %eax, %xmm1
+; SSE41-NEXT: pextrb $13, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $13, %eax, %xmm1
+; SSE41-NEXT: pextrb $14, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $14, %eax, %xmm1
+; SSE41-NEXT: pextrb $15, %xmm0, %eax
+; SSE41-NEXT: movsbl %al, %eax
+; SSE41-NEXT: imull $-109, %eax, %ecx
+; SSE41-NEXT: shrl $8, %ecx
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movb %al, %cl
+; SSE41-NEXT: shrb $7, %cl
+; SSE41-NEXT: sarb $2, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pinsrb $15, %eax, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test7:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm4
+; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: imull $-109, %eax, %ecx
+; SSE-NEXT: shrl $8, %ecx
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movb %cl, %al
+; SSE-NEXT: shrb $7, %al
+; SSE-NEXT: sarb $2, %cl
+; SSE-NEXT: addb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: retq
+;
; AVX-LABEL: test7:
-; AVX: pext
+; AVX: # BB#0:
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: movsbl %al, %eax
+; AVX-NEXT: imull $-109, %eax, %ecx
+; AVX-NEXT: shrl $8, %ecx
+; AVX-NEXT: addb %cl, %al
+; AVX-NEXT: movb %al, %cl
+; AVX-NEXT: shrb $7, %cl
+; AVX-NEXT: sarb $2, %al
+; AVX-NEXT: addb %cl, %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: vpextrb $0, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %dl
+; AVX-NEXT: shrb $7, %dl
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movzbl %cl, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpextrb $2, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $3, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $4, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $5, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $6, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $7, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $8, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $9, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $10, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $11, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $12, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $13, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $14, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpextrb $15, %xmm0, %ecx
+; AVX-NEXT: movsbl %cl, %ecx
+; AVX-NEXT: imull $-109, %ecx, %edx
+; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0
+; AVX-NEXT: shrl $8, %edx
+; AVX-NEXT: addb %dl, %cl
+; AVX-NEXT: movb %cl, %al
+; AVX-NEXT: shrb $7, %al
+; AVX-NEXT: sarb $2, %cl
+; AVX-NEXT: addb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %div
}
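+; For signed i32 the magic is 2454267027 = 0x92492493. Plain SSE2 lacks
+; pmuldq, so the signed high half is rebuilt from pmuludq plus the
+; psrad/pand/paddd sign-correction terms visible in the SSE checks.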
define <4 x i32> @test8(<4 x i32> %a) {
- %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
- ret <4 x i32> %div
-
; SSE41-LABEL: test8:
-; SSE41: pmuldq
-; SSE41: pshufd $49
-; SSE41-NOT: pshufd $49
-; SSE41: pmuldq
-; SSE41: shufps $-35
-; SSE41: pshufd $-40
-; SSE41: padd
-; SSE41: psrld $31
-; SSE41: psrad $2
-; SSE41: padd
-
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pmuldq %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm2, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $31, %xmm0
+; SSE41-NEXT: psrad $2, %xmm1
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; SSE-LABEL: test8:
-; SSE: psrad $31
-; SSE: pand
-; SSE: paddd
-; SSE: pmuludq
-; SSE: pshufd $49
-; SSE-NOT: pshufd $49
-; SSE: pmuludq
-; SSE: shufps $-35
-; SSE: pshufd $-40
-; SSE: psubd
-; SSE: padd
-; SSE: psrld $31
-; SSE: psrad $2
-; SSE: padd
-
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: paddd %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pmuludq %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT: psubd %xmm3, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $31, %xmm0
+; SSE-NEXT: psrad $2, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
; AVX-LABEL: test8:
-; AVX: vpmuldq
-; AVX: vpshufd $49
-; AVX-NOT: vpshufd $49
-; AVX: vpmuldq
-; AVX: vshufps $-35
-; AVX: vpshufd $-40
-; AVX: vpadd
-; AVX: vpsrld $31
-; AVX: vpsrad $2
-; AVX: vpadd
+; AVX: # BB#0:
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT: vpmuldq %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %div
}
define <8 x i32> @test9(<8 x i32> %a) {
+; SSE41-LABEL: test9:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT: # kill: XMM0<def> XMM3<kill>
+; SSE41-NEXT: pmuldq %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm4, %xmm5
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm5[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT: paddd %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrld $31, %xmm3
+; SSE41-NEXT: psrad $2, %xmm0
+; SSE41-NEXT: paddd %xmm3, %xmm0
+; SSE41-NEXT: pmuldq %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm4, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrld $31, %xmm2
+; SSE41-NEXT: psrad $2, %xmm1
+; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test9:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm5
+; SSE-NEXT: psrad $31, %xmm5
+; SSE-NEXT: pand %xmm1, %xmm5
+; SSE-NEXT: paddd %xmm0, %xmm5
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm6, %xmm7
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm7[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: psubd %xmm5, %xmm0
+; SSE-NEXT: paddd %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrld $31, %xmm3
+; SSE-NEXT: psrad $2, %xmm0
+; SSE-NEXT: paddd %xmm3, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: paddd %xmm4, %xmm3
+; SSE-NEXT: pmuludq %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm6, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT: psubd %xmm3, %xmm1
+; SSE-NEXT: paddd %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrld $31, %xmm2
+; SSE-NEXT: psrad $2, %xmm1
+; SSE-NEXT: paddd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test9:
+; AVX: # BB#0:
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
+; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vpsrld $31, %ymm0, %ymm1
+; AVX-NEXT: vpsrad $2, %ymm0, %ymm0
+; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
ret <8 x i32> %div
-
-; AVX-LABEL: test9:
-; AVX: vpalignr $4
-; AVX: vpbroadcastd
-; AVX: vpmuldq
-; AVX: vpmuldq
-; AVX: vpblendd $170
-; AVX: vpadd
-; AVX: vpsrld $31
-; AVX: vpsrad $2
-; AVX: vpadd
}
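+; urem by a constant reuses the udiv expansion and finishes with
+; n - (n / 7) * 7, hence the trailing pmulld (a pmuludq pair on SSE2)
+; and psubd in each sequence.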
define <8 x i32> @test10(<8 x i32> %a) {
+; SSE41-LABEL: test10:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm4, %xmm5
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psubd %xmm3, %xmm5
+; SSE41-NEXT: psrld $1, %xmm5
+; SSE41-NEXT: paddd %xmm3, %xmm5
+; SSE41-NEXT: psrld $2, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7]
+; SSE41-NEXT: pmulld %xmm3, %xmm5
+; SSE41-NEXT: psubd %xmm5, %xmm0
+; SSE41-NEXT: pmuludq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm4, %xmm5
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psubd %xmm2, %xmm4
+; SSE41-NEXT: psrld $1, %xmm4
+; SSE41-NEXT: paddd %xmm2, %xmm4
+; SSE41-NEXT: psrld $2, %xmm4
+; SSE41-NEXT: pmulld %xmm3, %xmm4
+; SSE41-NEXT: psubd %xmm4, %xmm1
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test10:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pmuludq %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: psubd %xmm3, %xmm5
+; SSE-NEXT: psrld $1, %xmm5
+; SSE-NEXT: paddd %xmm3, %xmm5
+; SSE-NEXT: psrld $2, %xmm5
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm5
+; SSE-NEXT: pmuludq %xmm3, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,1,3]
+; SSE-NEXT: psubd %xmm5, %xmm0
+; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psubd %xmm2, %xmm4
+; SSE-NEXT: psrld $1, %xmm4
+; SSE-NEXT: paddd %xmm2, %xmm4
+; SSE-NEXT: psrld $2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm4
+; SSE-NEXT: pmuludq %xmm3, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
+; SSE-NEXT: psubd %xmm4, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test10:
+; AVX: # BB#0:
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vpsrld $1, %ymm2, %ymm2
+; AVX-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX-NEXT: vpsrld $2, %ymm1, %ymm1
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
ret <8 x i32> %rem
-
-; AVX-LABEL: test10:
-; AVX: vpbroadcastd
-; AVX: vpalignr $4
-; AVX: vpmuludq
-; AVX: vpmuludq
-; AVX: vpblendd $170
-; AVX: vpsubd
-; AVX: vpsrld $1
-; AVX: vpadd
-; AVX: vpsrld $2
-; AVX: vpmulld
}
define <8 x i32> @test11(<8 x i32> %a) {
+; SSE41-LABEL: test11:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pmuldq %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm4, %xmm5
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE41-NEXT: paddd %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: psrld $31, %xmm5
+; SSE41-NEXT: psrad $2, %xmm3
+; SSE41-NEXT: paddd %xmm5, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7]
+; SSE41-NEXT: pmulld %xmm5, %xmm3
+; SSE41-NEXT: psubd %xmm3, %xmm0
+; SSE41-NEXT: pmuldq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm4, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT: paddd %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrld $31, %xmm3
+; SSE41-NEXT: psrad $2, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm2
+; SSE41-NEXT: pmulld %xmm5, %xmm2
+; SSE41-NEXT: psubd %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test11:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: psrad $31, %xmm6
+; SSE-NEXT: pand %xmm2, %xmm6
+; SSE-NEXT: paddd %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm0, %xmm7
+; SSE-NEXT: pmuludq %xmm2, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm4[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT: psubd %xmm6, %xmm7
+; SSE-NEXT: paddd %xmm0, %xmm7
+; SSE-NEXT: movdqa %xmm7, %xmm4
+; SSE-NEXT: psrld $31, %xmm4
+; SSE-NEXT: psrad $2, %xmm7
+; SSE-NEXT: paddd %xmm4, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm7
+; SSE-NEXT: pmuludq %xmm4, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT: psubd %xmm7, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: psrad $31, %xmm6
+; SSE-NEXT: pand %xmm2, %xmm6
+; SSE-NEXT: paddd %xmm3, %xmm6
+; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: psubd %xmm6, %xmm2
+; SSE-NEXT: paddd %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: psrld $31, %xmm3
+; SSE-NEXT: psrad $2, %xmm2
+; SSE-NEXT: paddd %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm2
+; SSE-NEXT: pmuludq %xmm4, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: psubd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test11:
+; AVX: # BB#0:
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
+; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm1
+; AVX-NEXT: vpsrld $31, %ymm1, %ymm2
+; AVX-NEXT: vpsrad $2, %ymm1, %ymm1
+; AVX-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
ret <8 x i32> %rem
-
-; AVX-LABEL: test11:
-; AVX: vpalignr $4
-; AVX: vpbroadcastd
-; AVX: vpmuldq
-; AVX: vpmuldq
-; AVX: vpblendd $170
-; AVX: vpadd
-; AVX: vpsrld $31
-; AVX: vpsrad $2
-; AVX: vpadd
-; AVX: vpmulld
}
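+; test12 is fully constant, so the urem should fold to zero at compile
+; time; only an xorps/vxorps should survive on any subtarget.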
define <2 x i16> @test12() {
+; SSE41-LABEL: test12:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test12:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test12:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0
%I9 = insertelement <2 x i16> %I8, i16 -1, i32 1
%B9 = urem <2 x i16> %I9, %I9
ret <2 x i16> %B9
-; AVX-LABEL: test12:
-; AVX: xorps
+}
+define <4 x i32> @PR20355(<4 x i32> %a) {
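+; PR20355: sdiv by 3 uses the magic constant 1431655766 = 0x55555556
+; (3 * M = 2^32 + 2); the quotient is mulhs(n, M) plus its own sign bit,
+; with no extra arithmetic shift needed.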
+; SSE41-LABEL: PR20355:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm2, %xmm1
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: psrld $31, %xmm1
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: PR20355:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrad $31, %xmm2
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: paddd %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: psubd %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $31, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR20355:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,3],xmm0[1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %sdiv = sdiv <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %sdiv
}
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
new file mode 100644
index 0000000..7a329d7
--- /dev/null
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -0,0 +1,943 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+;
+; Just one 32-bit run to make sure we do reasonable things there.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=i686 -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
+
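+; Lowering of vector sign extension: SSE4.1 and AVX have dedicated pmovsx
+; forms, while older subtargets must widen with unpacks and then shift
+; arithmetically to replicate the sign bit.
+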
+define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_8i16_to_8i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: # kill: XMM0<def> XMM1<kill>
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: sext_8i16_to_8i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: # kill: XMM0<def> XMM1<kill>
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: pslld $16, %xmm0
+; SSSE3-NEXT: psrad $16, %xmm0
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: pslld $16, %xmm1
+; SSSE3-NEXT: psrad $16, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: sext_8i16_to_8i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pmovzxwd %xmm1, %xmm0
+; SSE41-NEXT: pslld $16, %xmm0
+; SSE41-NEXT: psrad $16, %xmm0
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE41-NEXT: pslld $16, %xmm1
+; SSE41-NEXT: psrad $16, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: sext_8i16_to_8i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sext_8i16_to_8i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; X32-SSE41-LABEL: sext_8i16_to_8i32:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE41-NEXT: pmovzxwd %xmm1, %xmm0
+; X32-SSE41-NEXT: pslld $16, %xmm0
+; X32-SSE41-NEXT: psrad $16, %xmm0
+; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; X32-SSE41-NEXT: pslld $16, %xmm1
+; X32-SSE41-NEXT: psrad $16, %xmm1
+; X32-SSE41-NEXT: retl
+entry:
+ %B = sext <8 x i16> %A to <8 x i32>
+ ret <8 x i32>%B
+}
+
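+; There is no cheap in-register 32->64-bit vector sign extend before AVX
+; here, so the pre-AVX subtargets round-trip each element through a GPR
+; (movd/pextrq plus cltq).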
+define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_4i32_to_4i64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: sext_4i32_to_4i64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSSE3-NEXT: movd %xmm1, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: movd %xmm1, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-NEXT: movd %xmm0, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: movd %xmm0, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: sext_4i32_to_4i64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm3
+; SSE41-NEXT: movd %xmm1, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm3
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm1
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: sext_4i32_to_4i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sext_4i32_to_4i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; X32-SSE41-LABEL: sext_4i32_to_4i64:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
+; X32-SSE41-NEXT: movd %xmm2, %eax
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
+; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
+; X32-SSE41-NEXT: sarl $31, %ecx
+; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; X32-SSE41-NEXT: movd %xmm1, %eax
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
+; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT: sarl $31, %ecx
+; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT: retl
+entry:
+ %B = sext <4 x i32> %A to <4 x i64>
+ ret <4 x i64>%B
+}
+
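+; Sign-extending loads: with SSE4.1 or AVX the load should fold into a
+; single pmovsx from memory; older subtargets load first and then shift.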
+define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_test1:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_sext_test1:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $16, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_sext_test1:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: load_sext_test1:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE41-LABEL: load_sext_test1:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0
+; X32-SSE41-NEXT: retl
+entry:
+ %X = load <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i32>
+ ret <4 x i32>%Y
+}
+
+define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_test2:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movd (%rdi), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_sext_test2:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movd (%rdi), %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_sext_test2:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: load_sext_test2:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE41-LABEL: load_sext_test2:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0
+; X32-SSE41-NEXT: retl
+entry:
+ %X = load <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i32>
+ ret <4 x i32>%Y
+}
+
+define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_test3:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movsbq 1(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movsbq (%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_sext_test3:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movsbq 1(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movsbq (%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_sext_test3:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: load_sext_test3:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE41-LABEL: load_sext_test3:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT: retl
+entry:
+ %X = load <2 x i8>* %ptr
+ %Y = sext <2 x i8> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_test4:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movswq 2(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movswq (%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_sext_test4:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movswq 2(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movswq (%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_sext_test4:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: load_sext_test4:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE41-LABEL: load_sext_test4:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
+; X32-SSE41-NEXT: retl
+entry:
+ %X = load <2 x i16>* %ptr
+ %Y = sext <2 x i16> %X to <2 x i64>
+ ret <2 x i64> %Y
+}
+
+define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
+; SSE2-LABEL: load_sext_test5:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movslq 4(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movslq (%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_sext_test5:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movslq 4(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movslq (%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_sext_test5:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxdq (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: load_sext_test5:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovsxdq (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE41-LABEL: load_sext_test5:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0
+; X32-SSE41-NEXT: retl
+entry:
+ %X = load <2 x i32>* %ptr
+ %Y = sext <2 x i32> %X to <2 x i64>
+ ret <2 x i64> %Y
+}
+
+define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_test6:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_sext_test6:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: psraw $8, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_sext_test6:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: load_sext_test6:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovsxbw (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE41-LABEL: load_sext_test6:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
+; X32-SSE41-NEXT: retl
+entry:
+ %X = load <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i16>
+ ret <8 x i16> %Y
+}
+
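+; None of these targets has a single sext-from-<4 x i1> instruction; every
+; lowering below first shifts each 32-bit lane left by 31 and then arithmetic
+; shifts it back (pslld $31 / psrad $31), replicating the mask bit across the
+; lane before widening to i64.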
+define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
+; SSE2-LABEL: sext_4i1_to_4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: sext_4i1_to_4i64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pslld $31, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSSE3-NEXT: movd %xmm1, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: movd %xmm1, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-NEXT: movd %xmm0, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: movd %xmm0, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: sext_4i1_to_4i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pslld $31, %xmm0
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm3
+; SSE41-NEXT: movd %xmm1, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm3
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm1
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: sext_4i1_to_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sext_4i1_to_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; X32-SSE41-LABEL: sext_4i1_to_4i64:
+; X32-SSE41: # BB#0:
+; X32-SSE41-NEXT: pslld $31, %xmm0
+; X32-SSE41-NEXT: psrad $31, %xmm0
+; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
+; X32-SSE41-NEXT: movd %xmm2, %eax
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
+; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
+; X32-SSE41-NEXT: sarl $31, %ecx
+; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; X32-SSE41-NEXT: movd %xmm1, %eax
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
+; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT: sarl $31, %ecx
+; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT: retl
+ %extmask = sext <4 x i1> %mask to <4 x i64>
+ ret <4 x i64> %extmask
+}
+
+define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
+; SSE2-LABEL: sext_16i8_to_16i16:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: sext_16i8_to_16i16:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa (%rdi), %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: psllw $8, %xmm0
+; SSSE3-NEXT: psraw $8, %xmm0
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT: psllw $8, %xmm1
+; SSSE3-NEXT: psraw $8, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: sext_16i8_to_16i16:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa (%rdi), %xmm1
+; SSE41-NEXT: pmovzxbw %xmm1, %xmm0
+; SSE41-NEXT: psllw $8, %xmm0
+; SSE41-NEXT: psraw $8, %xmm0
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT: psllw $8, %xmm1
+; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: sext_16i8_to_16i16:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sext_16i8_to_16i16:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; X32-SSE41-LABEL: sext_16i8_to_16i16:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: movdqa (%eax), %xmm1
+; X32-SSE41-NEXT: pmovzxbw %xmm1, %xmm0
+; X32-SSE41-NEXT: psllw $8, %xmm0
+; X32-SSE41-NEXT: psraw $8, %xmm0
+; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-SSE41-NEXT: psllw $8, %xmm1
+; X32-SSE41-NEXT: psraw $8, %xmm1
+; X32-SSE41-NEXT: retl
+entry:
+ %X = load <16 x i8>* %ptr
+ %Y = sext <16 x i8> %X to <16 x i16>
+ ret <16 x i16> %Y
+}
+
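+; Same shift idiom as sext_4i1_to_4i64 above, but with a 24-bit shift pair
+; (pslld $24 / psrad $24) to sign-extend the i8 value held in each 32-bit
+; lane.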
+define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
+; SSE2-LABEL: sext_4i8_to_4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pslld $24, %xmm0
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: cltq
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: sext_4i8_to_4i64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pslld $24, %xmm0
+; SSSE3-NEXT: psrad $24, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSSE3-NEXT: movd %xmm1, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: movd %xmm1, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-NEXT: movd %xmm0, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: movd %xmm0, %rax
+; SSSE3-NEXT: cltq
+; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: sext_4i8_to_4i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pslld $24, %xmm0
+; SSE41-NEXT: psrad $24, %xmm0
+; SSE41-NEXT: pmovzxdq %xmm0, %xmm1
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm3
+; SSE41-NEXT: movd %xmm1, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm3
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: cltq
+; SSE41-NEXT: movd %rax, %xmm1
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: sext_4i8_to_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sext_4i8_to_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; X32-SSE41-LABEL: sext_4i8_to_4i64:
+; X32-SSE41: # BB#0:
+; X32-SSE41-NEXT: pslld $24, %xmm0
+; X32-SSE41-NEXT: psrad $24, %xmm0
+; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2
+; X32-SSE41-NEXT: movd %xmm2, %eax
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx
+; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2
+; X32-SSE41-NEXT: sarl $31, %ecx
+; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; X32-SSE41-NEXT: movd %xmm1, %eax
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx
+; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT: sarl $31, %ecx
+; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT: retl
+ %extmask = sext <4 x i8> %mask to <4 x i64>
+ ret <4 x i64> %extmask
+}
+
+define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_4i8_to_4i64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movd (%rdi), %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSE2-NEXT: movd %xmm2, %rax
+; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm2, %rax
+; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSE2-NEXT: movd %xmm2, %rax
+; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm2, %rax
+; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_sext_4i8_to_4i64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movd (%rdi), %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSSE3-NEXT: movd %xmm2, %rax
+; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT: movd %xmm2, %rax
+; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSSE3-NEXT: movd %xmm2, %rax
+; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT: movd %xmm2, %rax
+; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_sext_4i8_to_4i64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT: pmovzxdq %xmm1, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: movsbq %al, %rax
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: movsbq %al, %rax
+; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: movsbq %al, %rax
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: movd %xmm1, %rax
+; SSE41-NEXT: movsbq %al, %rax
+; SSE41-NEXT: movd %rax, %xmm1
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: load_sext_4i8_to_4i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_4i8_to_4i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: movd (%eax), %xmm0
+; X32-SSE41-NEXT: pmovzxbd %xmm0, %xmm1
+; X32-SSE41-NEXT: pmovzxbq %xmm0, %xmm2
+; X32-SSE41-NEXT: movd %xmm2, %eax
+; X32-SSE41-NEXT: movsbl %al, %eax
+; X32-SSE41-NEXT: movd %eax, %xmm0
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0
+; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT: movsbl %al, %eax
+; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; X32-SSE41-NEXT: movd %xmm2, %eax
+; X32-SSE41-NEXT: movsbl %al, %eax
+; X32-SSE41-NEXT: movd %eax, %xmm1
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT: movsbl %al, %eax
+; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: retl
+entry:
+ %X = load <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i64>
+ ret <4 x i64> %Y
+}
+
+define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_4i16_to_4i64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq (%rdi), %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSE2-NEXT: movd %xmm2, %rax
+; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm2, %rax
+; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSE2-NEXT: movd %xmm2, %rax
+; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm2, %rax
+; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_sext_4i16_to_4i64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movq (%rdi), %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSSE3-NEXT: movd %xmm2, %rax
+; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT: movd %xmm2, %rax
+; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSSE3-NEXT: movd %xmm2, %rax
+; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT: movd %xmm2, %rax
+; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_sext_4i16_to_4i64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movq (%rdi), %xmm0
+; SSE41-NEXT: pmovzxwd %xmm0, %xmm1
+; SSE41-NEXT: pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: movswq %ax, %rax
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: movswq %ax, %rax
+; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: movswq %ax, %rax
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: movd %xmm1, %rax
+; SSE41-NEXT: movswq %ax, %rax
+; SSE41-NEXT: movd %rax, %xmm1
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: load_sext_4i16_to_4i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_4i16_to_4i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: movsd (%eax), %xmm0
+; X32-SSE41-NEXT: pmovzxwd %xmm0, %xmm1
+; X32-SSE41-NEXT: pmovzxwq %xmm0, %xmm2
+; X32-SSE41-NEXT: movd %xmm2, %eax
+; X32-SSE41-NEXT: cwtl
+; X32-SSE41-NEXT: movd %eax, %xmm0
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0
+; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT: cwtl
+; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; X32-SSE41-NEXT: movd %xmm2, %eax
+; X32-SSE41-NEXT: cwtl
+; X32-SSE41-NEXT: movd %eax, %xmm1
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT: cwtl
+; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1
+; X32-SSE41-NEXT: sarl $31, %eax
+; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: retl
+entry:
+ %X = load <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i64>
+ ret <4 x i64> %Y
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 4da7e42..30ad366 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1,196 +1,1110 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; FIXME: SSE2 should look like the following:
+; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; FIXME: # BB#0:
+; FIXME-NEXT: punpcklbw %xmm0, %xmm0
+; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
+; FIXME-NEXT: retq
+;
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufb %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,2,4,5,6,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: punpcklwd %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_0101010101010101
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; FIXME: SSE2 should look like the following:
+; FIXME-LABEL: @shuffle_v16i8_0101010101010101
+; FIXME: # BB#0:
+; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
+; FIXME-NEXT: retq
+;
+; SSE2-LABEL: shuffle_v16i8_0101010101010101:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_0101010101010101:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_0101010101010101:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v16i8_0101010101010101:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i8_0101010101010101:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23
-; CHECK-SSE2: punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
ret <16 x i8> %shuffle
}
+define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i8> %shuffle
+}
+
define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12
-; CHECK-SSE2: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: punpckhbw %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: packuswb %xmm2, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20
-; CHECK-SSE2: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20
-; CHECK-SSE2: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1]
-; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: movsd %xmm4, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: movsd %xmm0, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
ret <16 x i8> %shuffle
}
-define <16 x i8> @zext_to_v8i16_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @zext_to_v8i16_shuffle
-; CHECK-SSE2: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
- %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
+define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
+; SSE2-LABEL: trunc_v4i32_shuffle:
+; SSE2: # BB#0:
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc_v4i32_shuffle:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc_v4i32_shuffle:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v4i32_shuffle:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) {
+; We don't have anything useful to check here. This generates hundreds of
+; instructions. Instead, just make sure we survive codegen.
+; ALL-LABEL: stress_test0:
+; ALL: retq
+entry:
+ %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6>
+ %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28>
+ %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8>
+ %s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29>
+ %s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29>
+ %s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17>
+ %s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23>
+ %s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17>
+ %s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+ %s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10>
+ ret <16 x i8> %s.16.0
+}
+
+define <16 x i8> @stress_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
+; There is nothing interesting to check about these instructions other than
+; that they survive codegen. However, we actually do better and delete all of
+; them because the result is 'undef'.
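+; (A minimal sketch of why, using a hypothetical value not present in this
+; test: every lane of
+;   %u = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; is undef, and each lane of the final %s.12.4 below likewise traces back to
+; an undef operand, so the whole chain folds to 'ret <16 x i8> undef'.)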
+;
+; ALL-LABEL: stress_test1:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: retq
+entry:
+ %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0>
+ %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22>
+ %s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9>
+ %s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11>
+ %s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29>
+ %s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef>
+ %s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 9, i32 17, i32 10>
+ %s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef>
+ %s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5>
+ %s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef>
+
+ ret <16 x i8> %s.12.4
+}
+
+define <16 x i8> @PR20540(<8 x i8> %a) {
+; SSE2-LABEL: PR20540:
+; SSE2: # BB#0:
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: PR20540:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: PR20540:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: PR20540:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: movzbl %dil, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd %edi, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: movd %edi, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %a = insertelement <16 x i8> undef, i8 %i, i32 0
+ %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: movzbl %dil, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd %edi, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: movd %edi, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %a = insertelement <16 x i8> undef, i8 %i, i32 0
+ %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
+; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd %edi, %xmm0
+; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movd %edi, %xmm0
+; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX-NEXT: retq
+ %a = insertelement <16 x i8> undef, i8 %i, i32 0
+ %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: movzbl %dil, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd %edi, %xmm0
+; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: movd %edi, %xmm0
+; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %a = insertelement <16 x i8> undef, i8 %i, i32 3
+ %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; SSE2: # BB#0:
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; SSE41: # BB#0:
+; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; AVX: # BB#0:
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX-NEXT: retq
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: retq
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbq %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbq %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <16 x i8> %shuffle
}
-define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @zext_to_v4i32_shuffle
-; CHECK-SSE2: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
+define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbd %xmm0, %xmm0
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
ret <16 x i8> %shuffle
}
-define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @trunc_v4i32_shuffle
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pand
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0
-; CHECK-SSE2-NEXT: retq
- %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbw %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbw %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
ret <16 x i8> %shuffle
}
+
+define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,1,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT: packuswb %xmm0, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT: packuswb %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT: retq
+entry:
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
+
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) {
+; Nothing interesting to test here. Just make sure we don't crash.
+; ALL-LABEL: stress_test2:
+; ALL: retq
+entry:
+ %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5>
+ %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22>
+ %s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19>
+
+ ret <16 x i8> %s.2.0
+}
+
+define void @constant_gets_selected() {
+; ALL-LABEL: constant_gets_selected:
+; ALL-NOT: movd $0, {{%xmm[0-9]+}}
+ %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8>
+ %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+ %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32>
+ store <4 x i32> %weirder_zero, <4 x i32>* undef, align 16
+ store <4 x i32> zeroinitializer, <4 x i32>* undef, align 16
+ ret void
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 78b4ee7..9affee9 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1,219 +1,1138 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define <2 x i64> @shuffle_v2i64_00(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_00
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_00:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v2i64_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2i64_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 0>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_10(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_10
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_10:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_10:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 0>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_11(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_11
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_11:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_11:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_22(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_22
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_22:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v2i64_22:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2i64_22:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm1, %xmm0
+; AVX2-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 2>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_32(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_32
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,0,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_32:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_32:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 2>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_33
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_33:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_33:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 3>
ret <2 x i64> %shuffle
}
define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_00
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0,0]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v2f64_00:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2f64_00:
+; SSE3: # BB#0:
+; SSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2f64_00:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2f64_00:
+; SSE41: # BB#0:
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_00:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_10
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2f64_10:
+; SSE: # BB#0:
+; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_10:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 0>
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_11
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2f64_11:
+; SSE: # BB#0:
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_11:
+; AVX: # BB#0:
+; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
-; FIXME: Should these use movapd + shufpd to remove a domain change at the cost
-; of a mov?
+; SSE2-LABEL: shuffle_v2f64_22:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2f64_22:
+; SSE3: # BB#0:
+; SSE3-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
;
-; CHECK-SSE2-LABEL: @shuffle_v2f64_22
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
-; CHECK-SSE2-NEXT: retq
+; SSSE3-LABEL: shuffle_v2f64_22:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3-NEXT: movapd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2f64_22:
+; SSE41: # BB#0:
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_22:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_32
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,0,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2f64_32:
+; SSE: # BB#0:
+; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_32:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 2>
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_33
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2f64_33:
+; SSE: # BB#0:
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_33:
+; AVX: # BB#0:
+; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
ret <2 x double> %shuffle
}
+define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
+; SSE2-LABEL: shuffle_v2f64_03:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2f64_03:
+; SSE3: # BB#0:
+; SSE3-NEXT: movsd %xmm0, %xmm1
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2f64_03:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2f64_03:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_03:
+; AVX: # BB#0:
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %shuffle
+}
+define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
+; SSE2-LABEL: shuffle_v2f64_21:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2f64_21:
+; SSE3: # BB#0:
+; SSE3-NEXT: movsd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2f64_21:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2f64_21:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_21:
+; AVX: # BB#0:
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
+ ret <2 x double> %shuffle
+}
define <2 x i64> @shuffle_v2i64_02(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_02
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_02:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_02:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_02_copy
-; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[0]
-; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_02_copy:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_02_copy:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_03
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v2i64_03:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2i64_03:
+; SSE3: # BB#0:
+; SSE3-NEXT: movsd %xmm0, %xmm1
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2i64_03:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2i64_03:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v2i64_03:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2i64_03:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_03_copy
-; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v2i64_03_copy:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm1, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2i64_03_copy:
+; SSE3: # BB#0:
+; SSE3-NEXT: movsd %xmm1, %xmm2
+; SSE3-NEXT: movaps %xmm2, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2i64_03_copy:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm1, %xmm2
+; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2i64_03_copy:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v2i64_03_copy:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2i64_03_copy:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
+; AVX2-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_12
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v2i64_12:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2i64_12:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2i64_12:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2i64_12:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_12:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_12_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_12_copy
-; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[0]
-; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v2i64_12_copy:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2i64_12_copy:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2i64_12_copy:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2i64_12_copy:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_12_copy:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_13(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_13
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_13:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_13:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_13_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_13_copy
-; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_13_copy:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_13_copy:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_20(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_20
-; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_20:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_20:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_20_copy
-; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[0]
-; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_20_copy:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_20_copy:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_21
-; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v2i64_21:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2i64_21:
+; SSE3: # BB#0:
+; SSE3-NEXT: movsd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2i64_21:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2i64_21:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v2i64_21:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2i64_21:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_21_copy
-; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v2i64_21_copy:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2i64_21_copy:
+; SSE3: # BB#0:
+; SSE3-NEXT: movsd %xmm2, %xmm1
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2i64_21_copy:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm2, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2i64_21_copy:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v2i64_21_copy:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2i64_21_copy:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3]
+; AVX2-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_30(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_30
-; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[0]
-; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v2i64_30:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2i64_30:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2i64_30:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2i64_30:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_30:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_30_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_30_copy
-; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[0]
-; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v2i64_30_copy:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2i64_30_copy:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
+; SSE3-NEXT: movapd %xmm2, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2i64_30_copy:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2i64_30_copy:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_30_copy:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_31(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_31
-; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[1]
-; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_31:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_31:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_31_copy
-; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_31_copy:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_31_copy:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm1[1]
+; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
ret <2 x i64> %shuffle
}
+
+define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) {
+; SSE-LABEL: shuffle_v2i64_0z:
+; SSE: # BB#0:
+; SSE-NEXT: movq %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_0z:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) {
+; SSE-LABEL: shuffle_v2i64_1z:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_1z:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) {
+; SSE-LABEL: shuffle_v2i64_z0:
+; SSE: # BB#0:
+; SSE-NEXT: movq %xmm0, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2i64_z0:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 0>
+ ret <2 x i64> %shuffle
+}
+
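+; Blending the low element with zero: pre-SSE4.1 targets materialize a zeroed
+; register and merge it into the low lane with movsd; SSE4.1 and AVX can use a
+; blend instead.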
+define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
+; SSE2-LABEL: shuffle_v2i64_z1:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2i64_z1:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movsd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2i64_z1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2i64_z1:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v2i64_z1:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2i64_z1:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 1>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_0z(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_0z:
+; SSE: # BB#0:
+; SSE-NEXT: movq %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_0z:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_1z(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_1z:
+; SSE: # BB#0:
+; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_1z:
+; AVX: # BB#0:
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_z0:
+; SSE: # BB#0:
+; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_z0:
+; AVX: # BB#0:
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 0>
+ ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
+; SSE2-LABEL: shuffle_v2f64_z1:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2f64_z1:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movsd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2f64_z1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2f64_z1:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorpd %xmm1, %xmm1
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v2f64_z1:
+; AVX: # BB#0:
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
+ ret <2 x double> %shuffle
+}
+
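+; Inserting a scalar and zeroing the remaining lane: a GPR-to-xmm movd/movq
+; already zero-extends to 128 bits, so no extra masking is required.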
+define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) {
+; SSE-LABEL: insert_reg_and_zero_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: movd %rdi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_and_zero_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %rdi, %xmm0
+; AVX-NEXT: retq
+ %v = insertelement <2 x i64> undef, i64 %a, i32 0
+ %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: movq (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_and_zero_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq (%rdi), %xmm0
+; AVX-NEXT: retq
+ %a = load i64* %ptr
+ %v = insertelement <2 x i64> undef, i64 %a, i32 0
+ %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x double> @insert_reg_and_zero_v2f64(double %a) {
+; SSE-LABEL: insert_reg_and_zero_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movq %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_and_zero_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %xmm0, %xmm0
+; AVX-NEXT: retq
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_and_zero_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd (%rdi), %xmm0
+; AVX-NEXT: retq
+ %a = load double* %ptr
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %shuffle
+}
+
+define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) {
+; SSE2-LABEL: insert_reg_lo_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %rdi, %xmm1
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_reg_lo_v2i64:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %rdi, %xmm1
+; SSE3-NEXT: movsd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_reg_lo_v2i64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd %rdi, %xmm1
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_reg_lo_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movd %rdi, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_reg_lo_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq %rdi, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_reg_lo_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq %rdi, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+ %v = insertelement <2 x i64> undef, i64 %a, i32 0
+ %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
+; SSE2-LABEL: insert_mem_lo_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlpd (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_mem_lo_v2i64:
+; SSE3: # BB#0:
+; SSE3-NEXT: movlpd (%rdi), %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_mem_lo_v2i64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movlpd (%rdi), %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_mem_lo_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movq (%rdi), %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_mem_lo_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq (%rdi), %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_mem_lo_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+ %a = load i64* %ptr
+ %v = insertelement <2 x i64> undef, i64 %a, i32 0
+ %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) {
+; SSE-LABEL: insert_reg_hi_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: movd %rdi, %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_hi_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %rdi, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %v = insertelement <2 x i64> undef, i64 %a, i32 0
+ %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
+; SSE-LABEL: insert_mem_hi_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: movq (%rdi), %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_hi_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq (%rdi), %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %a = load i64* %ptr
+ %v = insertelement <2 x i64> undef, i64 %a, i32 0
+ %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
+; SSE-LABEL: insert_reg_lo_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_lo_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) {
+; SSE-LABEL: insert_mem_lo_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movlpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_lo_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = load double* %ptr
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_reg_hi_v2f64(double %a, <2 x double> %b) {
+; SSE-LABEL: insert_reg_hi_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_hi_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: retq
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0>
+ ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_mem_hi_v2f64(double* %ptr, <2 x double> %b) {
+; SSE-LABEL: insert_mem_hi_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movhpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_hi_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = load double* %ptr
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0>
+ ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_dup_reg_v2f64(double %a) {
+; FIXME: We should match movddup for SSE3 and higher here.
+;
+; SSE2-LABEL: insert_dup_reg_v2f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_dup_reg_v2f64:
+; SSE3: # BB#0:
+; SSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_dup_reg_v2f64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_dup_reg_v2f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_dup_reg_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT: retq
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %shuffle
+}
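+; The memory form of the splat does match movddup on SSE3 and higher, folding
+; the load into the broadcast.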
+define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
+; SSE2-LABEL: insert_dup_mem_v2f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd (%rdi), %xmm0
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_dup_mem_v2f64:
+; SSE3: # BB#0:
+; SSE3-NEXT: movddup (%rdi), %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_dup_mem_v2f64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movddup (%rdi), %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_dup_mem_v2f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movddup (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_dup_mem_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup (%rdi), %xmm0
+; AVX-NEXT: retq
+ %a = load double* %ptr
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_mem_v2f64_10(<2 x double>* %ptr) {
+; SSE-LABEL: shuffle_mem_v2f64_10:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm0
+; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_mem_v2f64_10:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; AVX-NEXT: retq
+ %a = load <2 x double>* %ptr
+ %shuffle = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x double> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 7d496fa..833b822 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1,170 +1,1386 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0001
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,0,0,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_0001:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0001:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0020
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,0,2,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_0020:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0020:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
ret <4 x i32> %shuffle
}
+define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_0112:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0112:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
+ ret <4 x i32> %shuffle
+}
define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0300
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,3,0,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_0300:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0300:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_1000
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[1,0,0,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_1000:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_1000:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_2200
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,2,0,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_2200:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_2200:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_3330
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[3,3,3,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_3330:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_3330:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_3210
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[3,2,1,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_3210:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_3210:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x i32> %shuffle
}
+define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_2121:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_2121:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1>
+ ret <4 x i32> %shuffle
+}
+
define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_0001
-; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,0,0,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4f32_0001:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0001:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
ret <4 x float> %shuffle
}
define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_0020
-; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,0,2,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4f32_0020:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0020:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
ret <4 x float> %shuffle
}
define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_0300
-; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,3,0,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4f32_0300:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0300:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
ret <4 x float> %shuffle
}
define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_1000
-; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[1,0,0,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4f32_1000:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_1000:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
ret <4 x float> %shuffle
}
define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_2200
-; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[2,2,0,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4f32_2200:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_2200:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
ret <4 x float> %shuffle
}
define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_3330
-; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[3,3,3,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4f32_3330:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_3330:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
ret <4 x float> %shuffle
}
define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_3210
-; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[3,2,1,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4f32_3210:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_3210:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %shuffle
}
+define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: shuffle_v4f32_0011:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0011:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: shuffle_v4f32_2233:
+; SSE: # BB#0:
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_2233:
+; AVX: # BB#0:
+; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+ ret <4 x float> %shuffle
+}
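+; Even/odd lane splats map to the SSE3 movsldup/movshdup instructions; plain
+; SSE2 has to fall back to shufps.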
+define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: shuffle_v4f32_0022:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_0022:
+; SSE3: # BB#0:
+; SSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_0022:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_0022:
+; SSE41: # BB#0:
+; SSE41-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0022:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: shuffle_v4f32_1133:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_1133:
+; SSE3: # BB#0:
+; SSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_1133:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_1133:
+; SSE41: # BB#0:
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_1133:
+; AVX: # BB#0:
+; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ ret <4 x float> %shuffle
+}
define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0124
-; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
-; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v4i32_0124:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_0124:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0124:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_0124:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0124:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0142
-; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
-; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,2]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_0142:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0142:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0412
-; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0]
-; CHECK-SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[2,0],xmm0[1,2]
-; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_0412:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0412:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[1,2]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_4012
-; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0]
-; CHECK-SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[0,2],xmm0[1,2]
-; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_4012:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_4012:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,2]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0145
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_0145:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0145:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0451
-; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1]
-; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,3,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_0451:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0451:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_4501
-; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_4501:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_4501:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_4015
-; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1]
-; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0,1,3]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v4i32_4015:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_4015:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
ret <4 x i32> %shuffle
}
+
+define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_4zzz:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_4zzz:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_4zzz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_4zzz:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_4zzz:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
+
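+; On SSE4.1 and AVX a single insertps can place one lane and zero the rest via
+; the immediate's zero mask; older targets need a zeroed register and a shufps
+; sequence.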
+define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_z4zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_z4zz:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_z4zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_z4zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_z4zz:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_zz4z:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_zz4z:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_zz4z:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_zz4z:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_zz4z:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_zuu4:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_zuu4:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_zuu4:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_zuu4:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_zuu4:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_zzz7:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_zzz7:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_zzz7:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_zzz7:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_zzz7:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_z6zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_z6zz:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_z6zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_z6zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_z6zz:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_4zzz:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_4zzz:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_4zzz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_4zzz:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_4zzz:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_z4zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_z4zz:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_z4zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_z4zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_z4zz:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_zz4z:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_zz4z:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_zz4z:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_zz4z:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_zz4z:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_zuu4:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_zuu4:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_zuu4:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_zuu4:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_zuu4:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,0]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_z6zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_z6zz:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_z6zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_z6zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_z6zz:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+ ret <4 x i32> %shuffle
+}
+
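+; Rotations across the concatenated inputs lower to palignr on SSSE3 and
+; higher, which extracts a byte-aligned window from the register pair; SSE2
+; and SSE3 emulate them with shufps/shufpd sequences.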
+define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_7012:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_7012:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_7012:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_7012:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_7012:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_6701:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_6701:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_6701:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_6701:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_6701:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_5670:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_5670:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_5670:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_5670:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_5670:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_1234:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_1234:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_1234:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_1234:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_1234:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_2345:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_2345:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_2345:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_2345:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_2345:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_3456:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_3456:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_3456:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_3456:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_3456:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x i32> %shuffle
+}
+
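+; Interleaving the two low dwords with undef or zero is exactly a zero
+; extension, so SSE4.1 and AVX targets can use pmovzxdq instead of an explicit
+; punpckldq against a zeroed register.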
+define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_0u1u:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_0u1u:
+; SSE3: # BB#0:
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0u1u:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_0u1u:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxdq %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0u1u:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxdq %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_0z1z:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_0z1z:
+; SSE3: # BB#0:
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0z1z:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_0z1z:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxdq %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0z1z:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxdq %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
+; SSE-LABEL: insert_reg_and_zero_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movd %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_and_zero_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: retq
+ %v = insertelement <4 x i32> undef, i32 %a, i32 0
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_and_zero_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd (%rdi), %xmm0
+; AVX-NEXT: retq
+ %a = load i32* %ptr
+ %v = insertelement <4 x i32> undef, i32 %a, i32 0
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
+; SSE2-LABEL: insert_reg_and_zero_v4f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_reg_and_zero_v4f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_reg_and_zero_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %v = insertelement <4 x float> undef, float %a, i32 0
+ %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_and_zero_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovss (%rdi), %xmm0
+; AVX-NEXT: retq
+ %a = load float* %ptr
+ %v = insertelement <4 x float> undef, float %a, i32 0
+ %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
+; SSE2-LABEL: insert_reg_lo_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %rdi, %xmm1
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_reg_lo_v4i32:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %rdi, %xmm1
+; SSE3-NEXT: movsd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_reg_lo_v4i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd %rdi, %xmm1
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_reg_lo_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movd %rdi, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_reg_lo_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq %rdi, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_reg_lo_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq %rdi, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+ %a.cast = bitcast i64 %a to <2 x i32>
+ %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
+; SSE2-LABEL: insert_mem_lo_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlpd (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_mem_lo_v4i32:
+; SSE3: # BB#0:
+; SSE3-NEXT: movlpd (%rdi), %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_mem_lo_v4i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movlpd (%rdi), %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_mem_lo_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movq (%rdi), %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_mem_lo_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq (%rdi), %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_mem_lo_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+ %a = load <2 x i32>* %ptr
+ %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
+; SSE-LABEL: insert_reg_hi_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movd %rdi, %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_hi_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %rdi, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %a.cast = bitcast i64 %a to <2 x i32>
+ %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
+; SSE-LABEL: insert_mem_hi_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movq (%rdi), %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_hi_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq (%rdi), %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %a = load <2 x i32>* %ptr
+ %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
+; SSE-LABEL: insert_reg_lo_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_lo_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %a.cast = bitcast double %a to <2 x float>
+ %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE-LABEL: insert_mem_lo_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movlpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_lo_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = load <2 x float>* %ptr
+ %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
+; SSE-LABEL: insert_reg_hi_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_hi_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: retq
+ %a.cast = bitcast double %a to <2 x float>
+ %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE-LABEL: insert_mem_hi_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movhpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_hi_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = load <2 x float>* %ptr
+ %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
+; SSE-LABEL: shuffle_mem_v4f32_3210:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_mem_v4f32_3210:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; AVX-NEXT: retq
+ %a = load <4 x float>* %ptr
+ %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x float> %shuffle
+}
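
A note on the mask convention used by every test in these files: the shufflevector mask indexes the concatenation of both vector operands, so for <4 x i32> inputs, entries 0-3 select from the first operand and entries 4-7 from the second. That is why a mask entry of 5 in shuffle_v4i32_0z1z above pulls in element 1 of the zero vector. A minimal standalone sketch of the convention (the function name @mask_convention is hypothetical, not part of the patch):

define <4 x i32> @mask_convention(<4 x i32> %a, <4 x i32> %b) {
  ; Entries 0-3 pick lanes of %a; entries 4-7 pick lanes of %b.
  ; Result lanes here: a[0], b[1], a[1], b[3].
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  ret <4 x i32> %r
}
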
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 5d1922a..59af434 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1,493 +1,1941 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_01012323
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,0,1,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_01012323:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_01012323:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_67452301(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_67452301
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,2,1,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_67452301:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_67452301:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_456789AB
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_456789AB:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_456789AB:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_456789AB:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_456789AB:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00000000
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_00000000:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_00000000:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_00000000:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v8i16_00000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i16_00000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00004444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_00004444:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_00004444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
+define <8 x i16> @shuffle_v8i16_u0u1u2u3(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_u0u1u2u3:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_u0u1u2u3:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_u4u5u6u7(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_u4u5u6u7:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_u4u5u6u7:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
+ ret <8 x i16> %shuffle
+}
define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_31206745
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_31206745:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_31206745:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_44440000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44440000
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_44440000:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44440000:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_44440000:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_44440000:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x i16> %shuffle
}
+define <8 x i16> @shuffle_v8i16_23016745(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23016745:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_23016745:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23026745:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_23026745:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 5>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23016747:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_23016747:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 7>
+ ret <8 x i16> %shuffle
+}
define <8 x i16> @shuffle_v8i16_75643120(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_75643120
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_75643120:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_75643120:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_75643120:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_75643120:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_10545410(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_10545410
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_10545410:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_10545410:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_10545410:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_10545410:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_54105410(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_54105410
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_54105410:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_54105410:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_54105410:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_54105410:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_54101054(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_54101054
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_54101054:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_54101054:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_54101054:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_54101054:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04400440(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04400440
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,4,4,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_04400440:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,4,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04400440:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_04400440:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_04400440:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_40044004(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_40044004
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,0,0,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_40044004:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_40044004:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_40044004:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_40044004:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_26405173
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_26405173:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_26405173:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_26405173:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_26405173:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_20645173
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_20645173:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_20645173:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_20645173:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_20645173:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_26401375
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_26401375:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_26401375:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_26401375:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_26401375:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
ret <8 x i16> %shuffle
}
+define <8 x i16> @shuffle_v8i16_66751643(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_66751643:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_66751643:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_66751643:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_66751643:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 3>
+ ret <8 x i16> %shuffle
+}
+
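
A note on reading the pshufb control masks in the SSSE3/SSE41/AVX checks: pshufb shuffles at byte granularity, so i16 lane k expands to the byte pair 2k, 2k+1 in the mask, and the [12,13,12,13,14,15,10,11,...] pattern above is exactly lanes 6,6,7,5,1,6,4,3 of shuffle_v8i16_66751643. A minimal sketch of the same selection written directly against the SSSE3 intrinsic (the function name @pshufb_66751643 is hypothetical; the intrinsic and its signature are the real ones):

define <8 x i16> @pshufb_66751643(<8 x i16> %a) {
  ; Lane k of the i16 vector lives in bytes 2k and 2k+1, so selecting
  ; lanes <6,6,7,5,1,6,4,3> means naming those byte pairs in order.
  %b = bitcast <8 x i16> %a to <16 x i8>
  %s = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %b, <16 x i8> <i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 10, i8 11, i8 2, i8 3, i8 12, i8 13, i8 8, i8 9, i8 6, i8 7>)
  %r = bitcast <16 x i8> %s to <8 x i16>
  ret <8 x i16> %r
}
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
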
+define <8 x i16> @shuffle_v8i16_60514754(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_60514754:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,5,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_60514754:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_60514754:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_60514754:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 6, i32 0, i32 5, i32 1, i32 4, i32 7, i32 5, i32 4>
+ ret <8 x i16> %shuffle
+}
+
define <8 x i16> @shuffle_v8i16_00444444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00444444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_00444444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_00444444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_00444444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_00444444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_44004444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44004444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,2,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_44004444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44004444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_44004444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_44004444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04404444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04404444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_04404444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04404444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_04404444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_04404444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04400000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04400000
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_04400000:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04400000:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_04400000:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_04400000:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04404567
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_04404567:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_04404567:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0X444444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0X444444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_0X444444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0X444444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_0X444444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0X444444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_44X04444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44X04444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_44X04444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44X04444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_44X04444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_44X04444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_X4404444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_X4404444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_X4404444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_X4404444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_X4404444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_X4404444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0127XXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_0127XXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0127XXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_0127XXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0127XXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXX4563
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_XXXX4563:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XXXX4563:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_XXXX4563:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_XXXX4563:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_4563XXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,2,3]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_4563XXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_4563XXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_4563XXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_4563XXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_01274563
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_01274563:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_01274563:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_01274563:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_01274563:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_45630127
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,1,3]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_45630127:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_45630127:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_45630127:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_45630127:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7>
ret <8 x i16> %shuffle
}
+define <8 x i16> @shuffle_v8i16_37102735(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_37102735:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_37102735:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_37102735:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_37102735:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 5>
+ ret <8 x i16> %shuffle
+}
+
define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_08192a3b
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_08192a3b:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_08192a3b:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d2e3f
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_0c1d2e3f:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_4c5d6e7f
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_4c5d6e7f:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_4c5d6e7f:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_48596a7b
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_48596a7b:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_48596a7b:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_08196e7f
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_08196e7f:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_08196e7f:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 6, i32 14, i32 7, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d6879
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,0,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_0c1d6879:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0c1d6879:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 6, i32 8, i32 7, i32 9>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_109832ba
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm0[2,0,3,1,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,0,3,1,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklqdq %xmm0, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_109832ba:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_109832ba:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_8091a2b3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_8091a2b3
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_8091a2b3:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_8091a2b3:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_c4d5e6f7
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm2 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_c4d5e6f7:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_c4d5e6f7:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0213cedf
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklqdq %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_0213cedf:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0213cedf:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
ret <8 x i16> %shuffle
}
+define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_443aXXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_443aXXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_443aXXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_443aXXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_032dXXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_032dXXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_032dXXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_032dXXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_032dXXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
-define <8 x i16> @shuffle_v8i16_XXXcXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXcXXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
+define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_XXXdXXXX:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_XXXdXXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_012dXXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_012dXXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_012dXXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_012dXXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_012dXXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXXcde3
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,2]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_XXXXcde3:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XXXXcde3:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_XXXXcde3:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_cde3XXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_cde3XXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_cde3XXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_cde3XXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_012dcde3
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm2 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm3 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT: punpckhwd %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm3, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklqdq %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_012dcde3:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_012dcde3:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_012dcde3:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v8i16_012dcde3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i16_012dcde3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm2
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
+
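+; The next tests pull words from both inputs with interleaved undef lanes; as
+; the CHECK lines show, SSSE3 and later use pshufb on both sides of the
+; unpack.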
+define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_XXX1X579:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XXX1X579:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_XXX1X579:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_XXX1X579:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_XX4X8acX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
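+; Inserting a scalar i16 into a zero vector should use movzwl+movd, with a
+; pslldq byte shift when the element lands at a nonzero position.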
+define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_8zzzzzzz:
+; SSE: # BB#0:
+; SSE-NEXT: movzwl %di, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_8zzzzzzz:
+; AVX: # BB#0:
+; AVX-NEXT: movzwl %di, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: retq
+ %a = insertelement <8 x i16> undef, i16 %i, i32 0
+ %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
+; SSE: # BB#0:
+; SSE-NEXT: movzwl %di, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
+; AVX: # BB#0:
+; AVX-NEXT: movzwl %di, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: retq
+ %a = insertelement <8 x i16> undef, i16 %i, i32 0
+ %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
+; SSE: # BB#0:
+; SSE-NEXT: movzwl %di, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
+; AVX: # BB#0:
+; AVX-NEXT: movzwl %di, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT: retq
+ %a = insertelement <8 x i16> undef, i16 %i, i32 0
+ %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
+; SSE: # BB#0:
+; SSE-NEXT: movzwl %di, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
+; AVX: # BB#0:
+; AVX-NEXT: movzwl %di, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX-NEXT: retq
+ %a = insertelement <8 x i16> undef, i16 %i, i32 0
+ %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
+; SSE: # BB#0:
+; SSE-NEXT: movzwl %di, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
+; AVX: # BB#0:
+; AVX-NEXT: movzwl %di, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT: retq
+ %a = insertelement <8 x i16> undef, i16 %i, i32 3
+ %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle
+}
+
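+; Byte-rotation shuffles: SSSE3 and later should select a single palignr,
+; while SSE2 emulates it with psrldq/pslldq/por. A rotation whose live lanes
+; all come from one side degenerates to a lone pslldq or psrldq.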
+define <8 x i16> @shuffle_v8i16_def01234(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_def01234:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_def01234:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_def01234:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_def01234:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_ueuu123u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_ueuu123u:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_ueuu123u:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_ueuu123u:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_ueuu123u:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_56701234(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_56701234:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_56701234:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_56701234:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_56701234:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u6uu123u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_u6uu123u:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u6uu123u:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_u6uu123u:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_u6uu123u:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_uuuu123u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_uuuu123u:
+; SSE2: # BB#0:
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_uuuu123u:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_uuuu123u:
+; SSE41: # BB#0:
+; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_uuuu123u:
+; AVX: # BB#0:
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_bcdef012(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_bcdef012:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_bcdef012:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_bcdef012:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_bcdef012:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_ucdeuu1u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_ucdeuu1u:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_ucdeuu1u:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_ucdeuu1u:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_ucdeuu1u:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 1, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_34567012(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_34567012:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_34567012:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_34567012:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_34567012:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u456uu1u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_u456uu1u:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u456uu1u:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_u456uu1u:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_u456uu1u:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 1, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u456uuuu(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_u456uuuu:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u456uuuu:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_u456uuuu:
+; SSE41: # BB#0:
+; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_u456uuuu:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
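+; Rotations that straddle both inputs should still form a single cross-input
+; palignr on SSSE3 and later.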
+define <8 x i16> @shuffle_v8i16_3456789a(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_3456789a:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_3456789a:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_3456789a:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_3456789a:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u456uu9u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_u456uu9u:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u456uu9u:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_u456uu9u:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_u456uu9u:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 9, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_56789abc(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_56789abc:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_56789abc:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_56789abc:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_56789abc:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u6uu9abu(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_u6uu9abu:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u6uu9abu:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_u6uu9abu:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_u6uu9abu:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
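+; Shuffles that zero- or undef-extend the low words should select pmovzxwq or
+; pmovzxwd on SSE4.1 and later; older targets fall back to pshufd/pshufhw or
+; unpack against a pxor-zeroed register.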
+define <8 x i16> @shuffle_v8i16_0uuu1uuu(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_0uuu1uuu:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0uuu1uuu:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_0uuu1uuu:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0uuu1uuu:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwq %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_0zzz1zzz:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0zzz1zzz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_0zzz1zzz:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0zzz1zzz:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwq %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0u1u2u3u(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_0u1u2u3u:
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0u1u2u3u:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_0u1u2u3u:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0u1u2u3u:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_0z1z2z3z:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0z1z2z3z:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_0z1z2z3z:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0z1z2z3z:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+ ret <8 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
new file mode 100644
index 0000000..4db0280
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -0,0 +1,1269 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
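+; A full splat of element 0 should become vpbroadcastw on AVX2; AVX1 has no
+; integer broadcast, so it builds the splat with vpshufb + vinsertf128.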
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,4,5,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,6,7,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,10,11,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,7,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ ret <16 x i16> %shuffle
+}
+
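+; Each mask below repeats one low-half pattern in both output lanes, so a
+; single 128-bit vpshufb followed by vinsertf128 (vinserti128 on AVX2) is
+; expected to build the whole 256-bit result.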
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
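+; Two-input masks that alternate between %a and %b should lower to blends
+; (vpblendw per 128-bit half or ymm-wide vblendps/vblendpd on AVX1, and
+; ymm-wide vpblendw/vpblendd/vpblendvb on AVX2).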
+define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,0,1,u,u,0,1,u,u,0,1,u,u,16,17,u,u,16,17,u,u,16,17,u,u,16,17]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 24, i32 24, i32 24, i32 24, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,14,15,12,13,10,11,8,9,u,u,u,u,u,u,u,u,30,31,28,29,26,27,24,25]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 27, i32 26, i32 25, i32 24, i32 15, i32 14, i32 13, i32 12>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 3, i32 2, i32 1, i32 0, i32 27, i32 26, i32 25, i32 24, i32 11, i32 10, i32 9, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 13, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 14, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
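+; Word interleavings of the two inputs: the in-lane cases map directly onto
+; vpunpcklwd/vpunpckhwd, while the lane-crossing variants fall back to
+; vpshufb plus a blend on AVX2.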
+define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 9, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 10, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 13, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 14, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 14, i32 14, i32 12, i32 12, i32 10, i32 10, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+ ret <16 x i16> %shuffle
+}
+
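+; Masks with undef elements leave the corresponding bytes unconstrained,
+; which shows up as 'u' entries in the expected shuffle decodings.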
+define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 14, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 2, i32 4, i32 4, i32 undef, i32 6, i32 14, i32 14, i32 undef, i32 12, i32 10, i32 10, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 12, i32 12>
+ ret <16 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
new file mode 100644
index 0000000..79c906b
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -0,0 +1,1562 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
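+; 32 x i8 shuffle lowering tests. The splat-style masks that follow should
+; select byte 0 everywhere, becoming a zero-index vpshufb broadcast on AVX1
+; and vpbroadcastb on AVX2.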
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $15, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: movl $15, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
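+; From here the displaced byte comes from the high 128-bit lane, so the value
+; must cross lanes: AVX1 extracts the high half and merges it back in with
+; vpor, while AVX2 swaps the lanes with vperm2i128 and selects between the
+; swapped and broadcast values with vpblendvb.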
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2],zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,2,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,3,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,4,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,5,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[6],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,6,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[7],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,7,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,xmm2[8],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,8,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[9],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,9,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,10,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,11,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,12,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,13,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[14],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: movl $128, %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: movl $15, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vinserti128 $0, %xmm2, %ymm3, %ymm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
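+; The masks below keep every element inside its own 128-bit lane, so no
+; cross-lane traffic is needed: AVX1 shuffles each half with the same xmm
+; control, and AVX2 uses a single in-lane ymm vpshufb.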
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15,23,23,23,23,23,23,23,23,31,31,31,31,31,31,31,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15,19,19,19,19,23,23,23,23,27,27,27,27,31,31,31,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15, i32 19, i32 19, i32 19, i32 19, i32 23, i32 23, i32 23, i32 23, i32 27, i32 27, i32 27, i32 27, i32 31, i32 31, i32 31, i32 31>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15,17,17,19,19,21,21,23,23,25,25,27,27,29,29,31,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15, i32 17, i32 17, i32 19, i32 19, i32 21, i32 21, i32 23, i32 23, i32 25, i32 25, i32 27, i32 27, i32 29, i32 29, i32 31, i32 31>
+ ret <32 x i8> %shuffle
+}
+
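+; These masks repeat the same 16-byte pattern in both halves, so shuffling the
+; low half once and duplicating the result into both lanes is sufficient.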
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $15, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: movl $15, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
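+; The next tests pull bytes from both inputs. When the mask alternates between
+; the two sources at every byte, AVX2 folds the whole shuffle into a single
+; vpblendvb with a constant 255/0 selection mask, while AVX1 has to shuffle
+; and interleave the 128-bit halves before recombining with vinsertf128.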
+define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,8,9,10,11,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,15,14,13,12,11,10,9,8]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u,31,30,29,28,27,26,25,24]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,7,6,5,4,3,2,1,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
+ ret <32 x i8> %shuffle
+}
+
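+; Here the high lane repeats the low lane's pattern with every index offset by
+; 16, so AVX2 can use one in-lane ymm vpshufb and AVX1 can apply a single xmm
+; shuffle mask to both halves.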
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,18,16,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 18, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,30,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 30, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $15, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 31, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
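+; Interleaves that feed each result lane from the matching input lane map
+; straight onto the byte unpacks: a single ymm vpunpcklbw/vpunpckhbw on AVX2,
+; one xmm unpack per half on AVX1. Mixing low and high input halves defeats
+; this on AVX2 and falls back to vpshufb plus vpblendvb.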
+define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+ ret <32 x i8> %shuffle
+}
+
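+; When the two lanes want different byte positions, AVX1 needs a distinct
+; vpshufb mask for each half; AVX2 still handles it with one ymm vpshufb.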
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,17,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 17, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,18,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 18, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,30,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $15, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 31>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,28,28,28,28,24,24,24,24,20,20,20,20,16,16,16,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 28, i32 28, i32 28, i32 28, i32 24, i32 24, i32 24, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+ ret <32 x i8> %shuffle
+}
+
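+; Undef mask elements (shown as 'u' in the asm comments) leave the backend
+; free to emit any byte, which is why some AVX1 lowerings below skip shuffling
+; a lane whose only defined element is already in place.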
+define <32 x i8> @shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,u,u,u,u,u,0,0,0,0,0,14,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,u,u,u,u,u,16,16,16,16,16,30,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,14,1,1,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,u,0,u,u,u,u,0,0,0,0,0,0,14,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,14,u,u,0,0,0,0,0,0,0,0,0,0,0,0,16,16,u,16,u,u,u,u,16,16,16,16,16,16,30,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 undef, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,u,u,u,4,u,8,8,8,8,u,u,12,u,28,28,28,28,u,u,u,24,20,20,20,20,16,16,16,16]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 undef, i32 28, i32 28, i32 28, i32 28, i32 undef, i32 undef, i32 undef, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,8,8,9,9,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,u,u,u,u,u,u,u,u,16,16,16,u,u,u,u,u,u,u,24,24,24,24,24,24]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,3,u,3,u,u,u,u,u,u,u],zero,xmm3[u,u,u,u]
+; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1],zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
+; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2],zero,xmm5[4,5,6,7,8,9,10],zero,xmm5[12,13,14,15]
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[u,u,u,u,1,6,13,u,u],zero,xmm3[u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
+; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,3],zero,zero,zero,zero,xmm0[8,9,10],zero,zero,xmm0[13],zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
+; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[4,5,6,7],zero,zero,zero,xmm1[11,12],zero,xmm1[14,15]
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,255,255,0,255,u,u,u,255,255,u,0,0,u,u,255,u,255,255,0,0,255,0,255,u,0,0,0,0>
+; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39>
+ ret <32 x i8> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
new file mode 100644
index 0000000..0bd1bd9
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -0,0 +1,748 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
+define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_0000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0001:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_0001:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0020:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_0020:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0300:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_0300:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_1000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_1000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_2200:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_2200:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_3330:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_3330:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_3210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_3210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0023:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0022:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1032:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1133:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1023:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1022:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0423:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_0423:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0462:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0426:
+; ALL: # BB#0:
+; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1537(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1537:
+; ALL: # BB#0:
+; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_4062(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_4062:
+; ALL: # BB#0:
+; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_5173(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_5173:
+; ALL: # BB#0:
+; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_5163:
+; ALL: # BB#0:
+; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0527:
+; ALL: # BB#0:
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_4163:
+; ALL: # BB#0:
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ ret <4 x double> %shuffle
+}
+
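+; Masks that select whole 128-bit halves need no per-element shuffle: 0145
+; and 4501 become a single vinsertf128, and 0167 a single vblendpd.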
+define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0145:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_4501:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0167:
+; ALL: # BB#0:
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x double> %shuffle
+}
+
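+; <4 x i64> shuffles: AVX1 has no 256-bit integer permutes, so these are
+; lowered in the floating-point domain (vpermilpd/vunpcklpd/vinsertf128),
+; while AVX2 can use vpermq, vpbroadcastq and vpblendd directly.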
+define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0001:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0001:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0020:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0020:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0112:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0112:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
+ ret <4 x i64> %shuffle
+}
+
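+; Once a single-source mask crosses the 128-bit lane boundary, AVX1 must
+; first swap the lanes with vperm2f128 and then permute/blend in-lane; AVX2
+; handles the whole mask with one vpermq.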
+define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0300:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0300:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_1000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_2200:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_2200:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3330:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_3330:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_3210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0124:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0124:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0142:
+; AVX1: # BB#0:
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0142:
+; AVX2: # BB#0:
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0412:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0412:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_4012:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_4012:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,2]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_0145:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0451:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0451:
+; AVX2: # BB#0:
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_4501:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_4015:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_4015:
+; AVX2: # BB#0:
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_2u35:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_2u35:
+; AVX2: # BB#0:
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1251:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_1251:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: stress_test1:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm0[1,0,3,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: stress_test1:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm1[3,1,1,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,1,3]
+; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-NEXT: retq
+ %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+ %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> <i32 3, i32 undef, i32 2, i32 undef>
+ %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 undef>
+ %f = shufflevector <4 x i64> %d, <4 x i64> %e, <4 x i32> <i32 5, i32 1, i32 1, i32 0>
+
+ ret <4 x i64> %f
+}
+
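+; Inserting a scalar into element 0 and zeroing the rest is built from a
+; vmovq into xmm0 followed by a blend with a zeroed register.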
+define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
+; AVX1-LABEL: insert_reg_and_zero_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_reg_and_zero_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT: retq
+ %v = insertelement <4 x i64> undef, i64 %a, i64 0
+ %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
+; AVX1-LABEL: insert_mem_and_zero_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq (%rdi), %xmm0
+; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_mem_and_zero_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT: retq
+ %a = load i64* %ptr
+ %v = insertelement <4 x i64> undef, i64 %a, i64 0
+ %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
+; ALL-LABEL: insert_reg_and_zero_v4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; ALL-NEXT: retq
+ %v = insertelement <4 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
+; ALL-LABEL: insert_mem_and_zero_v4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsd (%rdi), %xmm0
+; ALL-NEXT: retq
+ %a = load double* %ptr
+ %v = insertelement <4 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x double> %shuffle
+}
+
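+; A splat of a scalar loaded from memory folds into a single vbroadcastsd
+; for f64 on both targets; the v4i64 splat below is assembled from
+; vmovddup + vinsertf128 on AVX1 and only becomes a broadcast on AVX2.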
+define <4 x double> @splat_mem_v4f64(double* %ptr) {
+; ALL-LABEL: splat_mem_v4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT: retq
+ %a = load double* %ptr
+ %v = insertelement <4 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x double> %shuffle
+}
+
+define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
+; AVX1-LABEL: splat_mem_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup (%rdi), %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat_mem_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT: retq
+ %a = load i64* %ptr
+ %v = insertelement <4 x i64> undef, i64 %a, i64 0
+ %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x double> @splat_mem_v4f64_2(double* %p) {
+; ALL-LABEL: splat_mem_v4f64_2:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT: retq
+ %1 = load double* %p
+ %2 = insertelement <2 x double> undef, double %1, i32 0
+ %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %3
+}
+
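+; Splatting from a register: AVX1 duplicates within the xmm and widens with
+; vinsertf128; AVX2 has a register form of vbroadcastsd.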
+define <4 x double> @splat_v4f64(<2 x double> %r) {
+; AVX1-LABEL: splat_v4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat_v4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: retq
+ %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
new file mode 100644
index 0000000..ded8232
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -0,0 +1,1931 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
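+; <8 x float> single-source splats and in-lane permutes: AVX1 shuffles the
+; low xmm and widens it with vinsertf128; AVX2 uses vbroadcastss or a
+; vpermps with a constant index vector.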
+define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000010:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000010:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000200:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000200:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00003000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00003000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+ ret <8 x float> %shuffle
+}
+
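+; From here the masks pull elements across the 128-bit boundary: AVX1 needs
+; a vperm2f128 lane swap plus in-lane vpermilps and a blend, whereas AVX2
+; generally folds the whole permute into one vpermps.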
+define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00040000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00040000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00500000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00500000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_06000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_06000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_70000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_70000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: movl $7, %eax
+; AVX2-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vinserti128 $0, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x float> %shuffle
+}
+
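+; A mask that moves aligned pairs of floats can reuse the f64 lowering,
+; hence the vpermilpd here.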
+define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01014545:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00112233:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00112233:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00001111:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00001111:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x float> %shuffle
+}
+
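+; Two-source masks that pick alternating elements in place reduce to a
+; single vblendps with an immediate.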
+define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_81a3c5e7:
+; ALL: # BB#0:
+; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08080808:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_08080808:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss %xmm1, %ymm1
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_08084c4c:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_8823cc67:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_9832dc76:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_9810dc54:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+ ret <8 x float> %shuffle
+}
+
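+; 08194c5d and 2a3b6e7f match the per-lane interleave pattern of
+; vunpcklps/vunpckhps exactly; 08192a3b crosses lanes, so AVX1 assembles it
+; from xmm unpacks and AVX2 from two vpermps plus a blend.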
+define <8 x float> @shuffle_v8f32_08194c5d(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_08194c5d:
+; ALL: # BB#0:
+; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_2a3b6e7f:
+; ALL: # BB#0:
+; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08192a3b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_08192a3b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08991abb:
+; AVX1: # BB#0:
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_08991abb:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_091b2d3f:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_091b2d3f:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_09ab1def:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_09ab1def:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+ ret <8 x float> %shuffle
+}
+
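+; The masks below repeat the same 4-element pattern in both lanes (with 'u'
+; marking undef elements), so each lowers to one in-lane vpermilps
+; immediate.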
+define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00014445:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00204464:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_03004744:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10005444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_22006644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_33307774:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_32107654:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00234467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00224466:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10325476:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_11335577:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10235467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10225466:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00015444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00204644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_03004474:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10004444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_22006446:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_33307474:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_32104567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00236744:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00226644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10324567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_11334567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01235467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01235466:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_002u6u44:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00uu66uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_103245uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1133uu67:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_0uu354uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_uuu3uu66:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+ ret <8 x float> %shuffle
+}
+
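+; Fully general two-source, cross-lane shuffles: AVX2 permutes each source
+; with vpermps and blends the results; AVX1 needs longer
+; vperm2f128/vshufps sequences.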
+define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_c348cda0:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[0,0],ymm0[4,7],ymm2[4,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_c348cda0:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,3,4,u,u,u,u,0>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_f511235a:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,1,4,5,5,5]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_f511235a:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,5,1,1,2,3,5,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_32103210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_32103210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_76547654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_76547654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_76543210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_76543210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_3210ba98:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_3210ba98:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_3210fedc:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_3210fedc:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_7654fedc:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_7654fedc:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_fedc7654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_fedc7654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_ba987654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_ba987654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_ba983210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_ba983210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x float> %shuffle
+}
+
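+; <8 x i32> shuffles mirror the <8 x float> tests, but AVX2 stays in the
+; integer domain (vmovdqa/vpermd/vpshufd) where AVX1, lacking 256-bit
+; integer shuffles, still uses the float-domain instructions.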
+define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000010:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000010:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000200:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000200:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00003000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00003000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00040000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00040000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00500000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00500000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_06000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_06000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_70000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_70000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: movl $7, %eax
+; AVX2-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vinserti128 $0, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01014545:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_01014545:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00112233:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00112233:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00001111:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00001111:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_81a3c5e7(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_81a3c5e7:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_81a3c5e7:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08080808:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_08080808:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08084c4c:
+; AVX1: # BB#0:
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_08084c4c:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,0,4,4,6,4]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_8823cc67(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_8823cc67:
+; AVX1: # BB#0:
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_8823cc67:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_9832dc76(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_9832dc76:
+; AVX1: # BB#0:
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_9832dc76:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_9810dc54(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_9810dc54:
+; AVX1: # BB#0:
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_9810dc54:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,0,4,5,5,4]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08194c5d(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08194c5d:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_08194c5d:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_2a3b6e7f(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_2a3b6e7f:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_2a3b6e7f:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08192a3b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_08192a3b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08991abb:
+; AVX1: # BB#0:
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_08991abb:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_091b2d3f:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_091b2d3f:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_09ab1def:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_09ab1def:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00014445(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00014445:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00014445:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00204464(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00204464:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00204464:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_03004744(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_03004744:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_03004744:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10005444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10005444:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10005444:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_22006644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_22006644:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_22006644:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_33307774(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_33307774:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_33307774:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32107654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32107654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_32107654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00234467:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00234467:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00224466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00224466:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00224466:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10325476:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10325476:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_11335577(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_11335577:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_11335577:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10235467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10235467:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10235467:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10225466:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10225466:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00015444:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00015444:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00204644:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00204644:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_03004474:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_03004474:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10004444:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10004444:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_22006446:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_22006446:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_33307474:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_33307474:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32104567:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_32104567:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00236744:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00236744:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00226644:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00226644:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10324567:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10324567:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_11334567:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_11334567:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01235467:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_01235467:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01235466:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_01235466:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_002u6u44:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_002u6u44:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00uu66uu:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00uu66uu:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_103245uu:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_103245uu:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_1133uu67:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_1133uu67:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_0uu354uu:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_0uu354uu:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_uuu3uu66:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_uuu3uu66:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_6caa87e5:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,2],ymm2[4,4],ymm1[6,6]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_6caa87e5:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,4,2,2,0,u,6,u>
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32103210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_32103210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_76547654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_76547654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_76543210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_76543210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_3210ba98:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_3210ba98:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_3210fedc:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_3210fedc:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_7654fedc:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_7654fedc:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_fedc7654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_fedc7654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_ba987654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_ba987654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_ba983210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_ba983210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x float> @splat_mem_v8f32_2(float* %p) {
+; ALL-LABEL: splat_mem_v8f32_2:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastss (%rdi), %ymm0
+; ALL-NEXT: retq
+ %1 = load float* %p
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %3 = shufflevector <4 x float> %2, <4 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %3
+}
+
+define <8 x float> @splat_v8f32(<4 x float> %r) {
+; AVX1-LABEL: splat_v8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat_v8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
new file mode 100644
index 0000000..8f87c7c
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -0,0 +1,1429 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+
+target triple = "x86_64-unknown-unknown"
+
+define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000000:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000010:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000200:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00003000:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00040000:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00500000:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_06000000:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_70000000:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01014545:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00112233:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00001111:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_81a3c5e7:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08080808:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08084c4c:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_8823cc67:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_9832dc76:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_9810dc54:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08194c5d:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_2a3b6e7f:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08192a3b:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08991abb:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm0[1,0,2,2]
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,2,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_091b2d3f:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_09ab1def:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00014445:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00204464:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_03004744:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10005444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_22006644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_33307774:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_32107654:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00234467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00224466:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10325476:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_11335577:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10235467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10225466:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00015444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00204644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_03004474:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10004444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_22006446:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_33307474:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_32104567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00236744:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00226644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10324567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_11334567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01235467:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01235466:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_002u6u44:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00uu66uu:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_103245uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_1133uu67:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_0uu354uu:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_uuu3uu66:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_c348cda0:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[0,1],ymm2[0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm4
+; ALL-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_f511235a:
+; ALL: # BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm4 = ymm1[0,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
+ ret <8 x double> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00000000:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastq %xmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00000010:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00000200:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00003000:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00040000:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00500000:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_06000000:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_70000000:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_01014545:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00112233:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00001111:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_81a3c5e7:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08080808:
+; ALL: # BB#0:
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08084c4c:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_8823cc67:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_9832dc76:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_9810dc54:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,0]
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08194c5d:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_2a3b6e7f:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08192a3b:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08991abb:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_091b2d3f:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_09ab1def:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00014445:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00204464:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_03004744:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10005444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_22006644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_33307774:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_32107654:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00234467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00224466:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10325476:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_11335577:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10235467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10225466:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00015444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00204644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_03004474:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10004444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_22006446:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_33307474:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_32104567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00236744:
+; ALL: # BB#0:
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00226644:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10324567:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_11334567:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_01235467:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_01235466:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_002u6u44:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00uu66uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_103245uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_1133uu67:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_0uu354uu:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_uuu3uu66:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_6caa87e5:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1,0,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
+; ALL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
+ ret <8 x i64> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index e60ecb7..22a6749 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1,6 +1,14 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+;
+; Verify that the DAG combiner correctly folds bitwise operations across
+; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
+; basic and always-safe patterns. Also test that the DAG combiner will combine
+; target-specific shuffle instructions where reasonable.
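+;
+; As an illustrative sketch of the folds exercised below (mirroring the
+; combine_bitwise_ops_test1 pattern later in this file), two shuffles with
+; identical masks feeding a bitwise op:
+;   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+;   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+;   %and   = and <4 x i32> %shuf1, %shuf2
+; should be rewritten so the AND happens first and a single shuffle follows,
+; i.e. one 'pand' plus one 'pshufd' in the SSE output checked below.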
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
@@ -8,57 +16,72 @@ declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd1
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: retq
- %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
- %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
+; ALL-LABEL: combine_pshufd1:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: retq
+entry:
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+ %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
ret <4 x i32> %c
}
define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd2
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: retq
- %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+; ALL-LABEL: combine_pshufd2:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: retq
+entry:
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
%b.cast = bitcast <4 x i32> %b to <8 x i16>
%c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
%c.cast = bitcast <8 x i16> %c to <4 x i32>
- %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
+ %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
ret <4 x i32> %d
}
define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd3
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: retq
- %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+; ALL-LABEL: combine_pshufd3:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: retq
+entry:
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
%b.cast = bitcast <4 x i32> %b to <8 x i16>
%c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
%c.cast = bitcast <8 x i16> %c to <4 x i32>
- %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
+ %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
ret <4 x i32> %d
}
define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd4
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: retq
- %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
+; SSE-LABEL: combine_pshufd4:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufd4:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; AVX-NEXT: retq
+entry:
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
%b.cast = bitcast <4 x i32> %b to <8 x i16>
%c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
%c.cast = bitcast <8 x i16> %c to <4 x i32>
- %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
+ %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
ret <4 x i32> %d
}
define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd5
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
- %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
+; SSE-LABEL: combine_pshufd5:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufd5:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX-NEXT: retq
+entry:
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
%b.cast = bitcast <4 x i32> %b to <8 x i16>
%c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
%c.cast = bitcast <8 x i16> %c to <4 x i32>
@@ -67,53 +90,2458 @@ define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
}
define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd6
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd $0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: combine_pshufd6:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufd6:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: retq
+entry:
%b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
%c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
ret <4 x i32> %c
}
define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshuflw1
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: retq
- %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
- %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+; ALL-LABEL: combine_pshuflw1:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: retq
+entry:
+ %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
ret <8 x i16> %c
}
define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshuflw2
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: retq
+; ALL-LABEL: combine_pshuflw2:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: retq
+entry:
%b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
- %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
- %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
ret <8 x i16> %d
}
define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshuflw3
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: combine_pshuflw3:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshuflw3:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; AVX-NEXT: retq
+entry:
%b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
- %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
- %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
ret <8 x i16> %d
}
define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufhw1
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: combine_pshufhw1:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufhw1:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX-NEXT: retq
+entry:
%b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
- %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
- %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
ret <8 x i16> %d
}
+define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test1:
+; SSE: # BB#0:
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test2:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test3:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test3:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test4:
+; SSE: # BB#0:
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test4:
+; AVX: # BB#0:
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test5:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test5:
+; AVX: # BB#0:
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test6:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test6:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+
+
+; Verify that DAGCombiner moves the shuffle after the xor/and/or even when the
+; shuffles are not performing a swizzle operation.
+
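+; A note on the expected lowering (a reading of the checks below, not a
+; normative statement): because both shuffles select the same lanes of %c,
+; performing the bitwise op first leaves those lanes as 'and %c, %c' /
+; 'or %c, %c' (still %c) or 'xor %c, %c' (zero); hence the test3b variants
+; materialize a zero register ('xorps'/'pxor') before blending.
+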
+define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test1b:
+; SSE2: # BB#0:
+; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test1b:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: andps %xmm1, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test1b:
+; SSE41: # BB#0:
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test1b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test1b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test2b:
+; SSE2: # BB#0:
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test2b:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: orps %xmm1, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test2b:
+; SSE41: # BB#0:
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test2b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test2b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test3b:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test3b:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm0
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test3b:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test3b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test3b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test4b:
+; SSE2: # BB#0:
+; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test4b:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: andps %xmm1, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test4b:
+; SSE41: # BB#0:
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test4b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test4b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test5b:
+; SSE2: # BB#0:
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test5b:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: orps %xmm1, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test5b:
+; SSE41: # BB#0:
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test5b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test5b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test6b:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test6b:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm0
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test6b:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test6b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test6b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test1c:
+; SSE: # BB#0:
+; SSE-NEXT: andps %xmm1, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test1c:
+; AVX: # BB#0:
+; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test2c:
+; SSE: # BB#0:
+; SSE-NEXT: orps %xmm1, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test2c:
+; AVX: # BB#0:
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test3c:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm1, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test3c:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test4c:
+; SSE: # BB#0:
+; SSE-NEXT: andps %xmm1, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test4c:
+; AVX: # BB#0:
+; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test5c:
+; SSE: # BB#0:
+; SSE-NEXT: orps %xmm1, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test5c:
+; AVX: # BB#0:
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test6c:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm1, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_bitwise_ops_test6c:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test1:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test2:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test3:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test3:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test4:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_nested_undef_test4:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_nested_undef_test4:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test5:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test5:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test6:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test6:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test7:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test7:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test8:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test8:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test9:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test9:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test10:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test10:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test11:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test11:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test12:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_nested_undef_test12:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_nested_undef_test12:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
+ ret <4 x i32> %2
+}
+
+; The following pair of shuffles is folded into vector %A.
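+; With the inner mask <1,4,2,6>, the outer mask <4,0,2,4> only reads inner
+; lanes that came from %A, and each such lane already sits at its original
+; position, so the chain simplifies to %A (all other lanes are undef). The
+; same reasoning yields %B for the test after this one.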
+define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
+; ALL-LABEL: combine_nested_undef_test13:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
+ ret <4 x i32> %2
+}
+
+; The following pair of shuffles is folded into vector %B.
+define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test14:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test14:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
+ ret <4 x i32> %2
+}
+
+
+; Verify that we don't optimize the following cases. We expect more than one shuffle.
+;
+; FIXME: Many of these already don't make sense, and the rest should stop
+; making sense with the new vector shuffle lowering. Revisit whether testing
+; for them is still worthwhile.
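+;
+; In each case the composed mask needs elements from both inputs in an order
+; that no single SSE/AVX shuffle instruction provides, so a chain of shuffles
+; is still expected (a rough characterization, not a per-test guarantee).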
+
+define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test15:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test15:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[3,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
+; SSE2-LABEL: combine_nested_undef_test16:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_nested_undef_test16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_nested_undef_test16:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_nested_undef_test16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_nested_undef_test16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test17:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test17:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test18:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test18:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test19:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test19:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test20:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test20:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test21:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test21:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+
+; Test that we correctly combine shuffles according to the rule
+; shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
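+;
+; As a rough sketch using the first test below: the inner mask <4,5,2,7>
+; produces <B0,B1,A2,B3>, and the outer mask <1,1,1,3> reads only lanes that
+; came from %B, so the pair folds to the single shuffle
+;   %r = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>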
+
+define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test22:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test22:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test23:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test23:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test24:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test24:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test25:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_nested_undef_test25:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_nested_undef_test25:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test26:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test26:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test27:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_nested_undef_test27:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_nested_undef_test27:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test28:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test28:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
+ ret <4 x i32> %2
+}
+
+define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test1:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test1:
+; SSE41: # BB#0:
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test2:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test2:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test2:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test3:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test3:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test4:
+; SSE: # BB#0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test4:
+; AVX: # BB#0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test5:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test5:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movaps %xmm1, %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test5:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_test5:
+; AVX: # BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x float> %2
+}
+
+define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test6:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test6:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test6:
+; SSE41: # BB#0:
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_test6:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test7:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test7:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test7:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_test7:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test7:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test8:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test8:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test9:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test9:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test10:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test10:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movaps %xmm1, %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test10:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_test10:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test10:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x i32> %2
+}
+
+define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
+; ALL-LABEL: combine_test11:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test12:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test12:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test12:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_test12:
+; AVX: # BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test13:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test13:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test14:
+; SSE: # BB#0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test14:
+; AVX: # BB#0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test15:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test15:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movaps %xmm0, %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test15:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_test15:
+; AVX: # BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ ret <4 x float> %2
+}
+
+define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
+; ALL-LABEL: combine_test16:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test17:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test17:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test17:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_test17:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test17:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test18:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test18:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test19:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test19:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test20:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test20:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movaps %xmm0, %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test20:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_test20:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test20:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ ret <4 x i32> %2
+}
+
+
+; Check some negative cases.
+; FIXME: Do any of these really make sense? Are they redundant with the above tests?
+
+define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test1b:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test1b:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test1b:
+; SSE41: # BB#0:
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_test1b:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test2b:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test2b:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3-NEXT: movapd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test2b:
+; SSE41: # BB#0:
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_test2b:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0,0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test3b:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test3b:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm0[3,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test4b:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test4b:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test4b:
+; SSE41: # BB#0:
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_test4b:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
+ ret <4 x float> %2
+}
+
+
+; Verify that we correctly fold shuffles even when we use illegal vector types.
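+; (<4 x i8> is not a legal x86 vector type, so the loads are widened to
+; 32-bit lanes during legalization, e.g. via pmovzxbd on SSE4.1 and later,
+; and the shuffle combine must still fire on the widened values.)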
+
+define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test1c:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movd (%rsi), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movss %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test1c:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd (%rdi), %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movd (%rsi), %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movss %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test1c:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT: pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_test1c:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovzxbd (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxbd (%rsi), %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test1c:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxbd (%rdi), %xmm0
+; AVX2-NEXT: vpmovzxbd (%rsi), %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: retq
+ %A = load <4 x i8>* %a
+ %B = load <4 x i8>* %b
+ %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+ ret <4 x i8> %2
+}
+
+define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test2c:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movd (%rsi), %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test2c:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd (%rdi), %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movd (%rsi), %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test2c:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbd (%rdi), %xmm0
+; SSE41-NEXT: pmovzxbd (%rsi), %xmm1
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_test2c:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbd (%rdi), %xmm0
+; AVX-NEXT: vpmovzxbd (%rsi), %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %A = load <4 x i8>* %a
+ %B = load <4 x i8>* %b
+ %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
+ %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+ ret <4 x i8> %2
+}
+
+define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test3c:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movd (%rsi), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test3c:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd (%rdi), %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movd (%rsi), %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test3c:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT: pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_test3c:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbd (%rdi), %xmm0
+; AVX-NEXT: vpmovzxbd (%rsi), %xmm1
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: retq
+ %A = load <4 x i8>* %a
+ %B = load <4 x i8>* %b
+ %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+ %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x i8> %2
+}
+
+define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test4c:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd (%rdi), %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movd (%rsi), %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_test4c:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd (%rdi), %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movd (%rsi), %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_test4c:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT: pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_test4c:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovzxbd (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxbd (%rsi), %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test4c:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxbd (%rdi), %xmm0
+; AVX2-NEXT: vpmovzxbd (%rsi), %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT: retq
+ %A = load <4 x i8>* %a
+ %B = load <4 x i8>* %b
+ %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x i8> %2
+}
+
+
+; The following test cases are generated from this C++ code
+;
+;__m128 blend_01(__m128 a, __m128 b)
+;{
+; __m128 s = a;
+; s = _mm_blend_ps( s, b, 1<<0 );
+; s = _mm_blend_ps( s, b, 1<<1 );
+; return s;
+;}
+;
+;__m128 blend_02(__m128 a, __m128 b)
+;{
+; __m128 s = a;
+; s = _mm_blend_ps( s, b, 1<<0 );
+; s = _mm_blend_ps( s, b, 1<<2 );
+; return s;
+;}
+;
+;__m128 blend_123(__m128 a, __m128 b)
+;{
+; __m128 s = a;
+; s = _mm_blend_ps( s, b, 1<<1 );
+; s = _mm_blend_ps( s, b, 1<<2 );
+; s = _mm_blend_ps( s, b, 1<<3 );
+; return s;
+;}
+
+; Ideally, we should collapse the following shuffles into a single one.
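+; A rough sketch of why, using blend_01: single-bit blends compose by ORing
+; their masks, so the IR pair
+;   %s1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
+;   %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; is equivalent to the single blend
+;   %s = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; i.e. _mm_blend_ps(a, b, 0x3).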
+
+define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_blend_01:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_blend_01:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_blend_01:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_blend_01:
+; AVX: # BB#0:
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
+ %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ ret <4 x float> %shuffle6
+}
+
+define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_blend_02:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_blend_02:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movss %xmm1, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_blend_02:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_blend_02:
+; AVX: # BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
+ %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+ ret <4 x float> %shuffle6
+}
+
+define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_blend_123:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_blend_123:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_blend_123:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_blend_123:
+; AVX: # BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
+ %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+ %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x float> %shuffle12
+}
+
+define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test_movhl_1:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test_movhl_1:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test_movhl_2:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test_movhl_2:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test_movhl_3:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test_movhl_3:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
+ ret <4 x i32> %2
+}
+
+
+; Verify that we fold shuffles according to rule:
+; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
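+; For example, in combine_undef_input_test1 below M0 = <4,2,3,1> and
+; M1 = <4,5,1,2>; remapping M1's references to the inner shuffle back onto A
+; gives M2 = <4,5,2,3>, a single blend of the low half of B into A.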
+
+define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test1:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_undef_input_test1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_undef_input_test1:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test2:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test3:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test3:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test4:
+; SSE: # BB#0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test4:
+; AVX: # BB#0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test5:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_undef_input_test5:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_undef_input_test5:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test5:
+; AVX: # BB#0:
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+
+; Verify that we fold shuffles according to rule:
+; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
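+; For example, in combine_undef_input_test6 below M0 = <4,2,3,1> and
+; M1 = <4,5,1,2> compose to the identity mask <0,1,2,3>, so both shuffles
+; fold away and the function reduces to a single return.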
+
+define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test6:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test7:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_undef_input_test7:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_undef_input_test7:
+; SSE41: # BB#0:
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test7:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_undef_input_test8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_undef_input_test8:
+; SSE41: # BB#0:
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test8:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
+; SSE-LABEL: combine_undef_input_test9:
+; SSE: # BB#0:
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test9:
+; AVX: # BB#0:
+; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test10:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test11:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_undef_input_test11:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_undef_input_test11:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test11:
+; AVX: # BB#0:
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+ %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test12:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test12:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+ %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test13:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test13:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+ %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test14:
+; SSE: # BB#0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test14:
+; AVX: # BB#0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+ %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test15:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_undef_input_test15:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_undef_input_test15:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test15:
+; AVX: # BB#0:
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+ %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
+ ret <4 x float> %2
+}
+
+
+; Verify that shuffles are canonicalized according to the rule:
+; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
+;
+; This canonicalization allows us to trigger the following combine rule:
+; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
+;
+; As a result, all the shuffle pairs in each function below should be
+; combined into a single legal shuffle operation.
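+; For example, in combine_undef_input_test16 below the mask <0,1,5,3> on
+; (%a, %1) is rewritten as <4,5,1,7> on (%1, %a), which then composes with
+; %1's mask to the identity, so the function reduces to a single return.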
+
+define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test16:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+ %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test17:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_undef_input_test17:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_undef_input_test17:
+; SSE41: # BB#0:
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test17:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+ %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test18:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_undef_input_test18:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_undef_input_test18:
+; SSE41: # BB#0:
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test18:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+ %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
+; SSE-LABEL: combine_undef_input_test19:
+; SSE: # BB#0:
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_undef_input_test19:
+; AVX: # BB#0:
+; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+ %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test20:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+ %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
+ ret <4 x float> %2
+}
+
+; These tests exercise our ability to combine away unnecessary operations
+; feeding into a shuffle. The AVX cases are the important ones: they use
+; operations that cannot be performed natively on the entire 256-bit vector
+; and are therefore decomposed into multiple smaller operations.
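+; For example, in combine_unneeded_subvector1 the shuffle mask only reads
+; elements 4-7 of the add, so the add over the low 128-bit half is dead; both
+; the SSE and AVX1 lowerings below perform a single 128-bit add of the high
+; half instead of a full 256-bit add.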
+
+define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
+; SSE-LABEL: combine_unneeded_subvector1:
+; SSE: # BB#0:
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_unneeded_subvector1:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_unneeded_subvector1:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i32> %c
+}
+
+define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: combine_unneeded_subvector2:
+; SSE: # BB#0:
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_unneeded_subvector2:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_unneeded_subvector2:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: retq
+ %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
+ ret <8 x i32> %d
+}
+
+define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps1:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_insertps1:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
+; AVX-NEXT: retq
+
+ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+ %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
+ ret <4 x float> %d
+}
+
+define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps2:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_insertps2:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
+; AVX-NEXT: retq
+
+ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
+ ret <4 x float> %d
+}
+
+define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps3:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_insertps3:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT: retq
+
+ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 5>
+ %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 1, i32 5, i32 3>
+ ret <4 x float> %d
+}
+
+define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps4:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_insertps4:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: retq
+
+ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 5>
+ %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 1, i32 6, i32 5>
+ ret <4 x float> %d
+}
diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll
new file mode 100644
index 0000000..226deb0
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-sse1.ll
@@ -0,0 +1,235 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=-sse2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=SSE1
+
+target triple = "x86_64-unknown-unknown"
+
+define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0001:
+; SSE1: # BB#0:
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0020:
+; SSE1: # BB#0:
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0300:
+; SSE1: # BB#0:
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_1000:
+; SSE1: # BB#0:
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_2200:
+; SSE1: # BB#0:
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_3330:
+; SSE1: # BB#0:
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_3210:
+; SSE1: # BB#0:
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0011:
+; SSE1: # BB#0:
+; SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_2233:
+; SSE1: # BB#0:
+; SSE1-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0022:
+; SSE1: # BB#0:
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_1133:
+; SSE1: # BB#0:
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_4zzz:
+; SSE1: # BB#0:
+; SSE1-NEXT: xorps %xmm1, %xmm1
+; SSE1-NEXT: movss %xmm0, %xmm1
+; SSE1-NEXT: movaps %xmm1, %xmm0
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_z4zz:
+; SSE1: # BB#0:
+; SSE1-NEXT: xorps %xmm1, %xmm1
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_zz4z:
+; SSE1: # BB#0:
+; SSE1-NEXT: xorps %xmm1, %xmm1
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE1-NEXT: movaps %xmm1, %xmm0
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_zuu4:
+; SSE1: # BB#0:
+; SSE1-NEXT: xorps %xmm1, %xmm1
+; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE1-NEXT: movaps %xmm1, %xmm0
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_zzz7:
+; SSE1: # BB#0:
+; SSE1-NEXT: xorps %xmm1, %xmm1
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE1-NEXT: movaps %xmm1, %xmm0
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_z6zz:
+; SSE1: # BB#0:
+; SSE1-NEXT: xorps %xmm1, %xmm1
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
+; SSE1-LABEL: insert_reg_and_zero_v4f32:
+; SSE1: # BB#0:
+; SSE1-NEXT: xorps %xmm1, %xmm1
+; SSE1-NEXT: movss %xmm0, %xmm1
+; SSE1-NEXT: movaps %xmm1, %xmm0
+; SSE1-NEXT: retq
+ %v = insertelement <4 x float> undef, float %a, i32 0
+ %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
+; SSE1-LABEL: insert_mem_and_zero_v4f32:
+; SSE1: # BB#0:
+; SSE1-NEXT: movss (%rdi), %xmm0
+; SSE1-NEXT: retq
+ %a = load float* %ptr
+ %v = insertelement <4 x float> undef, float %a, i32 0
+ %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE1-LABEL: insert_mem_lo_v4f32:
+; SSE1: # BB#0:
+; SSE1-NEXT: movq (%rdi), %rax
+; SSE1-NEXT: movl %eax, {{[-0-9]+}}(%rsp)
+; SSE1-NEXT: shrq $32, %rax
+; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; SSE1-NEXT: movss {{[-0-9]+}}(%rsp), %xmm1
+; SSE1-NEXT: movss {{[-0-9]+}}(%rsp), %xmm2
+; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE1-NEXT: xorps %xmm2, %xmm2
+; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1]
+; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSE1-NEXT: movaps %xmm1, %xmm0
+; SSE1-NEXT: retq
+ %a = load <2 x float>* %ptr
+ %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE1-LABEL: insert_mem_hi_v4f32:
+; SSE1: # BB#0:
+; SSE1-NEXT: movq (%rdi), %rax
+; SSE1-NEXT: movl %eax, {{[-0-9]+}}(%rsp)
+; SSE1-NEXT: shrq $32, %rax
+; SSE1-NEXT: movl %eax, {{[-0-9]+}}(%rsp)
+; SSE1-NEXT: movss {{[-0-9]+}}(%rsp), %xmm1
+; SSE1-NEXT: movss {{[-0-9]+}}(%rsp), %xmm2
+; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE1-NEXT: xorps %xmm2, %xmm2
+; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1]
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE1-NEXT: retq
+ %a = load <2 x float>* %ptr
+ %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
+; SSE1-LABEL: shuffle_mem_v4f32_3210:
+; SSE1: # BB#0:
+; SSE1-NEXT: movaps (%rdi), %xmm0
+; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE1-NEXT: retq
+ %a = load <4 x float>* %ptr
+ %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x float> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
new file mode 100644
index 0000000..afd7a24
--- /dev/null
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -0,0 +1,206 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_8i16_to_8i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: zext_8i16_to_8i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: pand %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: zext_8i16_to_8i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovzxwd %xmm0, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE41-NEXT: pand %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: zext_8i16_to_8i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpmovzxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: zext_8i16_to_8i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovzxwd %xmm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %B = zext <8 x i16> %A to <8 x i32>
+ ret <8 x i32>%B
+}
+
+define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_4i32_to_4i64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: zext_4i32_to_4i64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSSE3-NEXT: pand %xmm3, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: zext_4i32_to_4i64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovzxdq %xmm0, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: zext_4i32_to_4i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: zext_4i32_to_4i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovzxdq %xmm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %B = zext <4 x i32> %A to <4 x i64>
+ ret <4 x i64>%B
+}
+
+define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
+; SSE2-LABEL: zext_8i8_to_8i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: zext_8i8_to_8i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: pand %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: zext_8i8_to_8i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovzxwd %xmm0, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE41-NEXT: pand %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: zext_8i8_to_8i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpmovzxwd %xmm0, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: zext_8i8_to_8i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovzxwd %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %t = zext <8 x i8> %z to <8 x i32>
+ ret <8 x i32> %t
+}
+
+; PR17654
+define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
+; SSE2-LABEL: zext_16i8_to_16i16:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: zext_16i8_to_16i16:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT: pand %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: zext_16i8_to_16i16:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovzxbw %xmm0, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT: pand %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: zext_16i8_to_16i16:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmovzxbw %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: zext_16i8_to_16i16:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovzxbw %xmm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %t = zext <16 x i8> %z to <16 x i16>
+ ret <16 x i16> %t
+}
diff --git a/test/CodeGen/X86/vectorcall.ll b/test/CodeGen/X86/vectorcall.ll
new file mode 100644
index 0000000..1e52654
--- /dev/null
+++ b/test/CodeGen/X86/vectorcall.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=i686-pc-win32 -mattr=+sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+; RUN: llc -mtriple=x86_64-pc-win32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+
+; Test integer arguments.
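+; Note: vectorcall mangles symbols as <name>@@<N>, where N is the size of the
+; argument list in bytes, which is why the labels checked below differ between
+; X86 and X64. The \01 prefix on test_int_5 suppresses this mangling.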
+
+define x86_vectorcallcc i32 @test_int_1() {
+ ret i32 0
+}
+
+; CHECK-LABEL: {{^}}test_int_1@@0:
+; CHECK: xorl %eax, %eax
+
+define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) {
+ ret i32 %a
+}
+
+; X86-LABEL: {{^}}test_int_2@@4:
+; X64-LABEL: {{^}}test_int_2@@8:
+; CHECK: movl %ecx, %eax
+
+define x86_vectorcallcc i32 @test_int_3(i64 inreg %a) {
+ %at = trunc i64 %a to i32
+ ret i32 %at
+}
+
+; X86-LABEL: {{^}}test_int_3@@8:
+; X64-LABEL: {{^}}test_int_3@@8:
+; CHECK: movl %ecx, %eax
+
+define x86_vectorcallcc i32 @test_int_4(i32 inreg %a, i32 inreg %b) {
+ %s = add i32 %a, %b
+ ret i32 %s
+}
+
+; X86-LABEL: {{^}}test_int_4@@8:
+; X86: leal (%ecx,%edx), %eax
+
+; X64-LABEL: {{^}}test_int_4@@16:
+; X64: leal (%rcx,%rdx), %eax
+
+define x86_vectorcallcc i32 @"\01test_int_5"(i32, i32) {
+ ret i32 0
+}
+; CHECK-LABEL: {{^}}test_int_5:
+
+define x86_vectorcallcc double @test_fp_1(double %a, double %b) {
+ ret double %b
+}
+; CHECK-LABEL: {{^}}test_fp_1@@16:
+; CHECK: movaps %xmm1, %xmm0
+
+define x86_vectorcallcc double @test_fp_2(
+ double, double, double, double, double, double, double %r) {
+ ret double %r
+}
+; CHECK-LABEL: {{^}}test_fp_2@@56:
+; CHECK: movsd {{[0-9]+\(%[re]sp\)}}, %xmm0
+
+define x86_vectorcallcc {double, double, double, double} @test_fp_3() {
+ ret {double, double, double, double}
+ { double 0.0, double 0.0, double 0.0, double 0.0 }
+}
+; CHECK-LABEL: {{^}}test_fp_3@@0:
+; CHECK: xorps %xmm0
+; CHECK: xorps %xmm1
+; CHECK: xorps %xmm2
+; CHECK: xorps %xmm3
+
+; FIXME: Returning via x87 isn't compatible, but it's hard to structure the
+; tablegen any other way.
+define x86_vectorcallcc {double, double, double, double, double} @test_fp_4() {
+ ret {double, double, double, double, double}
+ { double 0.0, double 0.0, double 0.0, double 0.0, double 0.0 }
+}
+; CHECK-LABEL: {{^}}test_fp_4@@0:
+; CHECK: fldz
+; CHECK: xorps %xmm0
+; CHECK: xorps %xmm1
+; CHECK: xorps %xmm2
+; CHECK: xorps %xmm3
+
+define x86_vectorcallcc <16 x i8> @test_vec_1(<16 x i8> %a, <16 x i8> %b) {
+ ret <16 x i8> %b
+}
+; CHECK-LABEL: {{^}}test_vec_1@@32:
+; CHECK: movaps %xmm1, %xmm0
+
+define x86_vectorcallcc <16 x i8> @test_vec_2(
+ double, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> %r) {
+ ret <16 x i8> %r
+}
+; CHECK-LABEL: {{^}}test_vec_2@@104:
+; CHECK: movaps (%{{[re]}}cx), %xmm0
diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll
new file mode 100644
index 0000000..0c0f4bb
--- /dev/null
+++ b/test/CodeGen/X86/vselect-avx.ll
@@ -0,0 +1,85 @@
+; RUN: llc %s -o - -mattr=+avx | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; For this test we used to optimize the <i1 true, i1 false, i1 false, i1 true>
+; mask into <i32 2147483648, i32 0, i32 0, i32 2147483648> because we thought
+; we would lower that into a blend where only the high bit is relevant.
+; However, since the whole mask is constant, the generic code simplified it
+; incorrectly, because it expected -1 (all bits set) in place of 2147483648.
+;
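+; With the correct boolean mask, %predphi constant-folds to
+; <i16 -3, i16 124, i16 125, i16 14807> and %predphi42 to
+; <i16 -1, i16 0, i16 0, i16 -1>, matching the [65533,124,125,14807] and
+; [65535,0,0,65535] constants expected by the CHECK lines below.
+;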
+; The problem does not occur without AVX, because vselect of v4i32 is
+; neither legal nor custom.
+;
+; <rdar://problem/18675020>
+
+; CHECK-LABEL: test:
+; CHECK: vmovdqa {{.*#+}} xmm0 = [65535,0,0,65535]
+; CHECK: vmovdqa {{.*#+}} xmm2 = [65533,124,125,14807]
+; CHECK: ret
+define void @test(<4 x i16>* %a, <4 x i16>* %b) {
+body:
+ %predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127>
+ %predphi42 = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
+ store <4 x i16> %predphi, <4 x i16>* %a, align 8
+ store <4 x i16> %predphi42, <4 x i16>* %b, align 8
+ ret void
+}
+
+; Improve code coverage.
+;
+; When shrinking the condition used by the select to match a blend, this
+; test case exercises the path where the modified node is not the root
+; of the condition.
+;
+; CHECK-LABEL: test2:
+; CHECK: vpslld $31, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
+; CHECK-NEXT: vpshufd $78, %xmm0, %xmm0 ## xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]]
+; CHECK: vblendvpd [[MASK]]
+; CHECK: retq
+define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
+bb:
+ %arrayidx1928 = getelementptr inbounds double** %call1559, i64 %indvars.iv4198
+ %tmp1888 = load double** %arrayidx1928, align 8
+ %predphi.v.v = select <4 x i1> %tmp1895, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+ %tmp1900 = bitcast double* %tmp1888 to <4 x double>*
+ store <4 x double> %predphi.v.v, <4 x double>* %tmp1900, align 8
+ ret void
+}
+
+; For this test, we used to optimize the conditional mask for the blend, i.e.,
+; we shrank some of its bits.
+; However, this same mask was used in another select (%predphi31) that turned
+; out to be optimized into an and. In that case, the conditional mask was wrong.
+;
+; Make sure that the and is fed by the original mask.
+;
+; <rdar://problem/18819506>
+
+; Note: For now, hard-code the ORIG_MASK and SHRUNK_MASK registers, because we
+; cannot express that SHRUNK_MASK must not be equal to ORIG_MASK. Otherwise,
+; even a faulty pattern would pass!
+;
+; CHECK-LABEL: test3:
+; Compute the original mask.
+; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[ORIG_MASK:%xmm0]]
+; Shrink the bits of the mask.
+; CHECK-NEXT: vpslld $31, [[ORIG_MASK]], [[SHRUNK_MASK:%xmm3]]
+; Use the shrunk mask in the blend.
+; CHECK-NEXT: vblendvps [[SHRUNK_MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; Use the original mask in the and.
+; CHECK-NEXT: vpand LCPI2_2(%rip), [[ORIG_MASK]], {{%xmm[0-9]+}}
+; CHECK: retq
+define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
+ %tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
+ %tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
+ %predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12
+ %predphi31 = select <4 x i1> %tmp7, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
+
+ store <4 x i16> %predphi31, <4 x i16>* %tmp16, align 8
+ store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
index 42cf06a..3bd1dc4 100644
--- a/test/CodeGen/X86/vselect.ll
+++ b/test/CodeGen/X86/vselect.ll
@@ -3,270 +3,253 @@
; Verify that we don't emit packed vector shift instructions if the
; condition used by the vector select is a vector of constants.
-
define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: orps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
}
-; CHECK-LABEL: test1
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
}
-; CHECK-LABEL: test2
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
}
-; CHECK-LABEL: test3
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
define <4 x float> @test4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
}
-; CHECK-LABEL: test4
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: movaps %xmm1, %xmm0
-; CHECK: ret
-
define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
}
-; CHECK-LABEL: test5
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test6:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps {{.*#+}} xmm1 = [0,65535,0,65535,0,65535,0,65535]
+; CHECK-NEXT: andps %xmm0, %xmm1
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: orps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %a
ret <8 x i16> %1
}
-; CHECK-LABEL: test6
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test7:
+; CHECK: # BB#0:
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: orps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
}
-; CHECK-LABEL: test7
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test8:
+; CHECK: # BB#0:
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: orps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
}
-; CHECK-LABEL: test8
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test9:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
}
-; CHECK-LABEL: test9
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: movaps %xmm1, %xmm0
-; CHECK-NEXT: ret
define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test10:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
}
-; CHECK-LABEL: test10
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test11:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps {{.*#+}} xmm2 = <0,65535,65535,0,u,65535,65535,u>
+; CHECK-NEXT: andps %xmm2, %xmm0
+; CHECK-NEXT: andnps %xmm1, %xmm2
+; CHECK-NEXT: orps %xmm2, %xmm0
+; CHECK-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
}
-; CHECK-LABEL: test11
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test12:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 false, i1 undef, i1 false, i1 false, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
}
-; CHECK-LABEL: test12
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test13:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <8 x i1> <i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
}
-; CHECK-LABEL: test13
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
; Fold (vselect (build_vector AllOnes), N1, N2) -> N1
-
define <4 x float> @test14(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test14:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 undef, i1 true, i1 undef>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
}
-; CHECK-LABEL: test14
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: pcmpeq
-; CHECK: ret
define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test15:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 undef, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
}
-; CHECK-LABEL: test15
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: pcmpeq
-; CHECK: ret
; Fold (vselect (build_vector AllZeros), N1, N2) -> N2
-
define <4 x float> @test16(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test16:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 undef, i1 false, i1 undef>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
-}
-; CHECK-LABEL: test16
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: ret
+}
define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test17:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
}
-; CHECK-LABEL: test17
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: ret
define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test18:
+; CHECK: # BB#0:
+; CHECK-NEXT: movss %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
}
-; CHECK-LABEL: test18
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test19:
+; CHECK: # BB#0:
+; CHECK-NEXT: movss %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %1
}
-; CHECK-LABEL: test19
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test20:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
ret <2 x double> %1
}
-; CHECK-LABEL: test20
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test21:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <2 x i1> <i1 false, i1 true>, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %1
}
-; CHECK-LABEL: test21
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test22:
+; CHECK: # BB#0:
+; CHECK-NEXT: movss %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
}
-; CHECK-LABEL: test22
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test23:
+; CHECK: # BB#0:
+; CHECK-NEXT: movss %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %1
}
-; CHECK-LABEL: test23
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test24:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
ret <2 x double> %1
}
-; CHECK-LABEL: test24
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test25:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %1
}
-; CHECK-LABEL: test25
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) {
-; CHECK-LABEL: select_of_shuffles_0
-; CHECK-DAG: movlhps %xmm2, [[REGA:%xmm[0-9]+]]
-; CHECK-DAG: movlhps %xmm3, [[REGB:%xmm[0-9]+]]
-; CHECK: subps [[REGB]], [[REGA]]
+; CHECK-LABEL: select_of_shuffles_0:
+; CHECK: # BB#0:
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-NEXT: subps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = shufflevector <2 x float> %a0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%2 = shufflevector <2 x float> %a1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
%3 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %2, <4 x float> %1
@@ -276,3 +259,24 @@ define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x
%7 = fsub <4 x float> %3, %6
ret <4 x float> %7
}
+
+; PR20677
+define <16 x double> @select_illegal(<16 x double> %a, <16 x double> %b) {
+; CHECK-LABEL: select_illegal:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
+; CHECK-NEXT: movaps %xmm7, 112(%rdi)
+; CHECK-NEXT: movaps %xmm6, 96(%rdi)
+; CHECK-NEXT: movaps %xmm5, 80(%rdi)
+; CHECK-NEXT: movaps %xmm4, 64(%rdi)
+; CHECK-NEXT: movaps %xmm3, 48(%rdi)
+; CHECK-NEXT: movaps %xmm2, 32(%rdi)
+; CHECK-NEXT: movaps %xmm1, 16(%rdi)
+; CHECK-NEXT: movaps %xmm0, (%rdi)
+; CHECK-NEXT: retq
+ %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
+ ret <16 x double> %sel
+}
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index d115929..e0b861f 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -2,12 +2,12 @@
; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
; CHECK: movl
-; CHECK: paddd
+; CHECK: paddw
; CHECK: movlpd
; The scheduler produces a different instruction order
; ATOM: movl
-; ATOM: paddd
+; ATOM: paddw
; ATOM: movlpd
; bitcast a v4i16 to v2i32
diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll
index 9f6778c..3f54ab6 100644
--- a/test/CodeGen/X86/widen_conv-1.ll
+++ b/test/CodeGen/X86/widen_conv-1.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: paddq
+; CHECK: paddd
; truncate v2i64 to v2i32
diff --git a/test/CodeGen/X86/widen_conversions.ll b/test/CodeGen/X86/widen_conversions.ll
index 522ab47..8e5174f 100644
--- a/test/CodeGen/X86/widen_conversions.ll
+++ b/test/CodeGen/X86/widen_conversions.ll
@@ -9,7 +9,7 @@ define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) {
; CHECK: movd (%{{.*}}), %[[X:xmm[0-9]+]]
; CHECK-NEXT: pxor %[[Z:xmm[0-9]+]], %[[Z]]
; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
-; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
+; CHECK-NEXT: punpcklwd %[[Z]], %[[X]]
; CHECK-NEXT: ret
%val = load <4 x i8>* %ptr
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 41bea85..0ec3574 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -4,12 +4,12 @@
;
%i32vec3 = type <3 x i32>
-; CHECK: add3i32
define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
+; CHECK-LABEL: add3i32:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
+; CHECK-NEXT: pextrd $2, %[[R0]], 8(%{{.*}})
+; CHECK-NEXT: movq %[[R0]], (%{{.*}})
%a = load %i32vec3* %ap, align 16
%b = load %i32vec3* %bp, align 16
%x = add %i32vec3 %a, %b
@@ -17,15 +17,15 @@ define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
ret void
}
-; CHECK: add3i32_2
define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
-; CHECK: movq
-; CHECK: pinsrd
-; CHECK: movq
-; CHECK: pinsrd
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
+; CHECK-LABEL: add3i32_2:
+; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R0]]
+; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R1]]
+; CHECK-NEXT: paddd %[[R0]], %[[R1]]
+; CHECK-NEXT: pextrd $2, %[[R1]], 8(%{{.*}})
+; CHECK-NEXT: movq %[[R1]], (%{{.*}})
%a = load %i32vec3* %ap, align 8
%b = load %i32vec3* %bp, align 8
%x = add %i32vec3 %a, %b
@@ -34,15 +34,15 @@ define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
}
%i32vec7 = type <7 x i32>
-; CHECK: add7i32
define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
-; CHECK: movdqa
+; CHECK-LABEL: add7i32:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
+; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
+; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
+; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
+; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
%a = load %i32vec7* %ap, align 16
%b = load %i32vec7* %bp, align 16
%x = add %i32vec7 %a, %b
@@ -50,18 +50,18 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
ret void
}
-; CHECK: add12i32
%i32vec12 = type <12 x i32>
define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
+; CHECK-LABEL: add12i32:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
+; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
+; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
+; CHECK-NEXT: paddd 32(%{{.*}}), %[[R2]]
+; CHECK-NEXT: movdqa %[[R2]], 32(%{{.*}})
+; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
+; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
%a = load %i32vec12* %ap, align 16
%b = load %i32vec12* %bp, align 16
%x = add %i32vec12 %a, %b
@@ -70,11 +70,17 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
}
-; CHECK: add3i16
%i16vec3 = type <3 x i16>
define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
-; CHECK: paddd
-; CHECK: ret
+; CHECK-LABEL: add3i16:
+; CHECK: pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddd %[[R0]], %[[R1]]
+; CHECK-NEXT: movdqa %[[R1]], %[[R0]]
+; CHECK-NEXT: pshufb {{.*}}, %[[R0]]
+; CHECK-NEXT: pmovzxdq %[[R0]], %[[R0]]
+; CHECK-NEXT: pextrw $4, %[[R1]], 4(%{{.*}})
+; CHECK-NEXT: movd %[[R0]], (%{{.*}})
%a = load %i16vec3* %ap, align 16
%b = load %i16vec3* %bp, align 16
%x = add %i16vec3 %a, %b
@@ -82,11 +88,13 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
ret void
}
-; CHECK: add4i16
%i16vec4 = type <4 x i16>
define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
-; CHECK: paddd
-; CHECK: movq
+; CHECK-LABEL: add4i16:
+; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddw %[[R0]], %[[R1]]
+; CHECK-NEXT: movq %[[R1]], (%{{.*}})
%a = load %i16vec4* %ap, align 16
%b = load %i16vec4* %bp, align 16
%x = add %i16vec4 %a, %b
@@ -94,15 +102,15 @@ define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp
ret void
}
-; CHECK: add12i16
%i16vec12 = type <12 x i16>
define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: movq
-; CHECK: movdqa
+; CHECK-LABEL: add12i16:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
+; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
+; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
+; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
%a = load %i16vec12* %ap, align 16
%b = load %i16vec12* %bp, align 16
%x = add %i16vec12 %a, %b
@@ -110,18 +118,18 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12*
ret void
}
-; CHECK: add18i16
%i16vec18 = type <18 x i16>
define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: movd
-; CHECK: movdqa
-; CHECK: movdqa
+; CHECK-LABEL: add18i16:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
+; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
+; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
+; CHECK-NEXT: paddw 32(%{{.*}}), %[[R2]]
+; CHECK-NEXT: movd %[[R2]], 32(%{{.*}})
+; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
+; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
%a = load %i16vec18* %ap, align 16
%b = load %i16vec18* %bp, align 16
%x = add %i16vec18 %a, %b
@@ -130,11 +138,18 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18*
}
-; CHECK: add3i8
%i8vec3 = type <3 x i8>
define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
-; CHECK: paddd
-; CHECK: ret
+; CHECK-LABEL: add3i8:
+; CHECK: pmovzxbd (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: pmovzxbd (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddd %[[R0]], %[[R1]]
+; CHECK-NEXT: movdqa %[[R1]], %[[R0]]
+; CHECK-NEXT: pshufb {{.*}}, %[[R0]]
+; CHECK-NEXT: pmovzxwq %[[R0]], %[[R0]]
+; CHECK-NEXT: pextrb $8, %[[R1]], 2(%{{.*}})
+; CHECK-NEXT: movd %[[R0]], %e[[R2:[abcd]]]x
+; CHECK-NEXT: movw %[[R2]]x, (%{{.*}})
%a = load %i8vec3* %ap, align 16
%b = load %i8vec3* %bp, align 16
%x = add %i8vec3 %a, %b
@@ -142,17 +157,18 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no
ret void
}
-; CHECK-LABEL: add31i8:
%i8vec31 = type <31 x i8>
define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddb
-; CHECK: paddb
-; CHECK: pextrb
-; CHECK: pextrw
-; CHECK: movq
-; CHECK: ret
+; CHECK-LABEL: add31i8:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddb (%{{.*}}), %[[R0]]
+; CHECK-NEXT: paddb 16(%{{.*}}), %[[R1]]
+; CHECK-NEXT: pextrb $14, %[[R1]], 30(%{{.*}})
+; CHECK-NEXT: pextrw $6, %[[R1]], 28(%{{.*}})
+; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
+; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
+; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
%a = load %i8vec31* %ap, align 16
%b = load %i8vec31* %bp, align 16
%x = add %i8vec31 %a, %b
@@ -161,14 +177,43 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
}
-; CHECK: rot
%i8vec3pack = type { <3 x i8>, i8 }
-define %i8vec3pack @rot() nounwind {
-; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
+define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
+; CHECK-LABEL: rot:
+; CHECK: movdqa {{.*}}, %[[CONSTANT0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]]
+; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT0]]
+; CHECK-NEXT: pmovzxwq %[[CONSTANT0]], %[[CONSTANT0]]
+; CHECK-NEXT: movd %[[CONSTANT0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: movw %[[R0]]x, (%[[PTR0:.*]])
+; CHECK-NEXT: movb $-98, 2(%[[PTR0]])
+; CHECK-NEXT: movdqa {{.*}}, %[[CONSTANT1:xmm[0-9]+]]
+; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT1]]
+; CHECK-NEXT: pmovzxwq %[[CONSTANT1]], %[[CONSTANT1]]
+; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x
+; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]])
+; CHECK-NEXT: movb $1, 2(%[[PTR1]])
+; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
+; CHECK-NEXT: pand {{.*}}, %[[X0]]
+; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: shrl %e[[R0]]x
+; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x
+; CHECK-NEXT: shrl %e[[R1]]x
+; CHECK-NEXT: movd %e[[R1]]x, %[[X1:xmm[0-9]+]]
+; CHECK-NEXT: pinsrd $1, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT: pextrd $2, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: shrl %e[[R0]]x
+; CHECK-NEXT: pinsrd $2, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT: pextrd $3, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: pinsrd $3, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT: movdqa %[[X1]], %[[X2:xmm[0-9]+]]
+; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[X2]]
+; CHECK-NEXT: pmovzxwq %[[X2]], %[[X3:xmm[0-9]+]]
+; CHECK-NEXT: pextrb $8, %[[X1]], 2(%{{.*}})
+; CHECK-NEXT: movd %[[X3]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: movw %[[R0]]x, (%{{.*}})
+
entry:
- %X = alloca %i8vec3pack, align 4
- %rot = alloca %i8vec3pack, align 4
- %result = alloca %i8vec3pack, align 4
%storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
%storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
@@ -180,7 +225,6 @@ entry:
%shr = lshr <3 x i8> %extractVec, %extractVec3
%storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
store <3 x i8> %shr, <3 x i8>* %storetmp4
- %tmp5 = load %i8vec3pack* %result
- ret %i8vec3pack %tmp5
+ ret void
}
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index a355b75..70fdbb7 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -1,43 +1,56 @@
; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
+target triple = "x86_64-unknown-unknown"
+
; widening a v3float shuffle and then an add
define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
-entry:
; CHECK-LABEL: shuf:
-; CHECK: extractps
-; CHECK: extractps
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: addps %xmm1, %xmm0
+; CHECK-NEXT: extractps $2, %xmm0, 8(%eax)
+; CHECK-NEXT: extractps $1, %xmm0, 4(%eax)
+; CHECK-NEXT: movss %xmm0, (%eax)
+; CHECK-NEXT: retl
+entry:
%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 1, i32 2>
%val = fadd <3 x float> %x, %src2
store <3 x float> %val, <3 x float>* %dst.addr
ret void
-; CHECK: ret
}
; widening a v3float shuffle with a different mask and then an add
define void @shuf2(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
-entry:
; CHECK-LABEL: shuf2:
-; CHECK: extractps
-; CHECK: extractps
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; CHECK-NEXT: addps %xmm1, %xmm0
+; CHECK-NEXT: extractps $2, %xmm0, 8(%eax)
+; CHECK-NEXT: extractps $1, %xmm0, 4(%eax)
+; CHECK-NEXT: movss %xmm0, (%eax)
+; CHECK-NEXT: retl
+entry:
%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 4, i32 2>
%val = fadd <3 x float> %x, %src2
store <3 x float> %val, <3 x float>* %dst.addr
ret void
-; CHECK: ret
}
; Example of when widening a v3float operation causes the DAG to replace a node
; with the operation that we are currently widening; i.e., when replacing
; opA with opB, the DAG will produce new operations that still use opA.
define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
-entry:
; CHECK-LABEL: shuf3:
-; CHECK-NOT: movlhps
-; CHECK-NOT: shufps
-; CHECK: pshufd
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; CHECK-NEXT: movaps %xmm1, (%eax)
+; CHECK-NEXT: retl
+entry:
%shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
- %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+ %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
%tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp3.i13 = shufflevector <4 x float> %tmp1.i.i, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> ; <<3 x float>>
%tmp6.i14 = shufflevector <3 x float> %tmp3.i13, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -45,27 +58,35 @@ entry:
%tmp2.i18 = shufflevector <3 x float> %tmp97.i, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
%t5 = bitcast <4 x float> %tmp2.i18 to <4 x i32>
%shr.i.i19 = lshr <4 x i32> %t5, <i32 19, i32 19, i32 19, i32 19>
- %and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080>
+ %and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080>
%shuffle.i.i.i21 = shufflevector <4 x float> %tmp2.i18, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
store <4 x float> %shuffle.i.i.i21, <4 x float>* %dst
ret void
-; CHECK: ret
}
; PR10421: make sure we correctly handle extreme widening with CONCAT_VECTORS
define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {
; CHECK-LABEL: shuf4:
-; CHECK-NOT: punpckldq
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: pshufb %xmm2, %xmm1
+; CHECK-NEXT: pshufb %xmm2, %xmm0
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: retl
%vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %vshuf
-; CHECK: ret
}
; PR11389: another CONCAT_VECTORS case
define void @shuf5(<8 x i8>* %p) nounwind {
; CHECK-LABEL: shuf5:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <4,33,u,u,u,u,u,u>
+; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: movlpd %xmm0, (%eax)
+; CHECK-NEXT: retl
%v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <8 x i8> %v, <8 x i8>* %p, align 8
ret void
-; CHECK: ret
}
diff --git a/test/CodeGen/X86/win32-pic-jumptable.ll b/test/CodeGen/X86/win32-pic-jumptable.ll
new file mode 100644
index 0000000..cabd36a
--- /dev/null
+++ b/test/CodeGen/X86/win32-pic-jumptable.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -relocation-model=pic | FileCheck %s
+
+; CHECK: calll L0$pb
+; CHECK-NEXT: L0$pb:
+; CHECK-NEXT: popl %eax
+; CHECK-NEXT: addl LJTI0_0(,%ecx,4), %eax
+; CHECK-NEXT: jmpl *%eax
+
+; CHECK: LJTI0_0:
+; CHECK-NEXT: .long LBB0_4-L0$pb
+; CHECK-NEXT: .long LBB0_5-L0$pb
+; CHECK-NEXT: .long LBB0_6-L0$pb
+; CHECK-NEXT: .long LBB0_7-L0$pb
+
+
+target triple = "i686--windows-itanium"
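+
+; A rough C analogue (illustrative sketch only):
+;
+; int f (long long x)
+; {
+;   switch (x) {
+;   case 1: return 1;
+;   case 2: return 2;
+;   case 3: return 3;
+;   case 4: return 4;
+;   default: return 0;
+;   }
+; }
+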
+define i32 @f(i64 %x) {
+bb0:
+ switch i64 %x, label %bb5 [
+ i64 1, label %bb1
+ i64 2, label %bb2
+ i64 3, label %bb3
+ i64 4, label %bb4
+ ]
+bb1:
+ br label %bb5
+bb2:
+ br label %bb5
+bb3:
+ br label %bb5
+bb4:
+ br label %bb5
+bb5:
+ %y = phi i32 [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ]
+ ret i32 %y
+}
diff --git a/test/CodeGen/X86/win64_call_epi.ll b/test/CodeGen/X86/win64_call_epi.ll
new file mode 100644
index 0000000..bc73ad4
--- /dev/null
+++ b/test/CodeGen/X86/win64_call_epi.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=WIN64
+
+declare void @bar()
+declare void @baz()
+declare i32 @personality(...)
+
+; Check for 'nop' between the last call and the epilogue.
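+; (The Win64 unwinder examines the code at the return address to decide
+;  whether it lies inside an epilogue; a nop after the final call keeps the
+;  return address out of the epilogue so the frame unwinds correctly.)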
+define void @foo1() {
+
+ invoke void @bar()
+ to label %normal
+ unwind label %catch
+
+normal:
+ ret void
+
+catch:
+ %1 = landingpad { i8*, i32 } personality i32 (...)* @personality cleanup
+ resume { i8*, i32 } %1
+}
+; WIN64-LABEL: foo1:
+; WIN64: .seh_proc foo1
+; WIN64: callq bar
+; WIN64: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq
+; Check for 'ud2' after a noreturn call.
+; WIN64: callq _Unwind_Resume
+; WIN64-NEXT: ud2
+; WIN64: .seh_endproc
+
+
+; Check it still works when blocks are reordered.
+@something = global i32 0
+define void @foo2(i1 zeroext %cond ) {
+ br i1 %cond, label %a, label %b, !prof !0
+a:
+ call void @bar()
+ br label %done
+b:
+ call void @baz()
+ store i32 0, i32* @something
+ br label %done
+done:
+ ret void
+}
+!0 = metadata !{metadata !"branch_weights", i32 100, i32 0}
+; WIN64-LABEL: foo2:
+; WIN64: callq bar
+; WIN64: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq
+
+
+; Check that no nop is emitted when the call is not adjacent to the epilogue.
+define i32 @foo3() {
+ call void @bar()
+ ret i32 0
+}
+; WIN64-LABEL: foo3:
+; WIN64: callq bar
+; WIN64: xorl
+; WIN64-NOT: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq
diff --git a/test/CodeGen/X86/win64_vararg.ll b/test/CodeGen/X86/win64_vararg.ll
index 1a51b2a..8d7f201 100644
--- a/test/CodeGen/X86/win64_vararg.ll
+++ b/test/CodeGen/X86/win64_vararg.ll
@@ -111,3 +111,22 @@ entry:
%tmp = va_arg i8** %ap, i32
ret i32 %tmp
}
+
+define void @sret_arg(i32* sret %agg.result, i8* nocapture readnone %format, ...) {
+entry:
+ %ap = alloca i8*
+ %ap_i8 = bitcast i8** %ap to i8*
+ call void @llvm.va_start(i8* %ap_i8)
+ %tmp = va_arg i8** %ap, i32
+ store i32 %tmp, i32* %agg.result
+ ret void
+}
+; CHECK-LABEL: sret_arg:
+; CHECK: pushq
+; CHECK-DAG: movq %r9, 40(%rsp)
+; CHECK-DAG: movq %r8, 32(%rsp)
+; CHECK: movl 32(%rsp), %[[tmp:[^ ]*]]
+; CHECK: movl %[[tmp]], (%[[sret:[^ ]*]])
+; CHECK: movq %[[sret]], %rax
+; CHECK: popq
+; CHECK: retq
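+; (The x86-64 calling conventions return the sret pointer in %rax, which is
+;  why the sret register is copied into %rax before the epilogue above.)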
diff --git a/test/CodeGen/X86/win_cst_pool.ll b/test/CodeGen/X86/win_cst_pool.ll
new file mode 100644
index 0000000..e8b853a
--- /dev/null
+++ b/test/CodeGen/X86/win_cst_pool.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
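+; Constant-pool entries on MSVC targets are emitted as named, discardable
+; COMDAT globals (e.g. __real@<bits>, __xmm@<bits>) so the linker can merge
+; identical constants across object files, as the sections below show.
+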
+define double @double() {
+ ret double 0x0000000000800000
+}
+; CHECK: .globl __real@0000000000800000
+; CHECK-NEXT: .section .rdata,"rd",discard,__real@0000000000800000
+; CHECK-NEXT: .align 8
+; CHECK-NEXT: __real@0000000000800000:
+; CHECK-NEXT: .quad 8388608
+; CHECK: double:
+; CHECK: movsd __real@0000000000800000(%rip), %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @vec1() {
+ ret <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+}
+; CHECK: .globl __xmm@00000000000000010000000200000003
+; CHECK-NEXT: .section .rdata,"rd",discard,__xmm@00000000000000010000000200000003
+; CHECK-NEXT: .align 16
+; CHECK-NEXT: __xmm@00000000000000010000000200000003:
+; CHECK-NEXT: .long 3
+; CHECK-NEXT: .long 2
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 0
+; CHECK: vec1:
+; CHECK: movaps __xmm@00000000000000010000000200000003(%rip), %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @vec2() {
+ ret <8 x i16> <i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>
+}
+; CHECK: .globl __xmm@00000001000200030004000500060007
+; CHECK-NEXT: .section .rdata,"rd",discard,__xmm@00000001000200030004000500060007
+; CHECK-NEXT: .align 16
+; CHECK-NEXT: __xmm@00000001000200030004000500060007:
+; CHECK-NEXT: .short 7
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .short 5
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 3
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 0
+; CHECK: vec2:
+; CHECK: movaps __xmm@00000001000200030004000500060007(%rip), %xmm0
+; CHECK-NEXT: ret
+
+
+define <4 x float> @undef1() {
+ ret <4 x float> <float 1.0, float 1.0, float undef, float undef>
+
+; CHECK: .globl __xmm@00000000000000003f8000003f800000
+; CHECK-NEXT: .section .rdata,"rd",discard,__xmm@00000000000000003f8000003f800000
+; CHECK-NEXT: .align 16
+; CHECK-NEXT: __xmm@00000000000000003f8000003f800000:
+; CHECK-NEXT: .long 1065353216 # float 1
+; CHECK-NEXT: .long 1065353216 # float 1
+; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .zero 4
+; CHECK: undef1:
+; CHECK: movaps __xmm@00000000000000003f8000003f800000(%rip), %xmm0
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/windows-itanium-alloca.ll b/test/CodeGen/X86/windows-itanium-alloca.ll
new file mode 100644
index 0000000..0a06cde
--- /dev/null
+++ b/test/CodeGen/X86/windows-itanium-alloca.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple i686-windows-itanium -filetype asm -o - %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686--windows-itanium"
+
+declare void @external(i8*)
+
+define dllexport void @alloca(i32 %sz) {
+entry:
+ %vla = alloca i8, i32 %sz, align 1
+ call void @external(i8* %vla)
+ ret void
+}
+
+; CHECK: __chkstk
+
diff --git a/test/CodeGen/X86/x32-function_pointer-1.ll b/test/CodeGen/X86/x32-function_pointer-1.ll
new file mode 100644
index 0000000..2baf92a
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-1.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test for x32 function pointer tail call
+
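+; A rough C analogue (illustrative sketch only):
+;
+; extern void (*foo1) (void *);
+; extern void (*foo2) (void *);
+;
+; void bar (void *h)
+; {
+;   foo1 (h);
+;   foo2 (h);
+; }
+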
+@foo1 = external global void (i8*)*
+@foo2 = external global void (i8*)*
+
+define void @bar(i8* %h) nounwind uwtable {
+entry:
+ %0 = load void (i8*)** @foo1, align 4
+; CHECK: movl foo1(%rip), %e{{[^,]*}}
+ tail call void %0(i8* %h) nounwind
+; CHECK: callq *%r{{[^,]*}}
+ %1 = load void (i8*)** @foo2, align 4
+; CHECK: movl foo2(%rip), %e{{[^,]*}}
+ tail call void %1(i8* %h) nounwind
+; CHECK: jmpq *%r{{[^,]*}}
+ ret void
+}
diff --git a/test/CodeGen/X86/x32-function_pointer-2.ll b/test/CodeGen/X86/x32-function_pointer-2.ll
new file mode 100644
index 0000000..f727d41
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-2.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test calling a function pointer passed as a function argument
+;
+; void bar (void * h, void (*foo) (void *))
+; {
+; foo (h);
+; foo (h);
+; }
+
+
+define void @bar(i8* %h, void (i8*)* nocapture %foo) nounwind {
+entry:
+ tail call void %foo(i8* %h) nounwind
+; CHECK: mov{{l|q}} %{{e|r}}si, %{{e|r}}[[REG:.*]]{{d?}}
+; CHECK: callq *%r[[REG]]
+ tail call void %foo(i8* %h) nounwind
+; CHECK: jmpq *%r{{[^,]*}}
+ ret void
+}
diff --git a/test/CodeGen/X86/x32-function_pointer-3.ll b/test/CodeGen/X86/x32-function_pointer-3.ll
new file mode 100644
index 0000000..5eaf85d
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-3.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test calling a function pointer passed in a struct
+
+; The function argument `h' in
+
+; struct foo {
+; void (*f) (void);
+; int i;
+; };
+; void
+; bar (struct foo h)
+; {
+; h.f ();
+; }
+
+; is passed in the 64-bit %rdi register. The `f' field is in the lower 32
+; bits of the %rdi register and the `i' field is in its upper 32 bits. We
+; need to zero-extend %edi to %rdi before branching via %rdi.
+
+define void @bar(i64 %h.coerce) nounwind {
+entry:
+ %h.sroa.0.0.extract.trunc = trunc i64 %h.coerce to i32
+ %0 = inttoptr i32 %h.sroa.0.0.extract.trunc to void ()*
+; CHECK: movl %edi, %e[[REG:.*]]
+ tail call void %0() nounwind
+; CHECK: jmpq *%r[[REG]]
+ ret void
+}
diff --git a/test/CodeGen/X86/x86-64-call.ll b/test/CodeGen/X86/x86-64-call.ll
new file mode 100644
index 0000000..300f8d1
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-call.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux-gnux32 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686-pc-linux -verify-machineinstrs | FileCheck %s -check-prefix=IA32
+
+; trivial test for correct call suffix
+
+define i32 @far() nounwind uwtable {
+entry:
+; CHECK: callq foo
+; IA32: calll foo
+ tail call void @foo() nounwind
+ ret i32 0
+}
+
+declare void @foo()
diff --git a/test/CodeGen/X86/x86-64-pic-10.ll b/test/CodeGen/X86/x86-64-pic-10.ll
index da8082b..8790fa6 100644
--- a/test/CodeGen/X86/x86-64-pic-10.ll
+++ b/test/CodeGen/X86/x86-64-pic-10.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
; RUN: grep "callq g@PLT" %t1
-@g = alias weak i32 ()* @f
+@g = weak alias i32 ()* @f
define void @h() {
entry:
diff --git a/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
new file mode 100644
index 0000000..c476ffd
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s
+; RUN: llc -mtriple=x86_64-pc-nacl < %s | FileCheck -check-prefix=NACL %s
+
+; x32 uses %esp, %ebp as stack and frame pointers
+
+; CHECK-LABEL: foo
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: movq %rdi, -8(%rbp)
+; CHECK: popq %rbp
+; X32ABI-LABEL: foo
+; X32ABI: pushq %rbp
+; X32ABI: movl %esp, %ebp
+; X32ABI: movl %edi, -4(%ebp)
+; X32ABI: popq %rbp
+; NACL-LABEL: foo
+; NACL: pushq %rbp
+; NACL: movq %rsp, %rbp
+; NACL: movl %edi, -4(%rbp)
+; NACL: popq %rbp
+
+
+define void @foo(i32* %a) #0 {
+entry:
+ %a.addr = alloca i32*, align 4
+ %b = alloca i32*, align 4
+ store i32* %a, i32** %a.addr, align 4
+ ret void
+}
+
+attributes #0 = { nounwind uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"}
+
+
diff --git a/test/CodeGen/X86/x86-64-tls-1.ll b/test/CodeGen/X86/x86-64-tls-1.ll
index 641786f..2879fb4 100644
--- a/test/CodeGen/X86/x86-64-tls-1.ll
+++ b/test/CodeGen/X86/x86-64-tls-1.ll
@@ -1,10 +1,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
@tm_nest_level = internal thread_local global i32 0
define i64 @z() nounwind {
-; FIXME: The codegen here is primitive at best and could be much better.
-; The add and the moves can be folded together.
-; CHECK-DAG: movq $tm_nest_level@TPOFF, %rcx
-; CHECK-DAG: movq %fs:0, %rax
-; CHECK: addl %ecx, %eax
+; CHECK: movq $tm_nest_level@TPOFF, %r[[R0:[abcd]]]x
+; CHECK-NEXT: addl %fs:0, %e[[R0]]x
+; CHECK-NEXT: andq $100, %r[[R0]]x
+
ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100)
}
diff --git a/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll b/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll
new file mode 100644
index 0000000..fcf7eae
--- /dev/null
+++ b/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 -mattr=+64bit,+sse2 < %s | FileCheck %s
+
+; DAGCombine may choose to rewrite two loads feeding a select as a select of
+; addresses feeding a single load. This test ensures that when it does, it
+; creates a load whose alignment is the most restrictive (smallest) of the
+; source loads' alignments.
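+;
+; A rough C picture of the rewrite (illustrative sketch; c, p16 and p8 are
+; hypothetical names for the condition and the two slots):
+;
+;   v = c ? *p16 : *p8;    /* two loads feed a select          */
+;   v = *(c ? p16 : p8);   /* one load from a selected address */
+;
+; where *p16 is 16-byte aligned and *p8 only 8-byte aligned, so the merged
+; load may assume no more than 8-byte alignment.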
+
+declare void @sink(<2 x double>)
+
+define void @test1(i1 %cmp) align 2 {
+ %1 = alloca <2 x double>, align 16
+ %2 = alloca <2 x double>, align 8
+
+ %val = load <2 x double>* %1, align 16
+ %val2 = load <2 x double>* %2, align 8
+ %val3 = select i1 %cmp, <2 x double> %val, <2 x double> %val2
+ call void @sink(<2 x double> %val3)
+ ret void
+ ; CHECK: test1
+ ; CHECK: movups
+ ; CHECK: ret
+}
+
+define void @test2(i1 %cmp) align 2 {
+ %1 = alloca <2 x double>, align 16
+ %2 = alloca <2 x double>, align 8
+
+ %val = load <2 x double>* %1, align 16
+ %val2 = load <2 x double>* %2, align 16
+ %val3 = select i1 %cmp, <2 x double> %val, <2 x double> %val2
+ call void @sink(<2 x double> %val3)
+ ret void
+ ; CHECK: test2
+ ; CHECK: movaps
+ ; CHECK: ret
+}
diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
new file mode 100644
index 0000000..4317d8a
--- /dev/null
+++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: LCPI0_0:
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
+; CHECK-LABEL: foo:
+; CHECK: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: andps LCPI0_0(%rip), %xmm0
+; CHECK-NEXT: retq
+
+ %cmp = fcmp oeq <4 x float> %val, %test
+ %ext = zext <4 x i1> %cmp to <4 x i32>
+ %result = sitofp <4 x i32> %ext to <4 x float>
+ ret <4 x float> %result
+}
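+
+; The fold under test: each lane of %cmp is all-ones or all-zeros, zext makes
+; it 0 or 1, and sitofp of 0 or 1 yields 0.0 or 1.0, so the whole chain lowers
+; to an AND of the compare mask with the 1.0f splat LCPI0_0 shown above.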
+
+; Make sure the operation isn't folded when the sizes don't match, as that
+; ends up crashing later when trying to form a bitcast operation for the
+; folded nodes.
+define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwind {
+; CHECK-LABEL: LCPI1_0:
+; CHECK-NEXT: .long 1 ## 0x1
+; CHECK-NEXT: .long 1 ## 0x1
+; CHECK-NEXT: .long 1 ## 0x1
+; CHECK-NEXT: .long 1 ## 0x1
+; CHECK-LABEL: foo1:
+; FIXME: The operation gets scalarized. If/when the compiler learns to make
+; better use of [V]CVTDQ2PD, this will need to be updated.
+; CHECK: cvtsi2sdq
+; CHECK: cvtsi2sdq
+; CHECK: cvtsi2sdq
+; CHECK: cvtsi2sdq
+ %cmp = fcmp oeq <4 x float> %val, %test
+ %ext = zext <4 x i1> %cmp to <4 x i32>
+ %result = sitofp <4 x i32> %ext to <4 x double>
+ store <4 x double> %result, <4 x double>* %p
+ ret void
+}
+
+; Also test the general purpose constant folding of int->fp.
+define void @foo2(<4 x float>* noalias %result) nounwind {
+; CHECK-LABEL: LCPI2_0:
+; CHECK-NEXT: .long 1082130432 ## float 4.000000e+00
+; CHECK-NEXT: .long 1084227584 ## float 5.000000e+00
+; CHECK-NEXT: .long 1086324736 ## float 6.000000e+00
+; CHECK-NEXT: .long 1088421888 ## float 7.000000e+00
+; CHECK-LABEL: foo2:
+; CHECK: movaps LCPI2_0(%rip), %xmm0
+
+ %val = uitofp <4 x i32> <i32 4, i32 5, i32 6, i32 7> to <4 x float>
+ store <4 x float> %val, <4 x float>* %result
+ ret void
+}
+
+; Fold explicit AND operations even when the constant isn't a splat of the
+; single scalar value that the zext would create.
+define <4 x float> @foo3(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: LCPI3_0:
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
+; CHECK-NEXT: .long 0 ## 0x0
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
+; CHECK-NEXT: .long 0 ## 0x0
+; CHECK-LABEL: foo3:
+; CHECK: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: andps LCPI3_0(%rip), %xmm0
+ %cmp = fcmp oeq <4 x float> %val, %test
+ %ext = zext <4 x i1> %cmp to <4 x i32>
+ %and = and <4 x i32> %ext, <i32 255, i32 256, i32 257, i32 258>
+ %result = sitofp <4 x i32> %and to <4 x float>
+ ret <4 x float> %result
+}
diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll
index f078631..54a4d6aa 100644
--- a/test/CodeGen/X86/xaluo.ll
+++ b/test/CodeGen/X86/xaluo.ll
@@ -1,7 +1,5 @@
-; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=DAG
-; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST
-; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
+; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
;
; Get the actual value of the overflow bit.
@@ -9,12 +7,9 @@
; SADDO reg, reg
define zeroext i1 @saddo.i8(i8 signext %v1, i8 signext %v2, i8* %res) {
entry:
-; DAG-LABEL: saddo.i8
-; DAG: addb %sil, %dil
-; DAG-NEXT: seto %al
-; FAST-LABEL: saddo.i8
-; FAST: addb %sil, %dil
-; FAST-NEXT: seto %al
+; CHECK-LABEL: saddo.i8
+; CHECK: addb %sil, %dil
+; CHECK-NEXT: seto %al
%t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 %v2)
%val = extractvalue {i8, i1} %t, 0
%obit = extractvalue {i8, i1} %t, 1
@@ -24,12 +19,9 @@ entry:
define zeroext i1 @saddo.i16(i16 %v1, i16 %v2, i16* %res) {
entry:
-; DAG-LABEL: saddo.i16
-; DAG: addw %si, %di
-; DAG-NEXT: seto %al
-; FAST-LABEL: saddo.i16
-; FAST: addw %si, %di
-; FAST-NEXT: seto %al
+; CHECK-LABEL: saddo.i16
+; CHECK: addw %si, %di
+; CHECK-NEXT: seto %al
%t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 %v2)
%val = extractvalue {i16, i1} %t, 0
%obit = extractvalue {i16, i1} %t, 1
@@ -39,12 +31,9 @@ entry:
define zeroext i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; DAG-LABEL: saddo.i32
-; DAG: addl %esi, %edi
-; DAG-NEXT: seto %al
-; FAST-LABEL: saddo.i32
-; FAST: addl %esi, %edi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: saddo.i32
+; CHECK: addl %esi, %edi
+; CHECK-NEXT: seto %al
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -54,12 +43,9 @@ entry:
define zeroext i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
-; DAG-LABEL: saddo.i64
-; DAG: addq %rsi, %rdi
-; DAG-NEXT: seto %al
-; FAST-LABEL: saddo.i64
-; FAST: addq %rsi, %rdi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: saddo.i64
+; CHECK: addq %rsi, %rdi
+; CHECK-NEXT: seto %al
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -67,16 +53,48 @@ entry:
ret i1 %obit
}
-; SADDO reg, imm | imm, reg
-; FIXME: INC isn't supported in FastISel yet
-define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
+; SADDO reg, 1 | INC
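+; (INC sets OF, so signed overflow of "x + 1" is still observable via seto.)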
+define zeroext i1 @saddo.inc.i8(i8 %v1, i8* %res) {
+entry:
+; CHECK-LABEL: saddo.inc.i8
+; CHECK: incb %dil
+; CHECK-NEXT: seto %al
+ %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 1)
+ %val = extractvalue {i8, i1} %t, 0
+ %obit = extractvalue {i8, i1} %t, 1
+ store i8 %val, i8* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo.inc.i16(i16 %v1, i16* %res) {
+entry:
+; CHECK-LABEL: saddo.inc.i16
+; CHECK: incw %di
+; CHECK-NEXT: seto %al
+ %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 1)
+ %val = extractvalue {i16, i1} %t, 0
+ %obit = extractvalue {i16, i1} %t, 1
+ store i16 %val, i16* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo.inc.i32(i32 %v1, i32* %res) {
entry:
-; DAG-LABEL: saddo.i64imm1
-; DAG: incq %rdi
-; DAG-NEXT: seto %al
-; FAST-LABEL: saddo.i64imm1
-; FAST: addq $1, %rdi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: saddo.inc.i32
+; CHECK: incl %edi
+; CHECK-NEXT: seto %al
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 1)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo.inc.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: saddo.inc.i64
+; CHECK: incq %rdi
+; CHECK-NEXT: seto %al
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 1)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -84,17 +102,18 @@ entry:
ret i1 %obit
}
+; SADDO reg, imm | imm, reg
; FIXME: DAG doesn't optimize immediates on the LHS.
-define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
entry:
-; DAG-LABEL: saddo.i64imm2
-; DAG: mov
-; DAG-NEXT: addq
-; DAG-NEXT: seto
-; FAST-LABEL: saddo.i64imm2
-; FAST: addq $1, %rdi
-; FAST-NEXT: seto %al
- %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 1, i64 %v1)
+; SDAG-LABEL: saddo.i64imm1
+; SDAG: mov
+; SDAG-NEXT: addq
+; SDAG-NEXT: seto
+; FAST-LABEL: saddo.i64imm1
+; FAST: addq $2, %rdi
+; FAST-NEXT: seto %al
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 2, i64 %v1)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
@@ -102,14 +121,11 @@ entry:
}
; Check boundary conditions for large immediates.
-define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
entry:
-; DAG-LABEL: saddo.i64imm3
-; DAG: addq $-2147483648, %rdi
-; DAG-NEXT: seto %al
-; FAST-LABEL: saddo.i64imm3
-; FAST: addq $-2147483648, %rdi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: saddo.i64imm2
+; CHECK: addq $-2147483648, %rdi
+; CHECK-NEXT: seto %al
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -117,16 +133,12 @@ entry:
ret i1 %obit
}
-define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
entry:
-; DAG-LABEL: saddo.i64imm4
-; DAG: movabsq $-21474836489, %[[REG:[a-z]+]]
-; DAG-NEXT: addq %rdi, %[[REG]]
-; DAG-NEXT: seto
-; FAST-LABEL: saddo.i64imm4
-; FAST: movabsq $-21474836489, %[[REG:[a-z]+]]
-; FAST-NEXT: addq %rdi, %[[REG]]
-; FAST-NEXT: seto
+; CHECK-LABEL: saddo.i64imm3
+; CHECK: movabsq $-21474836489, %[[REG:[a-z]+]]
+; CHECK-NEXT: addq %rdi, %[[REG]]
+; CHECK-NEXT: seto
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -134,14 +146,11 @@ entry:
ret i1 %obit
}
-define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
entry:
-; DAG-LABEL: saddo.i64imm5
-; DAG: addq $2147483647, %rdi
-; DAG-NEXT: seto
-; FAST-LABEL: saddo.i64imm5
-; FAST: addq $2147483647, %rdi
-; FAST-NEXT: seto
+; CHECK-LABEL: saddo.i64imm4
+; CHECK: addq $2147483647, %rdi
+; CHECK-NEXT: seto
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -149,17 +158,12 @@ entry:
ret i1 %obit
}
-; TODO: FastISel shouldn't use movabsq.
-define zeroext i1 @saddo.i64imm6(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
entry:
-; DAG-LABEL: saddo.i64imm6
-; DAG: movl $2147483648, %ecx
-; DAG: addq %rdi, %rcx
-; DAG-NEXT: seto
-; FAST-LABEL: saddo.i64imm6
-; FAST: movabsq $2147483648, %[[REG:[a-z]+]]
-; FAST: addq %rdi, %[[REG]]
-; FAST-NEXT: seto
+; CHECK-LABEL: saddo.i64imm5
+; CHECK: movl $2147483648
+; CHECK: addq %rdi
+; CHECK-NEXT: seto
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483648)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -170,12 +174,9 @@ entry:
; UADDO
define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; DAG-LABEL: uaddo.i32
-; DAG: addl %esi, %edi
-; DAG-NEXT: setb %al
-; FAST-LABEL: uaddo.i32
-; FAST: addl %esi, %edi
-; FAST-NEXT: setb %al
+; CHECK-LABEL: uaddo.i32
+; CHECK: addl %esi, %edi
+; CHECK-NEXT: setb %al
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -185,12 +186,9 @@ entry:
define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
-; DAG-LABEL: uaddo.i64
-; DAG: addq %rsi, %rdi
-; DAG-NEXT: setb %al
-; FAST-LABEL: uaddo.i64
-; FAST: addq %rsi, %rdi
-; FAST-NEXT: setb %al
+; CHECK-LABEL: uaddo.i64
+; CHECK: addq %rsi, %rdi
+; CHECK-NEXT: setb %al
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -198,15 +196,57 @@ entry:
ret i1 %obit
}
+; UADDO reg, 1 | NOT INC
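+; (INC leaves CF untouched, so unsigned overflow cannot be observed via setb
+;  after it; an explicit ADD must be used instead.)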
+define zeroext i1 @uaddo.inc.i8(i8 %v1, i8* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i8
+; CHECK-NOT: incb %dil
+ %t = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %v1, i8 1)
+ %val = extractvalue {i8, i1} %t, 0
+ %obit = extractvalue {i8, i1} %t, 1
+ store i8 %val, i8* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i16(i16 %v1, i16* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i16
+; CHECK-NOT: incw %di
+ %t = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %v1, i16 1)
+ %val = extractvalue {i16, i1} %t, 0
+ %obit = extractvalue {i16, i1} %t, 1
+ store i16 %val, i16* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i32
+; CHECK-NOT: incl %edi
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 1)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i64
+; CHECK-NOT: incq %rdi
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 1)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
; SSUBO
define zeroext i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; DAG-LABEL: ssubo.i32
-; DAG: subl %esi, %edi
-; DAG-NEXT: seto %al
-; FAST-LABEL: ssubo.i32
-; FAST: subl %esi, %edi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: ssubo.i32
+; CHECK: subl %esi, %edi
+; CHECK-NEXT: seto %al
%t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -216,12 +256,9 @@ entry:
define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
-; DAG-LABEL: ssubo.i64
-; DAG: subq %rsi, %rdi
-; DAG-NEXT: seto %al
-; FAST-LABEL: ssubo.i64
-; FAST: subq %rsi, %rdi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: ssubo.i64
+; CHECK: subq %rsi, %rdi
+; CHECK-NEXT: seto %al
%t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -232,12 +269,9 @@ entry:
; USUBO
define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; DAG-LABEL: usubo.i32
-; DAG: subl %esi, %edi
-; DAG-NEXT: setb %al
-; FAST-LABEL: usubo.i32
-; FAST: subl %esi, %edi
-; FAST-NEXT: setb %al
+; CHECK-LABEL: usubo.i32
+; CHECK: subl %esi, %edi
+; CHECK-NEXT: setb %al
%t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -247,12 +281,9 @@ entry:
define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
-; DAG-LABEL: usubo.i64
-; DAG: subq %rsi, %rdi
-; DAG-NEXT: setb %al
-; FAST-LABEL: usubo.i64
-; FAST: subq %rsi, %rdi
-; FAST-NEXT: setb %al
+; CHECK-LABEL: usubo.i64
+; CHECK: subq %rsi, %rdi
+; CHECK-NEXT: setb %al
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -263,10 +294,10 @@ entry:
; SMULO
define zeroext i1 @smulo.i8(i8 %v1, i8 %v2, i8* %res) {
entry:
-; FAST-LABEL: smulo.i8
-; FAST: movb %dil, %al
-; FAST-NEXT: imulb %sil
-; FAST-NEXT: seto %cl
+; CHECK-LABEL: smulo.i8
+; CHECK: movb %dil, %al
+; CHECK-NEXT: imulb %sil
+; CHECK-NEXT: seto %cl
%t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
%val = extractvalue {i8, i1} %t, 0
%obit = extractvalue {i8, i1} %t, 1
@@ -276,12 +307,9 @@ entry:
define zeroext i1 @smulo.i16(i16 %v1, i16 %v2, i16* %res) {
entry:
-; DAG-LABEL: smulo.i16
-; DAG: imulw %si, %di
-; DAG-NEXT: seto %al
-; FAST-LABEL: smulo.i16
-; FAST: imulw %si, %di
-; FAST-NEXT: seto %al
+; CHECK-LABEL: smulo.i16
+; CHECK: imulw %si, %di
+; CHECK-NEXT: seto %al
%t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
%val = extractvalue {i16, i1} %t, 0
%obit = extractvalue {i16, i1} %t, 1
@@ -291,12 +319,9 @@ entry:
define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; DAG-LABEL: smulo.i32
-; DAG: imull %esi, %edi
-; DAG-NEXT: seto %al
-; FAST-LABEL: smulo.i32
-; FAST: imull %esi, %edi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: smulo.i32
+; CHECK: imull %esi, %edi
+; CHECK-NEXT: seto %al
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -306,12 +331,9 @@ entry:
define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
-; DAG-LABEL: smulo.i64
-; DAG: imulq %rsi, %rdi
-; DAG-NEXT: seto %al
-; FAST-LABEL: smulo.i64
-; FAST: imulq %rsi, %rdi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: smulo.i64
+; CHECK: imulq %rsi, %rdi
+; CHECK-NEXT: seto %al
%t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -322,10 +344,10 @@ entry:
; UMULO
define zeroext i1 @umulo.i8(i8 %v1, i8 %v2, i8* %res) {
entry:
-; FAST-LABEL: umulo.i8
-; FAST: movb %dil, %al
-; FAST-NEXT: mulb %sil
-; FAST-NEXT: seto %cl
+; CHECK-LABEL: umulo.i8
+; CHECK: movb %dil, %al
+; CHECK-NEXT: mulb %sil
+; CHECK-NEXT: seto %cl
%t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
%val = extractvalue {i8, i1} %t, 0
%obit = extractvalue {i8, i1} %t, 1
@@ -335,12 +357,9 @@ entry:
define zeroext i1 @umulo.i16(i16 %v1, i16 %v2, i16* %res) {
entry:
-; DAG-LABEL: umulo.i16
-; DAG: mulw %si
-; DAG-NEXT: seto
-; FAST-LABEL: umulo.i16
-; FAST: mulw %si
-; FAST-NEXT: seto
+; CHECK-LABEL: umulo.i16
+; CHECK: mulw %si
+; CHECK-NEXT: seto
%t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
%val = extractvalue {i16, i1} %t, 0
%obit = extractvalue {i16, i1} %t, 1
@@ -350,12 +369,9 @@ entry:
define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; DAG-LABEL: umulo.i32
-; DAG: mull %esi
-; DAG-NEXT: seto
-; FAST-LABEL: umulo.i32
-; FAST: mull %esi
-; FAST-NEXT: seto
+; CHECK-LABEL: umulo.i32
+; CHECK: mull %esi
+; CHECK-NEXT: seto
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -365,12 +381,9 @@ entry:
define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
-; DAG-LABEL: umulo.i64
-; DAG: mulq %rsi
-; DAG-NEXT: seto
-; FAST-LABEL: umulo.i64
-; FAST: mulq %rsi
-; FAST-NEXT: seto
+; CHECK-LABEL: umulo.i64
+; CHECK: mulq %rsi
+; CHECK-NEXT: seto
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -383,9 +396,9 @@ entry:
;
define i32 @saddo.select.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: saddo.select.i32
-; CHECK: addl %esi, %eax
-; CHECK-NEXT: cmovol %edi, %esi
+; CHECK-LABEL: saddo.select.i32
+; CHECK: addl %esi, %eax
+; CHECK-NEXT: cmovol %edi, %esi
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -394,9 +407,9 @@ entry:
define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: saddo.select.i64
-; CHECK: addq %rsi, %rax
-; CHECK-NEXT: cmovoq %rdi, %rsi
+; CHECK-LABEL: saddo.select.i64
+; CHECK: addq %rsi, %rax
+; CHECK-NEXT: cmovoq %rdi, %rsi
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -405,9 +418,9 @@ entry:
define i32 @uaddo.select.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: uaddo.select.i32
-; CHECK: addl %esi, %eax
-; CHECK-NEXT: cmovbl %edi, %esi
+; CHECK-LABEL: uaddo.select.i32
+; CHECK: addl %esi, %eax
+; CHECK-NEXT: cmovbl %edi, %esi
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -416,9 +429,9 @@ entry:
define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: uaddo.select.i64
-; CHECK: addq %rsi, %rax
-; CHECK-NEXT: cmovbq %rdi, %rsi
+; CHECK-LABEL: uaddo.select.i64
+; CHECK: addq %rsi, %rax
+; CHECK-NEXT: cmovbq %rdi, %rsi
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -427,9 +440,9 @@ entry:
define i32 @ssubo.select.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: ssubo.select.i32
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: cmovol %edi, %esi
+; CHECK-LABEL: ssubo.select.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: cmovol %edi, %esi
%t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -438,9 +451,9 @@ entry:
define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: ssubo.select.i64
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovoq %rdi, %rsi
+; CHECK-LABEL: ssubo.select.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovoq %rdi, %rsi
%t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -449,9 +462,9 @@ entry:
define i32 @usubo.select.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: usubo.select.i32
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: cmovbl %edi, %esi
+; CHECK-LABEL: usubo.select.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: cmovbl %edi, %esi
%t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -460,9 +473,9 @@ entry:
define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: usubo.select.i64
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovbq %rdi, %rsi
+; CHECK-LABEL: usubo.select.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovbq %rdi, %rsi
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -471,9 +484,9 @@ entry:
define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: smulo.select.i32
-; CHECK: imull %esi, %eax
-; CHECK-NEXT: cmovol %edi, %esi
+; CHECK-LABEL: smulo.select.i32
+; CHECK: imull %esi, %eax
+; CHECK-NEXT: cmovol %edi, %esi
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -482,9 +495,9 @@ entry:
define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: smulo.select.i64
-; CHECK: imulq %rsi, %rax
-; CHECK-NEXT: cmovoq %rdi, %rsi
+; CHECK-LABEL: smulo.select.i64
+; CHECK: imulq %rsi, %rax
+; CHECK-NEXT: cmovoq %rdi, %rsi
%t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -493,9 +506,9 @@ entry:
define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: umulo.select.i32
-; CHECK: mull %esi
-; CHECK-NEXT: cmovol %edi, %esi
+; CHECK-LABEL: umulo.select.i32
+; CHECK: mull %esi
+; CHECK-NEXT: cmovol %edi, %esi
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -504,9 +517,9 @@ entry:
define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: umulo.select.i64
-; CHECK: mulq %rsi
-; CHECK-NEXT: cmovoq %rdi, %rsi
+; CHECK-LABEL: umulo.select.i64
+; CHECK: mulq %rsi
+; CHECK-NEXT: cmovoq %rdi, %rsi
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -519,9 +532,9 @@ entry:
;
define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: saddo.br.i32
-; CHECK: addl %esi, %edi
-; CHECK-NEXT: jo
+; CHECK-LABEL: saddo.br.i32
+; CHECK: addl %esi, %edi
+; CHECK-NEXT: jo
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -536,9 +549,9 @@ continue:
define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: saddo.br.i64
-; CHECK: addq %rsi, %rdi
-; CHECK-NEXT: jo
+; CHECK-LABEL: saddo.br.i64
+; CHECK: addq %rsi, %rdi
+; CHECK-NEXT: jo
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -553,9 +566,9 @@ continue:
define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: uaddo.br.i32
-; CHECK: addl %esi, %edi
-; CHECK-NEXT: jb
+; CHECK-LABEL: uaddo.br.i32
+; CHECK: addl %esi, %edi
+; CHECK-NEXT: jb
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -570,9 +583,9 @@ continue:
define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: uaddo.br.i64
-; CHECK: addq %rsi, %rdi
-; CHECK-NEXT: jb
+; CHECK-LABEL: uaddo.br.i64
+; CHECK: addq %rsi, %rdi
+; CHECK-NEXT: jb
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -587,9 +600,9 @@ continue:
define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: ssubo.br.i32
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jo
+; CHECK-LABEL: ssubo.br.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jo
%t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -604,9 +617,9 @@ continue:
define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: ssubo.br.i64
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jo
+; CHECK-LABEL: ssubo.br.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jo
%t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -621,9 +634,9 @@ continue:
define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: usubo.br.i32
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jb
+; CHECK-LABEL: usubo.br.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jb
%t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -638,9 +651,9 @@ continue:
define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: usubo.br.i64
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jb
+; CHECK-LABEL: usubo.br.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jb
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -655,9 +668,9 @@ continue:
define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: smulo.br.i32
-; CHECK: imull %esi, %edi
-; CHECK-NEXT: jo
+; CHECK-LABEL: smulo.br.i32
+; CHECK: imull %esi, %edi
+; CHECK-NEXT: jo
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -672,9 +685,9 @@ continue:
define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: smulo.br.i64
-; CHECK: imulq %rsi, %rdi
-; CHECK-NEXT: jo
+; CHECK-LABEL: smulo.br.i64
+; CHECK: imulq %rsi, %rdi
+; CHECK-NEXT: jo
%t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -689,9 +702,9 @@ continue:
define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: umulo.br.i32
-; CHECK: mull %esi
-; CHECK-NEXT: jo
+; CHECK-LABEL: umulo.br.i32
+; CHECK: mull %esi
+; CHECK-NEXT: jo
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -706,9 +719,9 @@ continue:
define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: umulo.br.i64
-; CHECK: mulq %rsi
-; CHECK-NEXT: jo
+; CHECK-LABEL: umulo.br.i64
+; CHECK: mulq %rsi
+; CHECK-NEXT: jo
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -725,6 +738,8 @@ declare {i8, i1} @llvm.sadd.with.overflow.i8 (i8, i8 ) nounwind readnone
declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i8, i1} @llvm.uadd.with.overflow.i8 (i8, i8 ) nounwind readnone
+declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) nounwind readnone
declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone