diff options
author | Stephen Hines <srhines@google.com> | 2014-12-01 14:51:49 -0800 |
---|---|---|
committer | Stephen Hines <srhines@google.com> | 2014-12-02 16:08:10 -0800 |
commit | 37ed9c199ca639565f6ce88105f9e39e898d82d0 (patch) | |
tree | 8fb36d3910e3ee4c4e1b7422f4f017108efc52f5 /test/CodeGen/X86 | |
parent | d2327b22152ced7bc46dc629fc908959e8a52d03 (diff) | |
download | external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.zip external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.tar.gz external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.tar.bz2 |
Update aosp/master LLVM for rebase to r222494.
Change-Id: Ic787f5e0124df789bd26f3f24680f45e678eef2d
Diffstat (limited to 'test/CodeGen/X86')
355 files changed, 33385 insertions, 8947 deletions
diff --git a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll index 638d399..62c503d 100644 --- a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll +++ b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=i686-pc-linux-gnu | FileCheck %s -@__gthrw_pthread_once = alias weak i32 (i32*, void ()*)* @pthread_once ; <i32 (i32*, void ()*)*> [#uses=0] +@__gthrw_pthread_once = weak alias i32 (i32*, void ()*)* @pthread_once ; <i32 (i32*, void ()*)*> [#uses=0] define weak i32 @pthread_once(i32*, void ()*) { ret i32 0 diff --git a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll index d2d5149..35857b7 100644 --- a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll +++ b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep xor | grep CPI +; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s +; CHECK: xorpd {{.*}}{{LCPI0_0|__xmm@}} define void @casin({ double, double }* sret %agg.result, double %z.0, double %z.1) nounwind { entry: %memtmp = alloca { double, double }, align 8 ; <{ double, double }*> [#uses=3] diff --git a/test/CodeGen/X86/2008-06-18-BadShuffle.ll b/test/CodeGen/X86/2008-06-18-BadShuffle.ll deleted file mode 100644 index 66f9065..0000000 --- a/test/CodeGen/X86/2008-06-18-BadShuffle.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=i386 -mattr=+sse2 | grep pinsrw - -; Test to make sure we actually insert the bottom element of the vector -define <8 x i16> @a(<8 x i16> %a) nounwind { -entry: - shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> < i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8 > - %add = add <8 x i16> %0, %a - ret <8 x i16> %add -} - diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll index 296f0ca..207d122 100644 --- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll +++ 
b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll @@ -14,9 +14,9 @@ entry: %2 = alloca i64 ; <i64*> [#uses=1] %3 = alloca i64 ; <i64*> [#uses=6] %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] - call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0), !dbg !7 + call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0, metadata !{metadata !"0x102"}), !dbg !7 store i8* %s1, i8** %s1_addr - call void @llvm.dbg.declare(metadata !{[0 x i8]** %str.0}, metadata !8), !dbg !7 + call void @llvm.dbg.declare(metadata !{[0 x i8]** %str.0}, metadata !8, metadata !{metadata !"0x102"}), !dbg !7 %4 = call i8* @llvm.stacksave(), !dbg !7 ; <i8*> [#uses=1] store i8* %4, i8** %saved_stack.1, align 8, !dbg !7 %5 = load i8** %s1_addr, align 8, !dbg !13 ; <i8*> [#uses=1] @@ -58,7 +58,7 @@ return: ; preds = %entry ret i8 %retval12, !dbg !16 } -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone declare i8* @llvm.stacksave() nounwind @@ -66,21 +66,21 @@ declare i64 @strlen(i8*) nounwind readonly declare void @llvm.stackrestore(i8*) nounwind -!0 = metadata !{i32 459009, metadata !1, metadata !"s1", metadata !2, i32 2, metadata !6} ; [ DW_TAG_arg_variable ] -!1 = metadata !{i32 458798, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] -!2 = metadata !{i32 458769, metadata !17, i32 1, metadata !"4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!3 = metadata !{i32 458773, null, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!0 = metadata !{metadata !"0x101\00s1\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ] +!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\000", i32 0, metadata !2, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ] +!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !17, metadata !18, metadata !18, null, null, null} ; [ DW_TAG_compile_unit ] +!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !4 = metadata !{metadata !5, metadata !6} -!5 = metadata !{i32 458788, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] -!6 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ] +!5 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ] +!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !5} ; [ DW_TAG_pointer_type ] !7 = metadata !{i32 2, i32 0, metadata !1, null} -!8 = metadata !{i32 459008, metadata !1, metadata !"str.0", metadata !2, i32 3, metadata !9} ; [ DW_TAG_auto_variable ] -!9 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ] -!10 = metadata !{i32 458753, null, metadata !2, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !5, metadata 
!11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char] +!8 = metadata !{metadata !"0x100\00str.0\003\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ] +!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", null, metadata !2, metadata !10} ; [ DW_TAG_pointer_type ] +!10 = metadata !{metadata !"0x1\00\000\008\008\000\000", null, metadata !2, metadata !5, metadata !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char] !11 = metadata !{metadata !12} -!12 = metadata !{i32 458785, i64 0, i64 1} ; [ DW_TAG_subrange_type ] +!12 = metadata !{metadata !"0x21\000\001"} ; [ DW_TAG_subrange_type ] !13 = metadata !{i32 3, i32 0, metadata !14, null} -!14 = metadata !{i32 458763, metadata !17, metadata !1, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] +!14 = metadata !{metadata !"0xb\000\000\000", metadata !17, metadata !1} ; [ DW_TAG_lexical_block ] !15 = metadata !{i32 4, i32 0, metadata !14, null} !16 = metadata !{i32 5, i32 0, metadata !14, null} !17 = metadata !{metadata !"vla.c", metadata !"/tmp/"} diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll index 764c2cd..e046b96 100644 --- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll +++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "4 machine-licm" +; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "7 machine-licm" ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s ; rdar://6627786 ; rdar://7792037 diff --git a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll deleted file mode 100644 index e1930e0..0000000 --- a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -mtriple=i386-apple-darwin10.0 
-relocation-model=pic -asm-verbose=false \ -; RUN: -mcpu=generic -disable-fp-elim -mattr=-sse4.1,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \ -; RUN: FileCheck %s -; rdar://6808032 - -; CHECK: pextrw $14 -; CHECK-NEXT: shrl $8 -; CHECK-NEXT: pinsrw - -define void @update(i8** %args_list) nounwind { -entry: - %cmp.i = icmp eq i32 0, 0 ; <i1> [#uses=1] - br i1 %cmp.i, label %if.then.i, label %test_cl.exit - -if.then.i: ; preds = %entry - %val = load <16 x i8> addrspace(1)* null ; <<16 x i8>> [#uses=8] - %tmp10.i = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 undef, i8 0, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef>, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 4, i32 undef, i32 6, i32 undef, i32 29, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1] - %tmp17.i = shufflevector <16 x i8> %tmp10.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 18, i32 4, i32 undef, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1] - %tmp24.i = shufflevector <16 x i8> %tmp17.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 24, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1] - %tmp31.i = shufflevector <16 x i8> %tmp24.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 21, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1] - %tmp38.i = shufflevector <16 x i8> %tmp31.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 27, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 13, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1] - %tmp45.i = shufflevector <16 x i8> %tmp38.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 13, 
i32 29, i32 undef> ; <<16 x i8>> [#uses=1] - %tmp52.i = shufflevector <16 x i8> %tmp45.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 21, i32 10, i32 11, i32 12, i32 13, i32 14, i32 undef> ; <<16 x i8>> [#uses=1] - %tmp59.i = shufflevector <16 x i8> %tmp52.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 20> ; <<16 x i8>> [#uses=1] - store <16 x i8> %tmp59.i, <16 x i8> addrspace(1)* null - ret void - -test_cl.exit: ; preds = %entry - ret void -} diff --git a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll index 50c62df..ffbe02c 100644 --- a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll +++ b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll @@ -1,9 +1,11 @@ -; RUN: llc < %s -march=x86 -mcpu=core2 > %t1 -; RUN: grep movzwl %t1 | count 2 -; RUN: grep movzbl %t1 | count 1 -; RUN: grep movd %t1 | count 4 +; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s define <4 x i16> @a(i32* %x1) nounwind { +; CHECK-LABEL: a: +; CHECK: shrl %[[R:[^,]+]] +; CHECK-NEXT: movd %[[R]], %xmm0 +; CHECK-NEXT: retl + %x2 = load i32* %x1 %x3 = lshr i32 %x2, 1 %x = trunc i32 %x3 to i16 @@ -12,6 +14,12 @@ define <4 x i16> @a(i32* %x1) nounwind { } define <8 x i16> @b(i32* %x1) nounwind { +; CHECK-LABEL: b: +; CHECK: shrl %e[[R:.]]x +; CHECK-NEXT: movzwl %[[R]]x, %e[[R]]x +; CHECK-NEXT: movd %e[[R]]x, %xmm0 +; CHECK-NEXT: retl + %x2 = load i32* %x1 %x3 = lshr i32 %x2, 1 %x = trunc i32 %x3 to i16 @@ -20,6 +28,12 @@ define <8 x i16> @b(i32* %x1) nounwind { } define <8 x i8> @c(i32* %x1) nounwind { +; CHECK-LABEL: c: +; CHECK: shrl %e[[R:.]]x +; CHECK-NEXT: movzwl %[[R]]x, %e[[R]]x +; CHECK-NEXT: movd %e[[R]]x, %xmm0 +; CHECK-NEXT: retl + %x2 = load i32* %x1 %x3 = lshr i32 %x2, 1 %x = trunc i32 %x3 to i8 @@ -28,6 +42,12 @@ define <8 x i8> @c(i32* %x1) nounwind { } define <16 x i8> @d(i32* %x1) nounwind { +; 
CHECK-LABEL: d: +; CHECK: shrl %e[[R:.]]x +; CHECK-NEXT: movzbl %[[R]]l, %e[[R]]x +; CHECK-NEXT: movd %e[[R]]x, %xmm0 +; CHECK-NEXT: retl + %x2 = load i32* %x1 %x3 = lshr i32 %x2, 1 %x = trunc i32 %x3 to i8 diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll index a936edc..6fe2ee4 100644 --- a/test/CodeGen/X86/2009-10-16-Scope.ll +++ b/test/CodeGen/X86/2009-10-16-Scope.ll @@ -9,7 +9,7 @@ entry: br label %do.body, !dbg !0 do.body: ; preds = %entry - call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4) + call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4, metadata !{metadata !"0x102"}) %conv = ptrtoint i32* %count_ to i32, !dbg !0 ; <i32> [#uses=1] %call = call i32 @foo(i32 %conv) ssp, !dbg !0 ; <i32> [#uses=0] br label %do.end, !dbg !0 @@ -18,17 +18,17 @@ do.end: ; preds = %do.body ret void, !dbg !7 } -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone declare i32 @foo(i32) ssp !0 = metadata !{i32 5, i32 2, metadata !1, null} -!1 = metadata !{i32 458763, null, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ] -!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] -!3 = metadata !{i32 458769, metadata !8, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, null, metadata !9, null, null, null, metadata !""}; [DW_TAG_compile_unit ] -!4 = metadata !{i32 459008, metadata !5, metadata !"count_", metadata !3, i32 5, metadata !6}; [ DW_TAG_auto_variable ] -!5 = metadata !{i32 458763, null, metadata !1, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ] -!6 = metadata !{i32 458788, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}; [DW_TAG_base_type ] +!1 = metadata !{metadata !"0xb\001\001\000", null, 
metadata !2}; [DW_TAG_lexical_block ] +!2 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", i32 0, metadata !3, null, null, null, null, null, null} ; [ DW_TAG_subprogram ] +!3 = metadata !{metadata !"0x11\0012\00clang 1.1\001\00\000\00\000", metadata !8, null, metadata !9, null, null, null}; [DW_TAG_compile_unit ] +!4 = metadata !{metadata !"0x100\00count_\005\000", metadata !5, metadata !3, metadata !6}; [ DW_TAG_auto_variable ] +!5 = metadata !{metadata !"0xb\001\001\000", null, metadata !1}; [DW_TAG_lexical_block ] +!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3}; [DW_TAG_base_type ] !7 = metadata !{i32 6, i32 1, metadata !2, null} !8 = metadata !{metadata !"genmodes.i", metadata !"/Users/yash/Downloads"} !9 = metadata !{i32 0} diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll index f99e682..0e2ed9d 100644 --- a/test/CodeGen/X86/2010-01-18-DbgValue.ll +++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll @@ -12,7 +12,7 @@ entry: %retval = alloca double ; <double*> [#uses=2] %0 = alloca double ; <double*> [#uses=2] %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] - call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0), !dbg !15 + call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0, metadata !{metadata !"0x102"}), !dbg !15 %1 = getelementptr inbounds %struct.Rect* %my_r0, i32 0, i32 0, !dbg !16 ; <%struct.Pt*> [#uses=1] %2 = getelementptr inbounds %struct.Pt* %1, i32 0, i32 0, !dbg !16 ; <double*> [#uses=1] %3 = load double* %2, align 8, !dbg !16 ; <double> [#uses=1] @@ -26,30 +26,30 @@ return: ; preds = %entry ret double %retval1, !dbg !16 } -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!21} -!0 = metadata !{i32 786689, metadata !1, metadata !"my_r0", metadata !2, 
i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ] -!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 11, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (%struct.Rect*)* @foo, null, null, null, i32 11} ; [ DW_TAG_subprogram ] -!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ] -!3 = metadata !{i32 786449, metadata !19, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!0 = metadata !{metadata !"0x101\00my_r0\0011\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ] +!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0011\000\001\000\006\000\000\0011", metadata !19, metadata !2, metadata !4, null, double (%struct.Rect*)* @foo, null, null, null} ; [ DW_TAG_subprogram ] +!2 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ] +!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. 
build 5658) (LLVM build)\000\00\000\00\000", metadata !19, metadata !20, metadata !20, metadata !18, null, null} ; [ DW_TAG_compile_unit ] +!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !19, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !5 = metadata !{metadata !6, metadata !7} -!6 = metadata !{i32 786468, metadata !19, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] -!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ] +!6 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !19, metadata !2} ; [ DW_TAG_base_type ] +!7 = metadata !{metadata !"0x13\00Rect\006\00256\0064\000\000\000", metadata !19, metadata !2, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ] !8 = metadata !{metadata !9, metadata !14} -!9 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P1", i32 7, i64 128, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ] -!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ] +!9 = metadata !{metadata !"0xd\00P1\007\00128\0064\000\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ] +!10 = metadata !{metadata !"0x13\00Pt\001\00128\0064\000\000\000", metadata !19, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ] !11 = metadata !{metadata !12, metadata !13} -!12 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"x", 
i32 2, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ] -!13 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"y", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ] -!14 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P2", i32 8, i64 128, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ] +!12 = metadata !{metadata !"0xd\00x\002\0064\0064\000\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ] +!13 = metadata !{metadata !"0xd\00y\003\0064\0064\0064\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ] +!14 = metadata !{metadata !"0xd\00P2\008\00128\0064\00128\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ] !15 = metadata !{i32 11, i32 0, metadata !1, null} !16 = metadata !{i32 12, i32 0, metadata !17, null} -!17 = metadata !{i32 786443, metadata !19, metadata !1, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ] +!17 = metadata !{metadata !"0xb\0011\000\000", metadata !19, metadata !1} ; [ DW_TAG_lexical_block ] !18 = metadata !{metadata !1} !19 = metadata !{metadata !"b2.c", metadata !"/tmp/"} !20 = metadata !{i32 0} -!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll index 4d4e8c1..a35efdc 100644 --- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll +++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll @@ -8,28 +8,28 @@ define i32 @"main(tart.core.String[])->int32"(i32 %args) { entry: - tail call void @llvm.dbg.value(metadata !14, i64 0, metadata !8) + tail call void @llvm.dbg.value(metadata !14, i64 0, metadata !8, metadata !{metadata !"0x102"}) tail call void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType* @.type.SwitchStmtTest) ; <%tart.core.Object*> [#uses=2] ret i32 3 } -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind 
readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType*) nounwind readnone -!0 = metadata !{i32 458769, metadata !15, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !16, metadata !16, null, null, null, i32 0} ; [ DW_TAG_compile_unit ] -!1 = metadata !{i32 458790, metadata !15, metadata !0, metadata !"", i32 0, i64 192, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_const_type ] -!2 = metadata !{i32 458771, metadata !15, metadata !0, metadata !"C", i32 1, i64 192, i64 64, i64 0, i32 0, null, metadata !3, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ] +!0 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !15, metadata !16, metadata !16, null, null, null} ; [ DW_TAG_compile_unit ] +!1 = metadata !{metadata !"0x26\00\000\00192\0064\000\000", metadata !15, metadata !0, metadata !2} ; [ DW_TAG_const_type ] +!2 = metadata !{metadata !"0x13\00C\001\00192\0064\000\000\000", metadata !15, metadata !0, null, metadata !3, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ] !3 = metadata !{metadata !4, metadata !6, metadata !7} -!4 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"x", i32 1, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ] -!5 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] -!6 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"y", i32 1, i64 64, i64 64, i64 64, i32 0, metadata !5} ; [ DW_TAG_member ] -!7 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"z", i32 1, i64 64, i64 64, i64 128, i32 0, metadata !5} ; [ DW_TAG_member ] -!8 = metadata !{i32 
459008, metadata !9, metadata !"t", metadata !0, i32 5, metadata !2} ; [ DW_TAG_auto_variable ] -!9 = metadata !{i32 458763, null, metadata !10, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] -!10 = metadata !{i32 458798, i32 0, metadata !0, metadata !"foo", metadata !"foo", metadata !"foo", i32 4, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] -!11 = metadata !{i32 458773, metadata !15, metadata !0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!4 = metadata !{metadata !"0xd\00x\001\0064\0064\000\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ] +!5 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !15, metadata !0} ; [ DW_TAG_base_type ] +!6 = metadata !{metadata !"0xd\00y\001\0064\0064\0064\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ] +!7 = metadata !{metadata !"0xd\00z\001\0064\0064\00128\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ] +!8 = metadata !{metadata !"0x100\00t\005\000", metadata !9, metadata !0, metadata !2} ; [ DW_TAG_auto_variable ] +!9 = metadata !{metadata !"0xb\000\000\000", null, metadata !10} ; [ DW_TAG_lexical_block ] +!10 = metadata !{metadata !"0x2e\00foo\00foo\00foo\004\000\001\000\006\000\000\000", i32 0, metadata !0, metadata !11, null, null, null, null, null} ; [ DW_TAG_subprogram ] +!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !15, metadata !0, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !12 = metadata !{metadata !13} -!13 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !15, metadata !0} ; [ DW_TAG_base_type 
] !14 = metadata !{%tart.reflect.ComplexType* @.type.SwitchStmtTest} !15 = metadata !{metadata !"sm.c", metadata !""} !16 = metadata !{i32 0} diff --git a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll index 5372bc5..60025bf 100644 --- a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll +++ b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll @@ -7,7 +7,7 @@ entry: %tmp1 = bitcast double %a to <8 x i8> %tmp2 = bitcast double %b to <8 x i8> %tmp3 = add <8 x i8> %tmp1, %tmp2 -; CHECK: paddw +; CHECK: paddb store <8 x i8> %tmp3, <8 x i8>* null ret void } @@ -18,7 +18,7 @@ entry: %tmp1 = bitcast double %a to <4 x i16> %tmp2 = bitcast double %b to <4 x i16> %tmp3 = add <4 x i16> %tmp1, %tmp2 -; CHECK: paddd +; CHECK: paddw store <4 x i16> %tmp3, <4 x i16>* null ret void } @@ -29,7 +29,7 @@ entry: %tmp1 = bitcast double %a to <2 x i32> %tmp2 = bitcast double %b to <2 x i32> %tmp3 = add <2 x i32> %tmp1, %tmp2 -; CHECK: paddq +; CHECK: paddd store <2 x i32> %tmp3, <2 x i32>* null ret void } diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll index 7faee99..1998011 100644 --- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll +++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll @@ -2,8 +2,7 @@ ; RUN: llc -mtriple=x86_64-pc-linux -O2 -regalloc=basic < %s | FileCheck %s ; Test to check .debug_loc support. This test case emits many debug_loc entries. 
-; CHECK: Loc expr size -; CHECK-NEXT: .short +; CHECK: .short {{.*}} # Loc expr size ; CHECK-NEXT: .Ltmp ; CHECK-NEXT: DW_OP_reg @@ -11,10 +10,10 @@ define hidden %0 @__divsc3(float %a, float %b, float %c, float %d) nounwind readnone { entry: - tail call void @llvm.dbg.value(metadata !{float %a}, i64 0, metadata !0) - tail call void @llvm.dbg.value(metadata !{float %b}, i64 0, metadata !11) - tail call void @llvm.dbg.value(metadata !{float %c}, i64 0, metadata !12) - tail call void @llvm.dbg.value(metadata !{float %d}, i64 0, metadata !13) + tail call void @llvm.dbg.value(metadata !{float %a}, i64 0, metadata !0, metadata !{metadata !"0x102"}) + tail call void @llvm.dbg.value(metadata !{float %b}, i64 0, metadata !11, metadata !{metadata !"0x102"}) + tail call void @llvm.dbg.value(metadata !{float %c}, i64 0, metadata !12, metadata !{metadata !"0x102"}) + tail call void @llvm.dbg.value(metadata !{float %d}, i64 0, metadata !13, metadata !{metadata !"0x102"}) %0 = tail call float @fabsf(float %c) nounwind readnone, !dbg !19 ; <float> [#uses=1] %1 = tail call float @fabsf(float %d) nounwind readnone, !dbg !19 ; <float> [#uses=1] %2 = fcmp olt float %0, %1, !dbg !19 ; <i1> [#uses=1] @@ -22,34 +21,34 @@ entry: bb: ; preds = %entry %3 = fdiv float %c, %d, !dbg !20 ; <float> [#uses=3] - tail call void @llvm.dbg.value(metadata !{float %3}, i64 0, metadata !16), !dbg !20 + tail call void @llvm.dbg.value(metadata !{float %3}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !20 %4 = fmul float %3, %c, !dbg !21 ; <float> [#uses=1] %5 = fadd float %4, %d, !dbg !21 ; <float> [#uses=2] - tail call void @llvm.dbg.value(metadata !{float %5}, i64 0, metadata !14), !dbg !21 + tail call void @llvm.dbg.value(metadata !{float %5}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !21 %6 = fmul float %3, %a, !dbg !22 ; <float> [#uses=1] %7 = fadd float %6, %b, !dbg !22 ; <float> [#uses=1] %8 = fdiv float %7, %5, !dbg !22 ; <float> [#uses=1] - tail call void 
@llvm.dbg.value(metadata !{float %8}, i64 0, metadata !17), !dbg !22 + tail call void @llvm.dbg.value(metadata !{float %8}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !22 %9 = fmul float %3, %b, !dbg !23 ; <float> [#uses=1] %10 = fsub float %9, %a, !dbg !23 ; <float> [#uses=1] %11 = fdiv float %10, %5, !dbg !23 ; <float> [#uses=1] - tail call void @llvm.dbg.value(metadata !{float %11}, i64 0, metadata !18), !dbg !23 + tail call void @llvm.dbg.value(metadata !{float %11}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !23 br label %bb2, !dbg !23 bb1: ; preds = %entry %12 = fdiv float %d, %c, !dbg !24 ; <float> [#uses=3] - tail call void @llvm.dbg.value(metadata !{float %12}, i64 0, metadata !16), !dbg !24 + tail call void @llvm.dbg.value(metadata !{float %12}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !24 %13 = fmul float %12, %d, !dbg !25 ; <float> [#uses=1] %14 = fadd float %13, %c, !dbg !25 ; <float> [#uses=2] - tail call void @llvm.dbg.value(metadata !{float %14}, i64 0, metadata !14), !dbg !25 + tail call void @llvm.dbg.value(metadata !{float %14}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !25 %15 = fmul float %12, %b, !dbg !26 ; <float> [#uses=1] %16 = fadd float %15, %a, !dbg !26 ; <float> [#uses=1] %17 = fdiv float %16, %14, !dbg !26 ; <float> [#uses=1] - tail call void @llvm.dbg.value(metadata !{float %17}, i64 0, metadata !17), !dbg !26 + tail call void @llvm.dbg.value(metadata !{float %17}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !26 %18 = fmul float %12, %a, !dbg !27 ; <float> [#uses=1] %19 = fsub float %b, %18, !dbg !27 ; <float> [#uses=1] %20 = fdiv float %19, %14, !dbg !27 ; <float> [#uses=1] - tail call void @llvm.dbg.value(metadata !{float %20}, i64 0, metadata !18), !dbg !27 + tail call void @llvm.dbg.value(metadata !{float %20}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !27 br label %bb2, !dbg !27 bb2: ; preds = %bb1, %bb @@ -75,9 +74,9 @@ bb6: ; preds = 
%bb4 bb8: ; preds = %bb6 %27 = tail call float @copysignf(float 0x7FF0000000000000, float %c) nounwind readnone, !dbg !30 ; <float> [#uses=2] %28 = fmul float %27, %a, !dbg !30 ; <float> [#uses=1] - tail call void @llvm.dbg.value(metadata !{float %28}, i64 0, metadata !17), !dbg !30 + tail call void @llvm.dbg.value(metadata !{float %28}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !30 %29 = fmul float %27, %b, !dbg !31 ; <float> [#uses=1] - tail call void @llvm.dbg.value(metadata !{float %29}, i64 0, metadata !18), !dbg !31 + tail call void @llvm.dbg.value(metadata !{float %29}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !31 br label %bb46, !dbg !31 bb9: ; preds = %bb6, %bb4 @@ -107,24 +106,24 @@ bb15: ; preds = %bb14 bb16: ; preds = %bb15 %iftmp.0.0 = select i1 %33, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1] %42 = tail call float @copysignf(float %iftmp.0.0, float %a) nounwind readnone, !dbg !33 ; <float> [#uses=2] - tail call void @llvm.dbg.value(metadata !{float %42}, i64 0, metadata !0), !dbg !33 + tail call void @llvm.dbg.value(metadata !{float %42}, i64 0, metadata !0, metadata !{metadata !"0x102"}), !dbg !33 %43 = fcmp ord float %b, 0.000000e+00 ; <i1> [#uses=1] %44 = fsub float %b, %b, !dbg !34 ; <float> [#uses=1] %45 = fcmp uno float %44, 0.000000e+00 ; <i1> [#uses=1] %46 = and i1 %43, %45, !dbg !34 ; <i1> [#uses=1] %iftmp.1.0 = select i1 %46, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1] %47 = tail call float @copysignf(float %iftmp.1.0, float %b) nounwind readnone, !dbg !34 ; <float> [#uses=2] - tail call void @llvm.dbg.value(metadata !{float %47}, i64 0, metadata !11), !dbg !34 + tail call void @llvm.dbg.value(metadata !{float %47}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !34 %48 = fmul float %42, %c, !dbg !35 ; <float> [#uses=1] %49 = fmul float %47, %d, !dbg !35 ; <float> [#uses=1] %50 = fadd float %48, %49, !dbg !35 ; <float> [#uses=1] %51 = fmul float %50, 
0x7FF0000000000000, !dbg !35 ; <float> [#uses=1] - tail call void @llvm.dbg.value(metadata !{float %51}, i64 0, metadata !17), !dbg !35 + tail call void @llvm.dbg.value(metadata !{float %51}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !35 %52 = fmul float %47, %c, !dbg !36 ; <float> [#uses=1] %53 = fmul float %42, %d, !dbg !36 ; <float> [#uses=1] %54 = fsub float %52, %53, !dbg !36 ; <float> [#uses=1] %55 = fmul float %54, 0x7FF0000000000000, !dbg !36 ; <float> [#uses=1] - tail call void @llvm.dbg.value(metadata !{float %55}, i64 0, metadata !18), !dbg !36 + tail call void @llvm.dbg.value(metadata !{float %55}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !36 br label %bb46, !dbg !36 bb27: ; preds = %bb15, %bb14, %bb11 @@ -155,24 +154,24 @@ bb34: ; preds = %bb33, %bb30 bb35: ; preds = %bb34 %iftmp.2.0 = select i1 %59, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1] %67 = tail call float @copysignf(float %iftmp.2.0, float %c) nounwind readnone, !dbg !38 ; <float> [#uses=2] - tail call void @llvm.dbg.value(metadata !{float %67}, i64 0, metadata !12), !dbg !38 + tail call void @llvm.dbg.value(metadata !{float %67}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !38 %68 = fcmp ord float %d, 0.000000e+00 ; <i1> [#uses=1] %69 = fsub float %d, %d, !dbg !39 ; <float> [#uses=1] %70 = fcmp uno float %69, 0.000000e+00 ; <i1> [#uses=1] %71 = and i1 %68, %70, !dbg !39 ; <i1> [#uses=1] %iftmp.3.0 = select i1 %71, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1] %72 = tail call float @copysignf(float %iftmp.3.0, float %d) nounwind readnone, !dbg !39 ; <float> [#uses=2] - tail call void @llvm.dbg.value(metadata !{float %72}, i64 0, metadata !13), !dbg !39 + tail call void @llvm.dbg.value(metadata !{float %72}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !39 %73 = fmul float %67, %a, !dbg !40 ; <float> [#uses=1] %74 = fmul float %72, %b, !dbg !40 ; <float> [#uses=1] %75 = fadd float %73, %74, !dbg !40 ; 
<float> [#uses=1] %76 = fmul float %75, 0.000000e+00, !dbg !40 ; <float> [#uses=1] - tail call void @llvm.dbg.value(metadata !{float %76}, i64 0, metadata !17), !dbg !40 + tail call void @llvm.dbg.value(metadata !{float %76}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !40 %77 = fmul float %67, %b, !dbg !41 ; <float> [#uses=1] %78 = fmul float %72, %a, !dbg !41 ; <float> [#uses=1] %79 = fsub float %77, %78, !dbg !41 ; <float> [#uses=1] %80 = fmul float %79, 0.000000e+00, !dbg !41 ; <float> [#uses=1] - tail call void @llvm.dbg.value(metadata !{float %80}, i64 0, metadata !18), !dbg !41 + tail call void @llvm.dbg.value(metadata !{float %80}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !41 br label %bb46, !dbg !41 bb46: ; preds = %bb35, %bb34, %bb33, %bb30, %bb16, %bb8, %bb2 @@ -196,30 +195,30 @@ declare float @fabsf(float) declare float @copysignf(float, float) nounwind readnone -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!48} -!0 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ] -!1 = metadata !{i32 786478, metadata !45, metadata !2, metadata !"__divsc3", metadata !"__divsc3", metadata !"__divsc3", i32 1922, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43, i32 1922} ; [ DW_TAG_subprogram ] -!2 = metadata !{i32 786473, metadata !45} ; [ DW_TAG_file_type ] -!3 = metadata !{i32 786449, metadata !45, i32 1, metadata !"4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !47, metadata !47, metadata !44, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!4 = metadata !{i32 786453, metadata !45, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!0 = metadata !{metadata !"0x101\00a\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ] +!1 = metadata !{metadata !"0x2e\00__divsc3\00__divsc3\00__divsc3\001922\000\001\000\006\000\001\001922", metadata !45, metadata !2, metadata !4, null, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43} ; [ DW_TAG_subprogram ] +!2 = metadata !{metadata !"0x29", metadata !45} ; [ DW_TAG_file_type ] +!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !45, metadata !47, metadata !47, metadata !44, null, null} ; [ DW_TAG_compile_unit ] +!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !45, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !5 = metadata !{metadata !6, metadata !9, metadata !9, metadata !9, metadata !9} -!6 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SCtype", i32 170, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ] -!7 = metadata !{i32 786473, metadata !46} ; [ DW_TAG_file_type ] -!8 = metadata !{i32 786468, metadata !45, metadata !2, metadata !"complex float", i32 0, i64 64, i64 32, i64 0, i32 0, i32 3} ; [ DW_TAG_base_type ] -!9 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SFtype", i32 167, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ] -!10 = metadata !{i32 786468, metadata !45, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] -!11 = metadata !{i32 786689, metadata !1, metadata 
!"b", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ] -!12 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ] -!13 = metadata !{i32 786689, metadata !1, metadata !"d", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ] -!14 = metadata !{i32 786688, metadata !15, metadata !"denom", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ] -!15 = metadata !{i32 786443, metadata !45, metadata !1, i32 1922, i32 0, i32 0} ; [ DW_TAG_lexical_block ] -!16 = metadata !{i32 786688, metadata !15, metadata !"ratio", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ] -!17 = metadata !{i32 786688, metadata !15, metadata !"x", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ] -!18 = metadata !{i32 786688, metadata !15, metadata !"y", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ] +!6 = metadata !{metadata !"0x16\00SCtype\00170\000\000\000\000", metadata !46, metadata !7, metadata !8} ; [ DW_TAG_typedef ] +!7 = metadata !{metadata !"0x29", metadata !46} ; [ DW_TAG_file_type ] +!8 = metadata !{metadata !"0x24\00complex float\000\0064\0032\000\000\003", metadata !45, metadata !2} ; [ DW_TAG_base_type ] +!9 = metadata !{metadata !"0x16\00SFtype\00167\000\000\000\000", metadata !46, metadata !7, metadata !10} ; [ DW_TAG_typedef ] +!10 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", metadata !45, metadata !2} ; [ DW_TAG_base_type ] +!11 = metadata !{metadata !"0x101\00b\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ] +!12 = metadata !{metadata !"0x101\00c\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ] +!13 = metadata !{metadata !"0x101\00d\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ] +!14 = metadata !{metadata !"0x100\00denom\001923\000", metadata !15, 
metadata !2, metadata !9} ; [ DW_TAG_auto_variable ] +!15 = metadata !{metadata !"0xb\001922\000\000", metadata !45, metadata !1} ; [ DW_TAG_lexical_block ] +!16 = metadata !{metadata !"0x100\00ratio\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ] +!17 = metadata !{metadata !"0x100\00x\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ] +!18 = metadata !{metadata !"0x100\00y\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ] !19 = metadata !{i32 1929, i32 0, metadata !15, null} !20 = metadata !{i32 1931, i32 0, metadata !15, null} !21 = metadata !{i32 1932, i32 0, metadata !15, null} @@ -249,4 +248,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone !45 = metadata !{metadata !"libgcc2.c", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"} !46 = metadata !{metadata !"libgcc2.h", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"} !47 = metadata !{i32 0} -!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll index e11b538..09120a1 100644 --- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll +++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll @@ -9,7 +9,7 @@ target triple = "x86_64-apple-darwin10" define i8* @bar(%struct.a* %myvar) nounwind optsize noinline ssp { entry: - tail call void @llvm.dbg.value(metadata !{%struct.a* %myvar}, i64 0, metadata !8) + tail call void @llvm.dbg.value(metadata !{%struct.a* %myvar}, i64 0, metadata !8, metadata !{metadata !"0x102"}) %0 = getelementptr inbounds %struct.a* %myvar, i64 0, i32 0, !dbg !28 ; <i32*> [#uses=1] %1 = load i32* %0, align 8, !dbg !28 ; <i32> [#uses=1] tail call void @foo(i32 %1) nounwind optsize noinline ssp, !dbg !28 @@ -19,41 +19,41 @@ entry: declare void @foo(i32) nounwind optsize noinline ssp -declare void 
@llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!38} -!0 = metadata !{i32 786484, i32 0, metadata !1, metadata !"ret", metadata !"ret", metadata !"", metadata !1, i32 7, metadata !3, i1 false, i1 true, null, null} ; [ DW_TAG_variable ] -!1 = metadata !{i32 786473, metadata !36} ; [ DW_TAG_file_type ] -!2 = metadata !{i32 786449, metadata !36, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !37, metadata !37, metadata !32, metadata !31, metadata !37, metadata !""} ; [ DW_TAG_compile_unit ] -!3 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!4 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !1, i32 12, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ] -!5 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, void (i32)* @foo, null, null, metadata !33, i32 13} ; [ DW_TAG_subprogram ] -!6 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!0 = metadata !{metadata !"0x34\00ret\00ret\00\007\000\001", metadata !1, metadata !1, metadata !3, null, null} ; [ DW_TAG_variable ] +!1 = metadata !{metadata !"0x29", metadata !36} ; [ DW_TAG_file_type ] +!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. 
build 5658) (LLVM build)\001\00\000\00\001", metadata !36, metadata !37, metadata !37, metadata !32, metadata !31, metadata !37} ; [ DW_TAG_compile_unit ] +!3 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !36, metadata !1} ; [ DW_TAG_base_type ] +!4 = metadata !{metadata !"0x101\00x\0012\000", metadata !5, metadata !1, metadata !3} ; [ DW_TAG_arg_variable ] +!5 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0013\000\001\000\006\000\001\0013", metadata !36, metadata !1, metadata !6, null, void (i32)* @foo, null, null, metadata !33} ; [ DW_TAG_subprogram ] +!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !7 = metadata !{null, metadata !3} -!8 = metadata !{i32 786689, metadata !9, metadata !"myvar", metadata !1, i32 17, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ] -!9 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 17, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i8* (%struct.a*)* @bar, null, null, metadata !34, i32 17} ; [ DW_TAG_subprogram ] -!10 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{metadata !"0x101\00myvar\0017\000", metadata !9, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ] +!9 = metadata !{metadata !"0x2e\00bar\00bar\00bar\0017\000\001\000\006\000\001\0017", metadata !36, metadata !1, metadata !10, null, i8* (%struct.a*)* @bar, null, null, metadata !34} ; [ DW_TAG_subprogram ] +!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !11 = metadata 
!{metadata !12, metadata !13} -!12 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] -!13 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ] -!14 = metadata !{i32 786451, metadata !36, metadata !1, metadata !"a", i32 2, i64 128, i64 64, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ] +!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, null} ; [ DW_TAG_pointer_type ] +!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !14} ; [ DW_TAG_pointer_type ] +!14 = metadata !{metadata !"0x13\00a\002\00128\0064\000\000\000", metadata !36, metadata !1, null, metadata !15, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ] !15 = metadata !{metadata !16, metadata !17} -!16 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"c", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !3} ; [ DW_TAG_member ] -!17 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"d", i32 4, i64 64, i64 64, i64 64, i32 0, metadata !13} ; [ DW_TAG_member ] -!18 = metadata !{i32 786689, metadata !19, metadata !"argc", metadata !1, i32 22, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ] -!19 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 22, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, metadata !35, i32 22} ; [ DW_TAG_subprogram ] -!20 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!16 = metadata !{metadata 
!"0xd\00c\003\0032\0032\000\000", metadata !36, metadata !14, metadata !3} ; [ DW_TAG_member ] +!17 = metadata !{metadata !"0xd\00d\004\0064\0064\0064\000", metadata !36, metadata !14, metadata !13} ; [ DW_TAG_member ] +!18 = metadata !{metadata !"0x101\00argc\0022\000", metadata !19, metadata !1, metadata !3} ; [ DW_TAG_arg_variable ] +!19 = metadata !{metadata !"0x2e\00main\00main\00main\0022\000\001\000\006\000\001\0022", metadata !36, metadata !1, metadata !20, null, null, null, null, metadata !35} ; [ DW_TAG_subprogram ] +!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !21 = metadata !{metadata !3, metadata !3, metadata !22} -!22 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ] -!23 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ] -!24 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] -!25 = metadata !{i32 786689, metadata !19, metadata !"argv", metadata !1, i32 22, metadata !22, i32 0, null} ; [ DW_TAG_arg_variable ] -!26 = metadata !{i32 786688, metadata !27, metadata !"e", metadata !1, i32 23, metadata !14, i32 0, null} ; [ DW_TAG_auto_variable ] -!27 = metadata !{i32 786443, metadata !36, metadata !19, i32 22, i32 0, i32 0} ; [ DW_TAG_lexical_block ] +!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !23} ; [ DW_TAG_pointer_type ] +!23 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !24} ; [ DW_TAG_pointer_type ] +!24 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !36, metadata !1} ; [ DW_TAG_base_type ] +!25 = metadata 
!{metadata !"0x101\00argv\0022\000", metadata !19, metadata !1, metadata !22} ; [ DW_TAG_arg_variable ] +!26 = metadata !{metadata !"0x100\00e\0023\000", metadata !27, metadata !1, metadata !14} ; [ DW_TAG_auto_variable ] +!27 = metadata !{metadata !"0xb\0022\000\000", metadata !36, metadata !19} ; [ DW_TAG_lexical_block ] !28 = metadata !{i32 18, i32 0, metadata !29, null} -!29 = metadata !{i32 786443, metadata !36, metadata !9, i32 17, i32 0, i32 1} ; [ DW_TAG_lexical_block ] +!29 = metadata !{metadata !"0xb\0017\000\001", metadata !36, metadata !9} ; [ DW_TAG_lexical_block ] !30 = metadata !{i32 19, i32 0, metadata !29, null} !31 = metadata !{metadata !0} !32 = metadata !{metadata !5, metadata !9, metadata !19} @@ -73,18 +73,22 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone ; CHECK: Ldebug_loc0: -; CHECK-NEXT: .quad Lfunc_begin0 -; CHECK-NEXT: .quad [[LABEL]] +; CHECK-NEXT: [[SET1:.*]] = Lfunc_begin0-Lfunc_begin0 +; CHECK-NEXT: .quad [[SET1]] +; CHECK-NEXT: [[SET2:.*]] = [[LABEL]]-Lfunc_begin0 +; CHECK-NEXT: .quad [[SET2]] ; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}} ## Loc expr size ; CHECK-NEXT: .short Lset{{.*}} ; CHECK-NEXT: Ltmp{{.*}}: ; CHECK-NEXT: .byte 85 ; CHECK-NEXT: Ltmp{{.*}}: -; CHECK-NEXT: .quad [[LABEL]] -; CHECK-NEXT: .quad [[CLOBBER]] +; CHECK-NEXT: [[SET3:.*]] = [[LABEL]]-Lfunc_begin0 +; CHECK-NEXT: .quad [[SET3]] +; CHECK-NEXT: [[SET4:.*]] = [[CLOBBER]]-Lfunc_begin0 +; CHECK-NEXT: .quad [[SET4]] ; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}} ## Loc expr size ; CHECK-NEXT: .short Lset{{.*}} ; CHECK-NEXT: Ltmp{{.*}}: ; CHECK-NEXT: .byte 83 ; CHECK-NEXT: Ltmp{{.*}}: -!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll index 1114c8d..b0a4e8d 100644 --- a/test/CodeGen/X86/2010-05-28-Crash.ll +++ b/test/CodeGen/X86/2010-05-28-Crash.ll @@ -4,19 +4,19 @@ 
define i32 @foo(i32 %y) nounwind optsize ssp { entry: - tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !0) + tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !0, metadata !{metadata !"0x102"}) %0 = tail call i32 (...)* @zoo(i32 %y) nounwind, !dbg !9 ; <i32> [#uses=1] ret i32 %0, !dbg !9 } declare i32 @zoo(...) -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone define i32 @bar(i32 %x) nounwind optsize ssp { entry: - tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !7) - tail call void @llvm.dbg.value(metadata !11, i64 0, metadata !0) nounwind + tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !7, metadata !{metadata !"0x102"}) + tail call void @llvm.dbg.value(metadata !11, i64 0, metadata !0, metadata !{metadata !"0x102"}) nounwind %0 = tail call i32 (...)* @zoo(i32 1) nounwind, !dbg !12 ; <i32> [#uses=1] %1 = add nsw i32 %0, %x, !dbg !13 ; <i32> [#uses=1] ret i32 %1, !dbg !13 @@ -25,21 +25,21 @@ entry: !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!20} -!0 = metadata !{i32 786689, metadata !1, metadata !"y", metadata !2, i32 2, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ] -!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @foo, null, null, metadata !15, i32 2} ; [ DW_TAG_subprogram ] -!2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ] -!3 = metadata !{i32 786449, metadata !18, i32 1, metadata !"4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!0 = metadata !{metadata !"0x101\00y\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ] +!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\001\002", metadata !18, metadata !2, metadata !4, null, i32 (i32)* @foo, null, null, metadata !15} ; [ DW_TAG_subprogram ] +!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ] +!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !18, metadata !19, metadata !19, metadata !17, null, null} ; [ DW_TAG_compile_unit ] +!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !5 = metadata !{metadata !6, metadata !6} -!6 = metadata !{i32 786468, metadata !18, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!7 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !2, i32 6, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ] -!8 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"bar", metadata !"bar", metadata !"bar", i32 6, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @bar, null, null, metadata !16, i32 6} ; [ DW_TAG_subprogram ] +!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !18, metadata !2} ; [ DW_TAG_base_type ] +!7 = metadata !{metadata !"0x101\00x\006\000", metadata !8, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ] +!8 = metadata !{metadata 
!"0x2e\00bar\00bar\00bar\006\000\001\000\006\000\001\006", metadata !18, metadata !2, metadata !4, null, i32 (i32)* @bar, null, null, metadata !16} ; [ DW_TAG_subprogram ] !9 = metadata !{i32 3, i32 0, metadata !10, null} -!10 = metadata !{i32 786443, metadata !18, metadata !1, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ] +!10 = metadata !{metadata !"0xb\002\000\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ] !11 = metadata !{i32 1} !12 = metadata !{i32 3, i32 0, metadata !10, metadata !13} !13 = metadata !{i32 7, i32 0, metadata !14, null} -!14 = metadata !{i32 786443, metadata !18, metadata !8, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ] +!14 = metadata !{metadata !"0xb\006\000\000", metadata !18, metadata !8} ; [ DW_TAG_lexical_block ] !15 = metadata !{metadata !0} !16 = metadata !{metadata !7} !17 = metadata !{metadata !1, metadata !8} @@ -49,4 +49,4 @@ entry: ;CHECK: DEBUG_VALUE: bar:x <- E ;CHECK: Ltmp ;CHECK: DEBUG_VALUE: foo:y <- 1{{$}} -!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll index 4181c26..dea9162 100644 --- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll +++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll @@ -10,51 +10,51 @@ target triple = "x86_64-apple-darwin10.2" define i32 @_ZN3foo3bazEi(%struct.foo* nocapture %this, i32 %x) nounwind readnone optsize noinline ssp align 2 { ;CHECK: DEBUG_VALUE: baz:this <- RDI{{$}} entry: - tail call void @llvm.dbg.value(metadata !{%struct.foo* %this}, i64 0, metadata !15) - tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !16) + tail call void @llvm.dbg.value(metadata !{%struct.foo* %this}, i64 0, metadata !15, metadata !{metadata !"0x102"}) + tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !16, metadata !{metadata !"0x102"}) %0 = mul nsw i32 %x, 7, !dbg !29 ; <i32> 
[#uses=1] %1 = add nsw i32 %0, 1, !dbg !29 ; <i32> [#uses=1] ret i32 %1, !dbg !29 } -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!4} !llvm.module.flags = !{!34} !llvm.dbg.lv = !{!0, !14, !15, !16, !17, !24, !25, !28} -!0 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !3, i32 11, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ] -!1 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEi", i32 11, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* null, null, null, null, i32 11} ; [ DW_TAG_subprogram ] -!2 = metadata !{i32 786451, metadata !31, metadata !3, metadata !"foo", i32 3, i64 32, i64 32, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ] -!3 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ] -!4 = metadata !{i32 786449, metadata !31, i32 4, metadata !"4.2.1 LLVM build", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !33, null, null, metadata !""} ; [ DW_TAG_compile_unit ] +!0 = metadata !{metadata !"0x101\00this\0011\000", metadata !1, metadata !3, metadata !12} ; [ DW_TAG_arg_variable ] +!1 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEi\0011\000\001\000\006\000\001\0011", metadata !31, metadata !2, metadata !9, null, i32 (%struct.foo*, i32)* null, null, null, null} ; [ DW_TAG_subprogram ] +!2 = metadata !{metadata !"0x13\00foo\003\0032\0032\000\000\000", metadata !31, metadata !3, null, metadata !5, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ] +!3 = metadata !{metadata !"0x29", metadata !31} ; [ DW_TAG_file_type ] +!4 = metadata !{metadata !"0x11\004\004.2.1 LLVM build\001\00\000\00\000", metadata !31, metadata !32, metadata 
!32, metadata !33, null, null} ; [ DW_TAG_compile_unit ] !5 = metadata !{metadata !6, metadata !1, metadata !8} -!6 = metadata !{i32 786445, metadata !31, metadata !2, metadata !"y", i32 8, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] -!7 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!8 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"baz", metadata !"baz", metadata !"_ZN3foo3bazEi", i32 15, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null, i32 15} ; [ DW_TAG_subprogram ] -!9 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!6 = metadata !{metadata !"0xd\00y\008\0032\0032\000\000", metadata !31, metadata !2, metadata !7} ; [ DW_TAG_member ] +!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !31, metadata !3} ; [ DW_TAG_base_type ] +!8 = metadata !{metadata !"0x2e\00baz\00baz\00_ZN3foo3bazEi\0015\000\001\000\006\000\001\0015", metadata !31, metadata !2, metadata !9, null, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null} ; [ DW_TAG_subprogram ] +!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !3, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !10 = metadata !{metadata !7, metadata !11, metadata !7} -!11 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !2} ; [ DW_TAG_pointer_type ] -!12 = metadata !{i32 786470, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !13} ; [ DW_TAG_const_type ] -!13 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 
64, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_pointer_type ] -!14 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !3, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ] -!15 = metadata !{i32 786689, metadata !8, metadata !"this", metadata !3, i32 15, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ] -!16 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !3, i32 15, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ] -!17 = metadata !{i32 786689, metadata !18, metadata !"argc", metadata !3, i32 19, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ] -!18 = metadata !{i32 786478, metadata !31, metadata !3, metadata !"main", metadata !"main", metadata !"main", i32 19, metadata !19, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, null, null, null, null, i32 19} ; [ DW_TAG_subprogram ] -!19 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !31, metadata !3, metadata !2} ; [ DW_TAG_pointer_type ] +!12 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !31, metadata !3, metadata !13} ; [ DW_TAG_const_type ] +!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !2} ; [ DW_TAG_pointer_type ] +!14 = metadata !{metadata !"0x101\00x\0011\000", metadata !1, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ] +!15 = metadata !{metadata !"0x101\00this\0015\000", metadata !8, metadata !3, metadata !12} ; [ DW_TAG_arg_variable ] +!16 = metadata !{metadata !"0x101\00x\0015\000", metadata !8, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ] +!17 = metadata !{metadata !"0x101\00argc\0019\000", metadata !18, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ] +!18 = metadata !{metadata 
!"0x2e\00main\00main\00main\0019\000\001\000\006\000\001\0019", metadata !31, metadata !3, metadata !19, null, null, null, null, null} ; [ DW_TAG_subprogram ] +!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !3, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !20 = metadata !{metadata !7, metadata !7, metadata !21} -!21 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ] -!22 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ] -!23 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] -!24 = metadata !{i32 786689, metadata !18, metadata !"argv", metadata !3, i32 19, metadata !21, i32 0, null} ; [ DW_TAG_arg_variable ] -!25 = metadata !{i32 786688, metadata !26, metadata !"a", metadata !3, i32 20, metadata !2, i32 0, null} ; [ DW_TAG_auto_variable ] -!26 = metadata !{i32 786443, metadata !31, metadata !27, i32 19, i32 0, i32 0} ; [ DW_TAG_lexical_block ] -!27 = metadata !{i32 786443, metadata !31, metadata !18, i32 19, i32 0, i32 0} ; [ DW_TAG_lexical_block ] -!28 = metadata !{i32 786688, metadata !26, metadata !"b", metadata !3, i32 21, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ] +!21 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !22} ; [ DW_TAG_pointer_type ] +!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !23} ; [ DW_TAG_pointer_type ] +!23 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !31, metadata !3} ; [ DW_TAG_base_type ] +!24 = metadata !{metadata !"0x101\00argv\0019\000", metadata !18, metadata !3, metadata !21} ; [ DW_TAG_arg_variable ] +!25 = metadata !{metadata 
!"0x100\00a\0020\000", metadata !26, metadata !3, metadata !2} ; [ DW_TAG_auto_variable ] +!26 = metadata !{metadata !"0xb\0019\000\000", metadata !31, metadata !27} ; [ DW_TAG_lexical_block ] +!27 = metadata !{metadata !"0xb\0019\000\000", metadata !31, metadata !18} ; [ DW_TAG_lexical_block ] +!28 = metadata !{metadata !"0x100\00b\0021\000", metadata !26, metadata !3, metadata !7} ; [ DW_TAG_auto_variable ] !29 = metadata !{i32 16, i32 0, metadata !30, null} -!30 = metadata !{i32 786443, metadata !31, metadata !8, i32 15, i32 0, i32 0} ; [ DW_TAG_lexical_block ] +!30 = metadata !{metadata !"0xb\0015\000\000", metadata !31, metadata !8} ; [ DW_TAG_lexical_block ] !31 = metadata !{metadata !"foo.cp", metadata !"/tmp/"} !32 = metadata !{i32 0} !33 = metadata !{metadata !1, metadata !8, metadata !18} -!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll index b49aec3..9d65dc1 100644 --- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll +++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll @@ -3,29 +3,29 @@ @.str = private constant [4 x i8] c"one\00", align 1 ; <[4 x i8]*> [#uses=1] @.str1 = private constant [4 x i8] c"two\00", align 1 ; <[5 x i8]*> [#uses=1] @C.9.2167 = internal constant [2 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0)] -!38 = metadata !{i32 524329, metadata !109} ; [ DW_TAG_file_type ] -!39 = metadata !{i32 524305, metadata !109, i32 1, metadata !"4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build 9999)", i1 true, metadata !"", i32 0, metadata !108, metadata !108, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!46 = metadata !{i32 524303, metadata !109, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !47} ; [ DW_TAG_pointer_type ] -!47 = metadata !{i32 524324, metadata !109, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] -!97 = metadata !{i32 524334, i32 0, metadata !39, metadata !"main", metadata !"main", metadata !"main", i32 73, metadata !98, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] -!98 = metadata !{i32 524309, metadata !109, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !99, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!38 = metadata !{metadata !"0x29", metadata !109} ; [ DW_TAG_file_type ] +!39 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. 
build 5658) (LLVM build 9999)\001\00\000\00\000", metadata !109, metadata !108, metadata !108, null, null, null} ; [ DW_TAG_compile_unit ] +!46 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !109, null, metadata !47} ; [ DW_TAG_pointer_type ] +!47 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !109, null} ; [ DW_TAG_base_type ] +!97 = metadata !{metadata !"0x2e\00main\00main\00main\0073\000\001\000\006\000\000\000", i32 0, metadata !39, metadata !98, null, null, null, null, null} ; [ DW_TAG_subprogram ] +!98 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !109, null, null, metadata !99, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !99 = metadata !{metadata !100} -!100 = metadata !{i32 524324, metadata !109, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!100 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !109, null} ; [ DW_TAG_base_type ] !101 = metadata !{[2 x i8*]* @C.9.2167} -!102 = metadata !{i32 524544, metadata !103, metadata !"find_strings", metadata !38, i32 75, metadata !104, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!103 = metadata !{i32 524299, null, metadata !97, i32 73, i32 0, i32 0} ; [ DW_TAG_lexical_block ] -!104 = metadata !{i32 524289, metadata !109, null, metadata !"", i32 0, i64 85312, i64 64, i64 0, i32 0, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ] +!102 = metadata !{metadata !"0x100\00find_strings\0075\000", metadata !103, metadata !38, metadata !104} ; [ DW_TAG_auto_variable ] +!103 = metadata !{metadata !"0xb\0073\000\000", null, metadata !97} ; [ DW_TAG_lexical_block ] +!104 = metadata !{metadata !"0x1\00\000\0085312\0064\000\000", metadata !109, null, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ] !105 = 
metadata !{metadata !106} -!106 = metadata !{i32 524321, i64 0, i64 1333} ; [ DW_TAG_subrange_type ] +!106 = metadata !{metadata !"0x21\000\001333"} ; [ DW_TAG_subrange_type ] !107 = metadata !{i32 73, i32 0, metadata !103, null} !108 = metadata !{i32 0} !109 = metadata !{metadata !"pbmsrch.c", metadata !"/Users/grawp/LLVM/test-suite/MultiSource/Benchmarks/MiBench/office-stringsearch"} define i32 @main() nounwind ssp { bb.nph: - tail call void @llvm.dbg.declare(metadata !101, metadata !102), !dbg !107 + tail call void @llvm.dbg.declare(metadata !101, metadata !102, metadata !{metadata !"0x102"}), !dbg !107 ret i32 0, !dbg !107 } -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll index 09e34ef..a613939 100644 --- a/test/CodeGen/X86/2010-08-04-StackVariable.ll +++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll @@ -6,8 +6,8 @@ define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp { entry: %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] - call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23), !dbg !24 - call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25), !dbg !24 + call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !24 + call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !24 %0 = icmp ne i32 %i, 0, !dbg !27 ; <i1> [#uses=1] br i1 %0, label %bb, label %bb1, !dbg !27 @@ -34,7 +34,7 @@ return: ; preds = %bb2 define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2 { entry: %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] - call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31), !dbg !34 + call void 
@llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !34 %0 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 0, !dbg !34 ; <i8**> [#uses=1] store i8* null, i8** %0, align 8, !dbg !34 %1 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 1, !dbg !34 ; <i32*> [#uses=1] @@ -45,14 +45,14 @@ return: ; preds = %entry ret void, !dbg !35 } -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define i32 @main() nounwind ssp { entry: %0 = alloca %struct.SVal ; <%struct.SVal*> [#uses=3] %v = alloca %struct.SVal ; <%struct.SVal*> [#uses=4] %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] - call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38), !dbg !41 + call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38, metadata !{metadata !"0x102"}), !dbg !41 call void @_ZN4SValC1Ev(%struct.SVal* %v) nounwind, !dbg !41 %1 = getelementptr inbounds %struct.SVal* %v, i32 0, i32 1, !dbg !42 ; <i32*> [#uses=1] store i32 1, i32* %1, align 8, !dbg !42 @@ -65,65 +65,65 @@ entry: %7 = load i32* %6, align 8, !dbg !43 ; <i32> [#uses=1] store i32 %7, i32* %5, align 8, !dbg !43 %8 = call i32 @_Z3fooi4SVal(i32 2, %struct.SVal* noalias %0) nounwind, !dbg !43 ; <i32> [#uses=0] - call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44), !dbg !43 + call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44, metadata !{metadata !"0x102"}), !dbg !43 br label %return, !dbg !45 return: ; preds = %entry ret i32 0, !dbg !45 } -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!49} !46 = metadata !{metadata !16, metadata !17, metadata !20} -!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", 
i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 11} ; [ DW_TAG_subprogram ] -!1 = metadata !{i32 786451, metadata !47, metadata !2, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ] -!2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ] -!3 = metadata !{i32 786449, metadata !47, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !48, metadata !48, metadata !46, null, null, metadata !""} ; [ DW_TAG_compile_unit ] +!0 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0011\000\000\000\006\000\000\0011", metadata !47, metadata !1, metadata !14, null, null, null, null, null} ; [ DW_TAG_subprogram ] +!1 = metadata !{metadata !"0x13\00SVal\001\00128\0064\000\000\000", metadata !47, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ] +!2 = metadata !{metadata !"0x29", metadata !47} ; [ DW_TAG_file_type ] +!3 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. 
build 5658) (LLVM build)\000\00\000\00\001", metadata !47, metadata !48, metadata !48, metadata !46, null, null} ; [ DW_TAG_compile_unit ] !4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9} -!5 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Data", i32 7, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ] -!6 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] -!7 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Kind", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ] -!8 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] -!9 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 12, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 12} ; [ DW_TAG_subprogram ] -!10 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!5 = metadata !{metadata !"0xd\00Data\007\0064\0064\000\000", metadata !47, metadata !1, metadata !6} ; [ DW_TAG_member ] +!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !47, metadata !2, null} ; [ DW_TAG_pointer_type ] +!7 = metadata !{metadata !"0xd\00Kind\008\0032\0032\0064\000", metadata !47, metadata !1, metadata !8} ; [ DW_TAG_member ] +!8 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !47, metadata !2} ; [ DW_TAG_base_type ] +!9 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00\0012\000\000\000\006\000\000\0012", metadata !47, metadata !1, metadata !10, null, null, null, null, null} ; [ DW_TAG_subprogram ] +!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata 
!47, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !11 = metadata !{null, metadata !12, metadata !13} -!12 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ] -!13 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!14 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ] +!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !47, metadata !2} ; [ DW_TAG_base_type ] +!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !15 = metadata !{null, metadata !12} -!16 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"_ZN4SValC1Ev", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null, i32 11} ; [ DW_TAG_subprogram ] -!17 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3fooi4SVal", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null, i32 16} ; [ DW_TAG_subprogram ] -!18 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!16 = metadata 
!{metadata !"0x2e\00SVal\00SVal\00_ZN4SValC1Ev\0011\000\001\000\006\000\000\0011", metadata !47, metadata !1, metadata !14, null, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null} ; [ DW_TAG_subprogram ] +!17 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi4SVal\0016\000\001\000\006\000\000\0016", metadata !47, metadata !2, metadata !18, null, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null} ; [ DW_TAG_subprogram ] +!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !19 = metadata !{metadata !13, metadata !13, metadata !1} -!20 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"main", metadata !"main", metadata !"main", i32 23, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @main, null, null, null, i32 23} ; [ DW_TAG_subprogram ] -!21 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!20 = metadata !{metadata !"0x2e\00main\00main\00main\0023\000\001\000\006\000\000\0023", metadata !47, metadata !2, metadata !21, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ] +!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !22 = metadata !{metadata !13} -!23 = metadata !{i32 786689, metadata !17, metadata !"i", metadata !2, i32 16, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ] +!23 = metadata !{metadata !"0x101\00i\0016\000", metadata !17, metadata !2, metadata !13} ; [ DW_TAG_arg_variable ] !24 = metadata !{i32 16, i32 0, metadata !17, null} -!25 = metadata !{i32 786689, metadata !17, metadata !"location", metadata !2, i32 16, 
metadata !26, i32 0, null} ; [ DW_TAG_arg_variable ] -!26 = metadata !{i32 786448, metadata !47, metadata !2, metadata !"SVal", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ] +!25 = metadata !{metadata !"0x101\00location\0016\000", metadata !17, metadata !2, metadata !26} ; [ DW_TAG_arg_variable ] +!26 = metadata !{metadata !"0x10\00SVal\000\0064\0064\000\000", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_reference_type ] !27 = metadata !{i32 17, i32 0, metadata !28, null} -!28 = metadata !{i32 786443, metadata !47, metadata !17, i32 16, i32 0, i32 2} ; [ DW_TAG_lexical_block ] +!28 = metadata !{metadata !"0xb\0016\000\002", metadata !47, metadata !17} ; [ DW_TAG_lexical_block ] !29 = metadata !{i32 18, i32 0, metadata !28, null} !30 = metadata !{i32 20, i32 0, metadata !28, null} -!31 = metadata !{i32 786689, metadata !16, metadata !"this", metadata !2, i32 11, metadata !32, i32 0, null} ; [ DW_TAG_arg_variable ] -!32 = metadata !{i32 786470, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !33} ; [ DW_TAG_const_type ] -!33 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_pointer_type ] +!31 = metadata !{metadata !"0x101\00this\0011\000", metadata !16, metadata !2, metadata !32} ; [ DW_TAG_arg_variable ] +!32 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !47, metadata !2, metadata !33} ; [ DW_TAG_const_type ] +!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ] !34 = metadata !{i32 11, i32 0, metadata !16, null} !35 = metadata !{i32 11, i32 0, metadata !36, null} -!36 = metadata !{i32 786443, metadata !47, metadata !37, i32 11, i32 0, i32 1} ; [ DW_TAG_lexical_block ] -!37 = metadata !{i32 786443, metadata !47, metadata !16, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ] -!38 = metadata !{i32 786688, metadata !39, 
metadata !"v", metadata !2, i32 24, metadata !1, i32 0, null} ; [ DW_TAG_auto_variable ] -!39 = metadata !{i32 786443, metadata !47, metadata !40, i32 23, i32 0, i32 4} ; [ DW_TAG_lexical_block ] -!40 = metadata !{i32 786443, metadata !47, metadata !20, i32 23, i32 0, i32 3} ; [ DW_TAG_lexical_block ] +!36 = metadata !{metadata !"0xb\0011\000\001", metadata !47, metadata !37} ; [ DW_TAG_lexical_block ] +!37 = metadata !{metadata !"0xb\0011\000\000", metadata !47, metadata !16} ; [ DW_TAG_lexical_block ] +!38 = metadata !{metadata !"0x100\00v\0024\000", metadata !39, metadata !2, metadata !1} ; [ DW_TAG_auto_variable ] +!39 = metadata !{metadata !"0xb\0023\000\004", metadata !47, metadata !40} ; [ DW_TAG_lexical_block ] +!40 = metadata !{metadata !"0xb\0023\000\003", metadata !47, metadata !20} ; [ DW_TAG_lexical_block ] !41 = metadata !{i32 24, i32 0, metadata !39, null} !42 = metadata !{i32 25, i32 0, metadata !39, null} !43 = metadata !{i32 26, i32 0, metadata !39, null} -!44 = metadata !{i32 786688, metadata !39, metadata !"k", metadata !2, i32 26, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ] +!44 = metadata !{metadata !"0x100\00k\0026\000", metadata !39, metadata !2, metadata !13} ; [ DW_TAG_auto_variable ] !45 = metadata !{i32 27, i32 0, metadata !39, null} !47 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"} !48 = metadata !{i32 0} -!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll index a65b632..f52e922 100644 --- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll +++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll @@ -15,21 +15,21 @@ entry: !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!17} -!0 = metadata !{i32 786478, metadata !14, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 53, metadata !3, i1 false, i1 true, i32 0, i32 0, null, 
i1 false, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] -!1 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ] -!2 = metadata !{i32 786449, metadata !15, i32 12, metadata !"clang version 2.9 (trunk 114084)", i1 false, metadata !"", i32 0, metadata !16, metadata !16, metadata !13, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0053\000\001\000\006\000\000\000", metadata !14, metadata !1, metadata !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] +!1 = metadata !{metadata !"0x29", metadata !14} ; [ DW_TAG_file_type ] +!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 114084)\000\00\000\00\000", metadata !15, metadata !16, metadata !16, metadata !13, null, null} ; [ DW_TAG_compile_unit ] +!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !14, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !4 = metadata !{metadata !5} -!5 = metadata !{i32 786468, metadata !14, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!6 = metadata !{i32 786478, metadata !15, metadata !7, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] -!7 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ] +!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !14, metadata !1} ; [ DW_TAG_base_type ] +!6 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", metadata !15, metadata !7, metadata !3, null, i32 ()* @bar, null, null, 
null} ; [ DW_TAG_subprogram ] +!7 = metadata !{metadata !"0x29", metadata !15} ; [ DW_TAG_file_type ] !8 = metadata !{i32 53, i32 13, metadata !9, null} -!9 = metadata !{i32 786443, metadata !14, metadata !0, i32 53, i32 11, i32 0} ; [ DW_TAG_lexical_block ] +!9 = metadata !{metadata !"0xb\0053\0011\000", metadata !14, metadata !0} ; [ DW_TAG_lexical_block ] !10 = metadata !{i32 4, i32 13, metadata !11, null} -!11 = metadata !{i32 786443, metadata !15, metadata !12, i32 4, i32 13, i32 2} ; [ DW_TAG_lexical_block ] -!12 = metadata !{i32 786443, metadata !15, metadata !6, i32 4, i32 11, i32 1} ; [ DW_TAG_lexical_block ] +!11 = metadata !{metadata !"0xb\004\0013\002", metadata !15, metadata !12} ; [ DW_TAG_lexical_block ] +!12 = metadata !{metadata !"0xb\004\0011\001", metadata !15, metadata !6} ; [ DW_TAG_lexical_block ] !13 = metadata !{metadata !0, metadata !6} !14 = metadata !{metadata !"", metadata !"/private/tmp"} !15 = metadata !{metadata !"bug.c", metadata !"/private/tmp"} !16 = metadata !{i32 0} -!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll index 21ac7c9..53fb0af 100644 --- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll +++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll @@ -9,32 +9,32 @@ target triple = "i386-apple-darwin11.0.0" define i32 @foo(%struct.bar* nocapture %i) nounwind readnone optsize noinline ssp { ; CHECK: TAG_formal_parameter entry: - tail call void @llvm.dbg.value(metadata !{%struct.bar* %i}, i64 0, metadata !6), !dbg !12 + tail call void @llvm.dbg.value(metadata !{%struct.bar* %i}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12 ret i32 1, !dbg !13 } -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!2} !llvm.module.flags = 
!{!19} -!0 = metadata !{i32 786478, metadata !17, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.bar*)* @foo, null, null, metadata !16, i32 3} ; [ DW_TAG_subprogram ] -!1 = metadata !{i32 786473, metadata !17} ; [ DW_TAG_file_type ] -!2 = metadata !{i32 786449, metadata !17, i32 12, metadata !"clang version 2.9 (trunk 117922)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, metadata !15, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!3 = metadata !{i32 786453, metadata !17, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!0 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\00256\001\003", metadata !17, metadata !1, metadata !3, null, i32 (%struct.bar*)* @foo, null, null, metadata !16} ; [ DW_TAG_subprogram ] +!1 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ] +!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 117922)\001\00\000\00\000", metadata !17, metadata !18, metadata !18, metadata !15, null, null} ; [ DW_TAG_compile_unit ] +!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !17, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !4 = metadata !{metadata !5} -!5 = metadata !{i32 786468, metadata !17, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!6 = metadata !{i32 786689, metadata !0, metadata !"i", metadata !1, i32 3, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ] -!7 = metadata !{i32 786447, metadata !17, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] -!8 = metadata !{i32 786451, metadata !17, metadata !1, metadata !"bar", i32 2, i64 64, i64 32, i64 0, i32 0, 
null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ] +!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !17, metadata !2} ; [ DW_TAG_base_type ] +!6 = metadata !{metadata !"0x101\00i\003\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ] +!7 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !17, metadata !1, metadata !8} ; [ DW_TAG_pointer_type ] +!8 = metadata !{metadata !"0x13\00bar\002\0064\0032\000\000\000", metadata !17, metadata !1, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ] !9 = metadata !{metadata !10, metadata !11} -!10 = metadata !{i32 786445, metadata !17, metadata !1, metadata !"x", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ] -!11 = metadata !{i32 786445, metadata !17, metadata !1, metadata !"y", i32 2, i64 32, i64 32, i64 32, i32 0, metadata !5} ; [ DW_TAG_member ] +!10 = metadata !{metadata !"0xd\00x\002\0032\0032\000\000", metadata !17, metadata !1, metadata !5} ; [ DW_TAG_member ] +!11 = metadata !{metadata !"0xd\00y\002\0032\0032\0032\000", metadata !17, metadata !1, metadata !5} ; [ DW_TAG_member ] !12 = metadata !{i32 3, i32 47, metadata !0, null} !13 = metadata !{i32 4, i32 2, metadata !14, null} -!14 = metadata !{i32 786443, metadata !17, metadata !0, i32 3, i32 50, i32 0} ; [ DW_TAG_lexical_block ] +!14 = metadata !{metadata !"0xb\003\0050\000", metadata !17, metadata !0} ; [ DW_TAG_lexical_block ] !15 = metadata !{metadata !0} !16 = metadata !{metadata !6} !17 = metadata !{metadata !"one.c", metadata !"/private/tmp"} !18 = metadata !{i32 0} -!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll index 625a351..ac7fbf2 
100644 --- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll +++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll @@ -22,8 +22,8 @@ target triple = "x86_64-apple-darwin10.0.0" define i64 @gcd(i64 %a, i64 %b) nounwind readnone optsize noinline ssp { entry: - tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !10), !dbg !18 - tail call void @llvm.dbg.value(metadata !{i64 %b}, i64 0, metadata !11), !dbg !19 + tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !18 + tail call void @llvm.dbg.value(metadata !{i64 %b}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !19 br label %while.body, !dbg !20 while.body: ; preds = %while.body, %entry @@ -34,14 +34,14 @@ while.body: ; preds = %while.body, %entry br i1 %cmp, label %if.then, label %while.body, !dbg !23 if.then: ; preds = %while.body - tail call void @llvm.dbg.value(metadata !{i64 %rem}, i64 0, metadata !12), !dbg !21 + tail call void @llvm.dbg.value(metadata !{i64 %rem}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !21 ret i64 %b.addr.0, !dbg !23 } define i32 @main() nounwind optsize ssp { entry: %call = tail call i32 @rand() nounwind optsize, !dbg !24 - tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !14), !dbg !24 + tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !24 %cmp = icmp ugt i32 %call, 21, !dbg !25 br i1 %cmp, label %cond.true, label %cond.end, !dbg !25 @@ -51,7 +51,7 @@ cond.true: ; preds = %entry cond.end: ; preds = %entry, %cond.true %cond = phi i32 [ %call1, %cond.true ], [ %call, %entry ], !dbg !25 - tail call void @llvm.dbg.value(metadata !{i32 %cond}, i64 0, metadata !17), !dbg !25 + tail call void @llvm.dbg.value(metadata !{i32 %cond}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !25 %conv = sext i32 %cond to i64, !dbg !26 %conv5 = zext i32 %call to i64, !dbg !26 %call6 = tail call i64 @gcd(i64 
%conv, i64 %conv5) optsize, !dbg !26 @@ -71,36 +71,36 @@ declare i32 @rand() optsize declare i32 @printf(i8* nocapture, ...) nounwind optsize -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone declare i32 @puts(i8* nocapture) nounwind !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!33} -!0 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"gcd", metadata !"gcd", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i64 (i64, i64)* @gcd, null, null, metadata !29, i32 0} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd] -!1 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ] -!2 = metadata !{i32 786449, metadata !31, i32 12, metadata !"clang version 2.9 (trunk 124117)", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !28, null, null, null} ; [ DW_TAG_compile_unit ] -!3 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!0 = metadata !{metadata !"0x2e\00gcd\00gcd\00\005\000\001\000\006\00256\001\000", metadata !31, metadata !1, metadata !3, null, i64 (i64, i64)* @gcd, null, null, metadata !29} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd] +!1 = metadata !{metadata !"0x29", metadata !31} ; [ DW_TAG_file_type ] +!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 124117)\001\00\000\00\001", metadata !31, metadata !32, metadata !32, metadata !28, null, null} ; [ DW_TAG_compile_unit ] +!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !4 = metadata !{metadata !5} -!5 = metadata !{i32 786468, null, metadata !2, metadata !"long int", i32 0, i64 64, i64 64, i64 
0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!6 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 25, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @main, null, null, metadata !30, i32 0} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main] -!7 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!5 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ] +!6 = metadata !{metadata !"0x2e\00main\00main\00\0025\000\001\000\006\000\001\000", metadata !31, metadata !1, metadata !7, null, i32 ()* @main, null, null, metadata !30} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main] +!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !8 = metadata !{metadata !9} -!9 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!10 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ] -!11 = metadata !{i32 786689, metadata !0, metadata !"b", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ] -!12 = metadata !{i32 786688, metadata !13, metadata !"c", metadata !1, i32 6, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ] -!13 = metadata !{i32 786443, metadata !31, metadata !0, i32 5, i32 52, i32 0} ; [ DW_TAG_lexical_block ] -!14 = metadata !{i32 786688, metadata !15, metadata !"m", metadata !1, i32 26, metadata !16, i32 0, null} ; [ DW_TAG_auto_variable ] -!15 = metadata !{i32 786443, metadata !31, metadata !6, i32 25, i32 12, i32 2} ; [ DW_TAG_lexical_block ] -!16 = 
metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] -!17 = metadata !{i32 786688, metadata !15, metadata !"z_s", metadata !1, i32 27, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ] +!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ] +!10 = metadata !{metadata !"0x101\00a\005\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ] +!11 = metadata !{metadata !"0x101\00b\005\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ] +!12 = metadata !{metadata !"0x100\00c\006\000", metadata !13, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ] +!13 = metadata !{metadata !"0xb\005\0052\000", metadata !31, metadata !0} ; [ DW_TAG_lexical_block ] +!14 = metadata !{metadata !"0x100\00m\0026\000", metadata !15, metadata !1, metadata !16} ; [ DW_TAG_auto_variable ] +!15 = metadata !{metadata !"0xb\0025\0012\002", metadata !31, metadata !6} ; [ DW_TAG_lexical_block ] +!16 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !2} ; [ DW_TAG_base_type ] +!17 = metadata !{metadata !"0x100\00z_s\0027\000", metadata !15, metadata !1, metadata !9} ; [ DW_TAG_auto_variable ] !18 = metadata !{i32 5, i32 41, metadata !0, null} !19 = metadata !{i32 5, i32 49, metadata !0, null} !20 = metadata !{i32 7, i32 5, metadata !13, null} !21 = metadata !{i32 8, i32 9, metadata !22, null} -!22 = metadata !{i32 786443, metadata !31, metadata !13, i32 7, i32 14, i32 1} ; [ DW_TAG_lexical_block ] +!22 = metadata !{metadata !"0xb\007\0014\001", metadata !31, metadata !13} ; [ DW_TAG_lexical_block ] !23 = metadata !{i32 9, i32 9, metadata !22, null} !24 = metadata !{i32 26, i32 38, metadata !15, null} !25 = metadata !{i32 27, i32 38, metadata !15, null} @@ -111,4 +111,4 @@ declare i32 @puts(i8* nocapture) nounwind !30 = metadata !{metadata !14, metadata !17} !31 = metadata !{metadata 
!"rem_small.c", metadata !"/private/tmp"} !32 = metadata !{i32 0} -!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2011-08-29-InitOrder.ll b/test/CodeGen/X86/2011-08-29-InitOrder.ll index a95dcb5..b278ad6 100644 --- a/test/CodeGen/X86/2011-08-29-InitOrder.ll +++ b/test/CodeGen/X86/2011-08-29-InitOrder.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s --check-prefix=CHECK-DEFAULT +; RUN: llc < %s -mtriple=i386-linux-gnu -use-ctors | FileCheck %s --check-prefix=CHECK-DEFAULT ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s --check-prefix=CHECK-DARWIN ; PR5329 diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll index 16706ae..6651af7 100644 --- a/test/CodeGen/X86/2012-04-26-sdglue.ll +++ b/test/CodeGen/X86/2012-04-26-sdglue.ll @@ -8,7 +8,7 @@ ;CHECK: vpxor ;CHECK: vinserti128 ;CHECK: vpshufd -;CHECK: vpshufd +;CHECK: vpbroadcastd ;CHECK: vmulps ;CHECK: vmulps ;CHECK: ret diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll index 1c39c74..519c7ca 100644 --- a/test/CodeGen/X86/2012-07-15-broadcastfold.ll +++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s declare x86_fastcallcc i64 @barrier() diff --git a/test/CodeGen/X86/2012-10-02-DAGCycle.ll b/test/CodeGen/X86/2012-10-02-DAGCycle.ll index 8d914db..403d21a 100644 --- a/test/CodeGen/X86/2012-10-02-DAGCycle.ll +++ b/test/CodeGen/X86/2012-10-02-DAGCycle.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s -; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s +; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s > /dev/null +; RUN: llc 
-mtriple=x86_64-apple-macosx -relocation-model=pic < %s > /dev/null ; rdar://12393897 diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll index 62ee1e1..1a5efda 100644 --- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll +++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll @@ -12,11 +12,11 @@ %struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] } %struct.bnode.1.28 = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode.1.28*, %struct.bnode.1.28* } -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp { entry: - call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4) + call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4, metadata !{metadata !"0x102"}) %type = getelementptr inbounds %struct.node.0.27* %p, i64 0, i32 0 %0 = load i16* %type, align 2 %cmp = icmp eq i16 %0, 1 @@ -33,16 +33,20 @@ return: ; preds = %for.cond.preheader, ret i16 %retval.0 } -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!12} -!0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99] +!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", metadata !11, metadata !2, metadata !2, metadata !13, 
metadata !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99] !2 = metadata !{} -!4 = metadata !{i32 786689, null, metadata !"hg", metadata !5, i32 67109589, metadata !6, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [hg] [line 725] -!5 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ] -!6 = metadata !{i32 786454, metadata !11, null, metadata !"hgstruct", i32 492, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ] -!7 = metadata !{i32 786451, metadata !11, null, metadata !"", i32 487, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ] +!4 = metadata !{metadata !"0x101\00hg\0067109589\000", null, metadata !5, metadata !6} ; [ DW_TAG_arg_variable ] [hg] [line 725] +!5 = metadata !{metadata !"0x29", metadata !11} ; [ DW_TAG_file_type ] +!6 = metadata !{metadata !"0x16\00hgstruct\00492\000\000\000\000", metadata !11, null, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ] +!7 = metadata !{metadata !"0x13\00\00487\00512\0064\000\000\000", metadata !11, null, null, null, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ] !11 = metadata !{metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh"} -!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} +!13 = metadata !{metadata !14} +!14 = metadata !{metadata !"0x2e\00subdivp\00subdivp\00\000\000\001\000\006\00256\001\001", metadata !11, metadata !5, metadata !15, null, i16 (%struct.node.0.27*, double, double, %struct.hgstruct.2.29* )* @subdivp, null, null, null} ; [ DW_TAG_subprogram ] [def] [subdivp] +!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, 
null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!16 = metadata !{null} diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll index 36667de..083aacd 100644 --- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll +++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll @@ -12,7 +12,7 @@ @.str15 = external hidden unnamed_addr constant [6 x i8], align 1 -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define i32 @AttachGalley(%union.rec** nocapture %suspend_pt) nounwind uwtable ssp { entry: @@ -43,7 +43,7 @@ if.then3344: br label %if.then4073 if.then4073: ; preds = %if.then3344 - call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4) + call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4, metadata !{metadata !"0x102"}) %arraydecay4078 = getelementptr inbounds [20 x i8]* %num14075, i64 0, i64 0 %0 = load i32* undef, align 4 %add4093 = add nsw i32 %0, 0 @@ -65,26 +65,31 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...) 
!llvm.dbg.cu = !{!0} !llvm.module.flags = !{!35} -!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99] +!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", metadata !19, metadata !2, metadata !2, metadata !20, metadata !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99] !1 = metadata !{metadata !2} !2 = metadata !{} -!4 = metadata !{i32 786688, metadata !5, metadata !"num1", metadata !14, i32 815, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [num1] [line 815] -!5 = metadata !{i32 786443, metadata !14, metadata !6, i32 815, i32 0, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!6 = metadata !{i32 786443, metadata !14, metadata !7, i32 812, i32 0, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!7 = metadata !{i32 786443, metadata !14, metadata !8, i32 807, i32 0, i32 175} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!8 = metadata !{i32 786443, metadata !14, metadata !9, i32 440, i32 0, i32 94} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!9 = metadata !{i32 786443, metadata !14, metadata !10, i32 435, i32 0, i32 91} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!10 = metadata !{i32 786443, metadata !14, metadata !11, i32 434, i32 0, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!11 = metadata !{i32 786443, metadata !14, metadata !12, i32 250, i32 
0, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!12 = metadata !{i32 786443, metadata !14, metadata !13, i32 249, i32 0, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!13 = metadata !{i32 786443, metadata !14, metadata !2, i32 221, i32 0, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!14 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ] -!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char] -!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] +!4 = metadata !{metadata !"0x100\00num1\00815\000", metadata !5, metadata !14, metadata !15} ; [ DW_TAG_auto_variable ] [num1] [line 815] +!5 = metadata !{metadata !"0xb\00815\000\00177", metadata !14, metadata !6} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!6 = metadata !{metadata !"0xb\00812\000\00176", metadata !14, metadata !7} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!7 = metadata !{metadata !"0xb\00807\000\00175", metadata !14, metadata !8} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!8 = metadata !{metadata !"0xb\00440\000\0094", metadata !14, metadata !9} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!9 = metadata !{metadata !"0xb\00435\000\0091", metadata !14, metadata !10} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!10 = metadata !{metadata !"0xb\00434\000\0090", metadata !14, metadata !11} ; [ DW_TAG_lexical_block ] 
[MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!11 = metadata !{metadata !"0xb\00250\000\0024", metadata !14, metadata !12} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!12 = metadata !{metadata !"0xb\00249\000\0023", metadata !14, metadata !13} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!13 = metadata !{metadata !"0xb\00221\000\0019", metadata !14, metadata !2} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!14 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ] +!15 = metadata !{metadata !"0x1\00\000\00160\008\000\000", null, null, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char] +!16 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] !17 = metadata !{metadata !18} -!18 = metadata !{i32 786465, i64 0, i64 20} ; [ DW_TAG_subrange_type ] [0, 19] +!18 = metadata !{metadata !"0x21\000\0020"} ; [ DW_TAG_subrange_type ] [0, 19] !19 = metadata !{metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset"} +!20 = metadata !{metadata !21} +!21 = metadata !{metadata !"0x2e\00AttachGalley\00AttachGalley\00\000\000\001\000\006\00256\001\001", metadata !19, metadata !14, metadata !22, null, i32 (%union.rec**)* @AttachGalley, null, null, null} ; [ DW_TAG_subprogram ] [def] [AttachGalley] +!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!23 = metadata !{null} + ; Test DebugValue uses visited by RegisterPressureTracker findUseBetween(). 
; ; CHECK: @main @@ -103,7 +108,7 @@ cond.true: ; preds = %entry unreachable cond.end: ; preds = %entry - call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !31) + call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !31, metadata !{metadata !"0x102"}) %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5 invoke void @_Znwm() to label %exit.i unwind label %lpad2.i.i.i.i @@ -129,9 +134,11 @@ declare void @_Znwm() !llvm.dbg.cu = !{!30} -!30 = metadata !{i32 786449, metadata !34, i32 4, metadata !"clang version 3.3 (trunk 169129) (llvm/trunk 169135)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus] -!31 = metadata !{i32 786688, null, metadata !"X", null, i32 29, metadata !32, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [X] [line 29] -!32 = metadata !{i32 786454, metadata !34, null, metadata !"HM", i32 28, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ] -!33 = metadata !{i32 786473, metadata !34} ; [ DW_TAG_file_type ] +!30 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 169129) (llvm/trunk 169135)\001\00\000\00\000", metadata !34, metadata !2, metadata !2, metadata !36, null, null} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus] +!31 = metadata !{metadata !"0x100\00X\0029\000", null, null, metadata !32} ; [ DW_TAG_auto_variable ] [X] [line 29] +!32 = metadata !{metadata !"0x16\00HM\0028\000\000\000\000", metadata !34, null, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ] +!33 = metadata !{metadata !"0x29", metadata !34} ; [ DW_TAG_file_type ] !34 = metadata !{metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++"} -!35 = metadata 
!{i32 1, metadata !"Debug Info Version", i32 1} +!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} +!36 = metadata !{metadata !37} +!37 = metadata !{metadata !"0x2e\00main\00main\00\000\000\001\000\006\00256\001\001", metadata !19, metadata !14, metadata !22, null, void ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [def] [main] diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll index 5aec3d9..458ce4f 100644 --- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll +++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll @@ -9,7 +9,7 @@ %struct.btCompoundLeafCallback = type { i32, i32 } -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define void @test() unnamed_addr uwtable ssp align 2 { entry: @@ -20,7 +20,7 @@ if.then: ; preds = %entry unreachable if.end: ; preds = %entry - call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3) + call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3, metadata !{metadata !"0x102"}) %m = getelementptr inbounds %struct.btCompoundLeafCallback* %callback, i64 0, i32 1 store i32 0, i32* undef, align 8 %cmp12447 = icmp sgt i32 undef, 0 @@ -36,11 +36,13 @@ invoke.cont44: ; preds = %if.end !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!8} -!0 = metadata !{i32 786449, metadata !6, i32 4, metadata !"clang version 3.3 (trunk 168984) (llvm/trunk 168983)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus] -!2 = metadata !{null} -!3 = metadata !{i32 786688, null, metadata !"callback", null, i32 214, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [callback] [line 214] -!4 = metadata !{i32 786451, metadata !6, null, metadata 
!"btCompoundLeafCallback", i32 90, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ] -!5 = metadata !{i32 786473, metadata !6} ; [ DW_TAG_file_type ] +!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 168984) (llvm/trunk 168983)\001\00\000\00\000", metadata !6, null, null, metadata !1, null, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus] +!1 = metadata !{metadata !2} +!2 = metadata !{metadata !"0x2e\00test\00test\00\000\000\001\000\006\00256\001\001", metadata !6, metadata !5, metadata !7, null, void ()* @test, null, null, null} ; [ DW_TAG_subprogram ] [def] [test] +!3 = metadata !{metadata !"0x100\00callback\00214\000", null, null, metadata !4} ; [ DW_TAG_auto_variable ] [callback] [line 214] +!4 = metadata !{metadata !"0x13\00btCompoundLeafCallback\0090\00512\0064\000\000\000", metadata !6, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ] +!5 = metadata !{metadata !"0x29", metadata !6} ; [ DW_TAG_file_type ] !6 = metadata !{metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet"} -!7 = metadata !{i32 0} -!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} +!9 = metadata !{null} diff --git a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll index bbba796..10dc927 100644 --- a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll +++ 
b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll @@ -6,7 +6,7 @@ ; we may reference variables that were not live across basic blocks ; resulting in undefined virtual registers. ; -; In this example, this is illustrated by a the spill/reload of the +; In this example, this is illustrated by a spill/reload of the ; LOADED_PTR_SLOT. ; ; Before this patch, the compiler was accessing two different spill diff --git a/test/CodeGen/X86/2014-08-29-CompactUnwind.ll b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll new file mode 100644 index 0000000..f65d7c9 --- /dev/null +++ b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 -filetype=obj -o - | llvm-objdump -d -unwind-info -s - | FileCheck %s +; Regression test for http://llvm.org/bugs/show_bug.cgi?id=20800. + +; ModuleID = 'asan_report.ii' +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +@.str = private unnamed_addr constant [3 x i8] c"=>\00", align 1 +@.str1 = private unnamed_addr constant [3 x i8] c" \00", align 1 +@.str2 = private unnamed_addr constant [6 x i8] c"%s%p:\00", align 1 + +; CHECK: ___asan_report_error: + +; subq instruction starts at 0x0a, so the second byte of the compact encoding +; (UNWIND_X86_64_FRAMELESS_STACK_SIZE in mach-o/compact_unwind_encoding.h) +; must be 0x0d. +; CHECK: {{a:.*subq.*%rsp}} + +; CHECK: Contents of __compact_unwind section +; CHECK: ___asan_report_error + +; Because of incorrect push instruction size in X86AsmBackend.cpp the stack +; size was also calculated incorrectly. 
+; CHECK-NOT: {{compact encoding:.*0x0309f800}} +; CHECK: {{compact encoding:.*0x030df800}} + +define void @__asan_report_error() #0 { + %str.i = alloca i64, align 8 + %stack = alloca [256 x i64], align 8 + br label %print_shadow_bytes.exit.i + +print_shadow_bytes.exit.i: ; preds = %print_shadow_bytes.exit.i, %0 + %iv.i = phi i64 [ -5, %0 ], [ %iv.next.i, %print_shadow_bytes.exit.i ] + %reg15 = icmp eq i64 %iv.i, 0 + %.str..str1.i = select i1 %reg15, [3 x i8]* @.str, [3 x i8]* @.str1 + %reg16 = getelementptr inbounds [3 x i8]* %.str..str1.i, i64 0, i64 0 + %reg17 = shl i64 %iv.i, 1 + %reg19 = inttoptr i64 %reg17 to i8* + call void (i64*, i8*, ...)* @append(i64* %str.i, i8* getelementptr inbounds ([6 x i8]* @.str2, i64 0, i64 0), i8* %reg16, i8* %reg19) + %iv.next.i = add nsw i64 %iv.i, 0 + br label %print_shadow_bytes.exit.i +} + +declare void @append(i64*, i8*, ...) + +attributes #0 = { "no-frame-pointer-elim"="false" } diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll index 4ce2fb3..54d8f65 100644 --- a/test/CodeGen/X86/MachineSink-DbgValue.ll +++ b/test/CodeGen/X86/MachineSink-DbgValue.ll @@ -4,10 +4,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-macosx10.7.0" define i32 @foo(i32 %i, i32* nocapture %c) nounwind uwtable readonly ssp { - tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6), !dbg !12 + tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12 %ab = load i32* %c, align 1, !dbg !14 - tail call void @llvm.dbg.value(metadata !{i32* %c}, i64 0, metadata !7), !dbg !13 - tail call void @llvm.dbg.value(metadata !{i32 %ab}, i64 0, metadata !10), !dbg !14 + tail call void @llvm.dbg.value(metadata !{i32* %c}, i64 0, metadata !7, metadata !{metadata !"0x102"}), !dbg !13 + tail call void @llvm.dbg.value(metadata !{i32 %ab}, i64 0, metadata !10, metadata !{metadata !"0x102"}), 
!dbg !14 %cd = icmp eq i32 %i, 42, !dbg !15 br i1 %cd, label %bb1, label %bb2, !dbg !15 @@ -23,23 +23,23 @@ bb2: ret i32 %.0, !dbg !17 } -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!22} -!0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, metadata !18, null, null, null} ; [ DW_TAG_compile_unit ] -!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i32*)* @foo, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo] -!2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ] -!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)\001\00\000\00\001", metadata !20, metadata !21, metadata !21, metadata !18, null, null} ; [ DW_TAG_compile_unit ] +!1 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\00256\001\000", metadata !20, metadata !2, metadata !3, null, i32 (i32, i32*)* @foo, null, null, metadata !19} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo] +!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ] +!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !4 = metadata !{metadata !5} -!5 = metadata !{i32 786468, null, metadata !0, metadata 
!"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!6 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777218, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ] -!7 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 33554434, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ] -!8 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] -!9 = metadata !{i32 786468, null, metadata !0, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] -!10 = metadata !{i32 786688, metadata !11, metadata !"a", metadata !2, i32 3, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ] -!11 = metadata !{i32 786443, metadata !20, metadata !1, i32 2, i32 25, i32 0} ; [ DW_TAG_lexical_block ] +!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ] +!6 = metadata !{metadata !"0x101\00i\0016777218\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ] +!7 = metadata !{metadata !"0x101\00c\0033554434\000", metadata !1, metadata !2, metadata !8} ; [ DW_TAG_arg_variable ] +!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !0, metadata !9} ; [ DW_TAG_pointer_type ] +!9 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !0} ; [ DW_TAG_base_type ] +!10 = metadata !{metadata !"0x100\00a\003\000", metadata !11, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ] +!11 = metadata !{metadata !"0xb\002\0025\000", metadata !20, metadata !1} ; [ DW_TAG_lexical_block ] !12 = metadata !{i32 2, i32 13, metadata !1, null} !13 = metadata !{i32 2, i32 22, metadata !1, null} !14 = metadata !{i32 3, i32 14, metadata !11, null} @@ -50,4 +50,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone !19 = metadata !{metadata !6, metadata !7, metadata !10} !20 = metadata !{metadata !"a.c", metadata 
!"/private/tmp"} !21 = metadata !{i32 0} -!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll index 51d0d17..6865873 100644 --- a/test/CodeGen/X86/StackColoring-dbg.ll +++ b/test/CodeGen/X86/StackColoring-dbg.ll @@ -5,7 +5,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define void @foo() nounwind uwtable ssp { entry: @@ -17,7 +17,7 @@ entry: for.body: call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind - call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22) nounwind + call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22, metadata !{metadata !"0x102"}) nounwind br label %for.body } @@ -27,9 +27,9 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!23} -!0 = metadata !{i32 524305, metadata !1, i32 1, metadata !"clang", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] +!0 = metadata !{metadata !"0x11\001\00clang\001\00\000\00\000", metadata !1, metadata !2, metadata !2, null, null, null} ; [ DW_TAG_compile_unit ] !1 = metadata !{metadata !"t.c", metadata !""} -!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} +!16 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] !2 = metadata !{i32 0} -!22 = metadata !{i32 786688, null, metadata !"x", metadata !2, i32 16, metadata !16, i32 0, i32 0} -!23 = 
metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!22 = metadata !{metadata !"0x100\00x\0016\000", null, metadata !2, metadata !16} ; [ DW_TAG_auto_variable ] +!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/SwizzleShuff.ll b/test/CodeGen/X86/SwizzleShuff.ll index 100817a..a435272 100644 --- a/test/CodeGen/X86/SwizzleShuff.ll +++ b/test/CodeGen/X86/SwizzleShuff.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s ; Check that we perform a scalar XOR on i32. diff --git a/test/CodeGen/X86/TruncAssertZext.ll b/test/CodeGen/X86/TruncAssertZext.ll new file mode 100644 index 0000000..8c66412 --- /dev/null +++ b/test/CodeGen/X86/TruncAssertZext.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -O2 -march=x86-64 | FileCheck %s +; Checks that a zeroing mov is inserted for the trunc/zext pair even when +; the source of the zext is an AssertSext node +; PR20494 + +define i64 @main(i64 %a) { +; CHECK-LABEL: main +; CHECK: movl %e{{..}}, %eax +; CHECK: ret + %or = or i64 %a, -2 + %trunc = trunc i64 %or to i32 + br label %l +l: + %ext = zext i32 %trunc to i64 + ret i64 %ext +} diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll index 1513fcb..9c24be4 100644 --- a/test/CodeGen/X86/add-of-carry.ll +++ b/test/CodeGen/X86/add-of-carry.ll @@ -4,7 +4,7 @@ define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp { entry: ; CHECK-LABEL: test1: -; CHECK: cmpl %ecx, %eax +; CHECK: cmpl %ecx, %eax ; CHECK-NOT: addl ; CHECK: adcl $0, %eax %add4 = add i32 %x, %sum diff --git a/test/CodeGen/X86/add_shl_constant.ll b/test/CodeGen/X86/add_shl_constant.ll new file mode 100644 index 0000000..33074e4 --- /dev/null +++ b/test/CodeGen/X86/add_shl_constant.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin < %s | 
FileCheck %s + +; CHECK-LABEL: add_shl_add_constant_1_i32 +; CHECK: leal 984(%rsi,%rdi,8), %eax +; CHECK-NEXT: retq +define i32 @add_shl_add_constant_1_i32(i32 %x, i32 %y) nounwind { + %add.0 = add i32 %x, 123 + %shl = shl i32 %add.0, 3 + %add.1 = add i32 %shl, %y + ret i32 %add.1 +} + +; CHECK-LABEL: add_shl_add_constant_2_i32 +; CHECK: leal 984(%rsi,%rdi,8), %eax +; CHECK-NEXT: retq +define i32 @add_shl_add_constant_2_i32(i32 %x, i32 %y) nounwind { + %add.0 = add i32 %x, 123 + %shl = shl i32 %add.0, 3 + %add.1 = add i32 %y, %shl + ret i32 %add.1 +} + +; CHECK: LCPI2_0: +; CHECK: .long 984 +; CHECK: _add_shl_add_constant_1_v4i32 +; CHECK: pslld $3, %[[REG:xmm[0-9]+]] +; CHECK: paddd %xmm1, %[[REG]] +; CHECK: paddd LCPI2_0(%rip), %[[REG:xmm[0-9]+]] +; CHECK: retq +define <4 x i32> @add_shl_add_constant_1_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { + %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123> + %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3> + %add.1 = add <4 x i32> %shl, %y + ret <4 x i32> %add.1 +} + +; CHECK: LCPI3_0: +; CHECK: .long 984 +; CHECK: _add_shl_add_constant_2_v4i32 +; CHECK: pslld $3, %[[REG:xmm[0-9]+]] +; CHECK: paddd %xmm1, %[[REG]] +; CHECK: paddd LCPI3_0(%rip), %[[REG:xmm[0-9]+]] +; CHECK: retq +define <4 x i32> @add_shl_add_constant_2_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { + %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123> + %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3> + %add.1 = add <4 x i32> %y, %shl + ret <4 x i32> %add.1 +} diff --git a/test/CodeGen/X86/addr-mode-matcher.ll b/test/CodeGen/X86/addr-mode-matcher.ll new file mode 100644 index 0000000..d592091 --- /dev/null +++ b/test/CodeGen/X86/addr-mode-matcher.ll @@ -0,0 +1,62 @@ +; RUN: llc < %s | FileCheck %s + +; This testcase used to hit an assert during ISel. For details, see the big +; comment inside the function. + +; CHECK-LABEL: foo: +; The AND should be turned into a subreg access. 
+; CHECK-NOT: and +; The shift (leal) should be folded into the scale of the address in the load. +; CHECK-NOT: leal +; CHECK: movl {{.*}},4), + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.6.0" + +define void @foo(i32 %a) { +bb: + br label %bb1692 + +bb1692: + %tmp1694 = phi i32 [ 0, %bb ], [ %tmp1745, %bb1692 ] + %xor = xor i32 0, %tmp1694 + +; %load1 = (load (and (shl %xor, 2), 1020)) + %tmp1701 = shl i32 %xor, 2 + %tmp1702 = and i32 %tmp1701, 1020 + %tmp1703 = getelementptr inbounds [1028 x i8]* null, i32 0, i32 %tmp1702 + %tmp1704 = bitcast i8* %tmp1703 to i32* + %load1 = load i32* %tmp1704, align 4 + +; %load2 = (load (shl (and %xor, 255), 2)) + %tmp1698 = and i32 %xor, 255 + %tmp1706 = shl i32 %tmp1698, 2 + %tmp1707 = getelementptr inbounds [1028 x i8]* null, i32 0, i32 %tmp1706 + %tmp1708 = bitcast i8* %tmp1707 to i32* + %load2 = load i32* %tmp1708, align 4 + + %tmp1710 = or i32 %load2, %a + +; While matching xor we address-match %load1. The and-of-shift reassocication +; in address matching transform this into into a shift-of-and and the resuting +; node becomes identical to %load2. CSE replaces %load1 which leaves its +; references in MatchScope and RecordedNodes stale. + %tmp1711 = xor i32 %load1, %tmp1710 + + %tmp1744 = getelementptr inbounds [256 x i32]* null, i32 0, i32 %tmp1711 + store i32 0, i32* %tmp1744, align 4 + %tmp1745 = add i32 %tmp1694, 1 + indirectbr i8* undef, [label %bb1756, label %bb1692] + +bb1756: + br label %bb2705 + +bb2705: + indirectbr i8* undef, [label %bb5721, label %bb5736] + +bb5721: + br label %bb2705 + +bb5736: + ret void +} diff --git a/test/CodeGen/X86/address-type-promotion-constantexpr.ll b/test/CodeGen/X86/address-type-promotion-constantexpr.ll new file mode 100644 index 0000000..32f29bd --- /dev/null +++ b/test/CodeGen/X86/address-type-promotion-constantexpr.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=x86_64-pc-linux + +; PR20314 is a crashing bug. 
This program does nothing with the load, so just check that the return is 0. + +@c = common global [2 x i32] zeroinitializer, align 4 +@a = common global i32 0, align 4 +@b = internal unnamed_addr constant [2 x i8] c"\01\00", align 1 + +; CHECK-LABEL: main +; CHECK: xor %eax, %eax +define i32 @main() { +entry: + %foo = load i8* getelementptr ([2 x i8]* @b, i64 0, i64 sext (i8 or (i8 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32]* @c, i64 0, i64 1), i32* @a) to i8), i8 1) to i64)), align 1 + ret i32 0 +} + diff --git a/test/CodeGen/X86/adx-intrinsics.ll b/test/CodeGen/X86/adx-intrinsics.ll new file mode 100644 index 0000000..0498177 --- /dev/null +++ b/test/CodeGen/X86/adx-intrinsics.ll @@ -0,0 +1,77 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 --show-mc-encoding| FileCheck %s --check-prefix=NOADX --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=broadwell --show-mc-encoding| FileCheck %s --check-prefix=ADX --check-prefix=CHECK + +declare i8 @llvm.x86.addcarryx.u32(i8, i32, i32, i8*) + +define i8 @test_addcarryx_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) { +; CHECK-LABEL: test_addcarryx_u32 +; CHECK: addb +; ADX: adcxl +; CHECK: setb +; CHECK: retq + %ret = tail call i8 @llvm.x86.addcarryx.u32(i8 %c, i32 %a, i32 %b, i8* %ptr) + ret i8 %ret; +} + +declare i8 @llvm.x86.addcarryx.u64(i8, i64, i64, i8*) + +define i8 @test_addcarryx_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) { +; CHECK-LABEL: test_addcarryx_u64 +; CHECK: addb +; ADX: adcxq +; CHECK: setb +; CHECK: retq + %ret = tail call i8 @llvm.x86.addcarryx.u64(i8 %c, i64 %a, i64 %b, i8* %ptr) + ret i8 %ret; +} + +declare i8 @llvm.x86.addcarry.u32(i8, i32, i32, i8*) + +define i8 @test_addcarry_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) { +; CHECK-LABEL: test_addcarry_u32 +; CHECK: addb +; ADX: adcxl +; NOADX: adcl +; CHECK: setb +; CHECK: retq + %ret = tail call i8 @llvm.x86.addcarry.u32(i8 %c, i32 %a, i32 %b, i8* %ptr) + ret i8 %ret; +} + +declare i8 @llvm.x86.addcarry.u64(i8, i64, 
i64, i8*) + +define i8 @test_addcarry_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) { +; CHECK-LABEL: test_addcarry_u64 +; CHECK: addb +; ADX: adcxq +; NOADX: adcq +; CHECK: setb +; CHECK: retq + %ret = tail call i8 @llvm.x86.addcarry.u64(i8 %c, i64 %a, i64 %b, i8* %ptr) + ret i8 %ret; +} + +declare i8 @llvm.x86.subborrow.u32(i8, i32, i32, i8*) + +define i8 @test_subborrow_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) { +; CHECK-LABEL: test_subborrow_u32 +; CHECK: addb +; CHECK: sbbl +; CHECK: setb +; CHECK: retq + %ret = tail call i8 @llvm.x86.subborrow.u32(i8 %c, i32 %a, i32 %b, i8* %ptr) + ret i8 %ret; +} + +declare i8 @llvm.x86.subborrow.u64(i8, i64, i64, i8*) + +define i8 @test_subborrow_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) { +; CHECK-LABEL: test_subborrow_u64 +; CHECK: addb +; CHECK: sbbq +; CHECK: setb +; CHECK: retq + %ret = tail call i8 @llvm.x86.subborrow.u64(i8 %c, i64 %a, i64 %b, i8* %ptr) + ret i8 %ret; +} + diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll index bf55644..82a8e48 100644 --- a/test/CodeGen/X86/aliases.ll +++ b/test/CodeGen/X86/aliases.ll @@ -30,12 +30,12 @@ define i32 @foo_f() { ret i32 0 } ; CHECK-DAG: .weak bar_f -@bar_f = alias weak %FunTy* @foo_f +@bar_f = weak alias %FunTy* @foo_f -@bar_l = alias linkonce_odr i32* @bar +@bar_l = linkonce_odr alias i32* @bar ; CHECK-DAG: .weak bar_l -@bar_i = alias internal i32* @bar +@bar_i = internal alias i32* @bar ; CHECK-DAG: .globl A @A = alias bitcast (i32* @bar to i64*) diff --git a/test/CodeGen/X86/aligned-variadic.ll b/test/CodeGen/X86/aligned-variadic.ll new file mode 100644 index 0000000..e2155fe --- /dev/null +++ b/test/CodeGen/X86/aligned-variadic.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=X32 + +%struct.Baz = type { [17 x i8] } +%struct.__va_list_tag = type { i32, i32, i8*, i8* } + +; Function Attrs: nounwind uwtable +define void @bar(%struct.Baz* byval 
nocapture readnone align 8 %x, ...) { +entry: + %va = alloca [1 x %struct.__va_list_tag], align 16 + %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag]* %va, i64 0, i64 0 + %arraydecay1 = bitcast [1 x %struct.__va_list_tag]* %va to i8* + call void @llvm.va_start(i8* %arraydecay1) + %overflow_arg_area_p = getelementptr inbounds [1 x %struct.__va_list_tag]* %va, i64 0, i64 0, i32 2 + %overflow_arg_area = load i8** %overflow_arg_area_p, align 8 + %overflow_arg_area.next = getelementptr i8* %overflow_arg_area, i64 24 + store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8 +; X32: leal 68(%esp), [[REG:%.*]] +; X32: movl [[REG]], 16(%esp) +; X64: leaq 232(%rsp), [[REG:%.*]] +; X64: movq [[REG]], 184(%rsp) +; X64: leaq 176(%rsp), %rdi + call void @qux(%struct.__va_list_tag* %arraydecay) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.va_start(i8*) + +declare void @qux(%struct.__va_list_tag*) diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll index 74b9470..9d8b6cf 100644 --- a/test/CodeGen/X86/alloca-align-rounding.ll +++ b/test/CodeGen/X86/alloca-align-rounding.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux -enable-misched=false | FileCheck %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnux32 -enable-misched=false | FileCheck %s -check-prefix=X32ABI declare void @bar(<2 x i64>* %n) @@ -6,15 +7,29 @@ define void @foo(i64 %h) { %p = alloca <2 x i64>, i64 %h call void @bar(<2 x i64>* %p) ret void -; CHECK: foo +; CHECK-LABEL: foo ; CHECK-NOT: andq $-32, %rax +; X32ABI-LABEL: foo +; X32ABI-NOT: andl $-32, %eax } define void @foo2(i64 %h) { %p = alloca <2 x i64>, i64 %h, align 32 call void @bar(<2 x i64>* %p) ret void -; CHECK: foo2 +; CHECK-LABEL: foo2 ; CHECK: andq $-32, %rsp ; CHECK: andq $-32, %rax +; X32ABI-LABEL: foo2 +; X32ABI: andl $-32, %esp +; X32ABI: andl $-32, %eax +} + +define void @foo3(i64 %h) { + %p = alloca <2 x i64>, i64 %h + 
ret void +; CHECK-LABEL: foo3 +; CHECK: movq %rbp, %rsp +; X32ABI-LABEL: foo3 +; X32ABI: movl %ebp, %esp } diff --git a/test/CodeGen/X86/asm-block-labels.ll b/test/CodeGen/X86/asm-block-labels.ll index 6dbfb16..9352438 100644 --- a/test/CodeGen/X86/asm-block-labels.ll +++ b/test/CodeGen/X86/asm-block-labels.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -std-compile-opts | llc -no-integrated-as +; RUN: opt < %s -O3 | llc -no-integrated-as ; ModuleID = 'block12.c' target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i686-apple-darwin8" diff --git a/test/CodeGen/X86/atomic-load-store-wide.ll b/test/CodeGen/X86/atomic-load-store-wide.ll index 7352d5a..ad1a5c6 100644 --- a/test/CodeGen/X86/atomic-load-store-wide.ll +++ b/test/CodeGen/X86/atomic-load-store-wide.ll @@ -4,16 +4,18 @@ ; FIXME: The generated code can be substantially improved. define void @test1(i64* %ptr, i64 %val1) { -; CHECK: test1 -; CHECK: cmpxchg8b +; CHECK-LABEL: test1 +; CHECK: lock +; CHECK-NEXT: cmpxchg8b ; CHECK-NEXT: jne store atomic i64 %val1, i64* %ptr seq_cst, align 8 ret void } define i64 @test2(i64* %ptr) { -; CHECK: test2 -; CHECK: cmpxchg8b +; CHECK-LABEL: test2 +; CHECK: lock +; CHECK-NEXT: cmpxchg8b %val = load atomic i64* %ptr seq_cst, align 8 ret i64 %val } diff --git a/test/CodeGen/X86/atomic-ops-ancient-64.ll b/test/CodeGen/X86/atomic-ops-ancient-64.ll index 18749b9..508d83b 100644 --- a/test/CodeGen/X86/atomic-ops-ancient-64.ll +++ b/test/CodeGen/X86/atomic-ops-ancient-64.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s +; XFAIL: * define i64 @test_add(i64* %addr, i64 %inc) { ; CHECK-LABEL: test_add: diff --git a/test/CodeGen/X86/atomic_add.ll b/test/CodeGen/X86/atomic_add.ll index bdd25e6..f60212d 100644 --- a/test/CodeGen/X86/atomic_add.ll +++ b/test/CodeGen/X86/atomic_add.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s 
+; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC ; rdar://7103704 @@ -14,6 +15,8 @@ define void @inc4(i64* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: inc4: ; CHECK: incq +; SLOW_INC-LABEL: inc4: +; SLOW_INC-NOT: incq %0 = atomicrmw add i64* %p, i64 1 monotonic ret void } @@ -39,6 +42,8 @@ define void @inc3(i8* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: inc3: ; CHECK: incb +; SLOW_INC-LABEL: inc3: +; SLOW_INC-NOT: incb %0 = atomicrmw add i8* %p, i8 1 monotonic ret void } @@ -64,6 +69,8 @@ define void @inc2(i16* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: inc2: ; CHECK: incw +; SLOW_INC-LABEL: inc2: +; SLOW_INC-NOT: incw %0 = atomicrmw add i16* %p, i16 1 monotonic ret void } @@ -89,6 +96,8 @@ define void @inc1(i32* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: inc1: ; CHECK: incl +; SLOW_INC-LABEL: inc1: +; SLOW_INC-NOT: incl %0 = atomicrmw add i32* %p, i32 1 monotonic ret void } @@ -113,6 +122,8 @@ define void @dec4(i64* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: dec4: ; CHECK: decq +; SLOW_INC-LABEL: dec4: +; SLOW_INC-NOT: decq %0 = atomicrmw sub i64* %p, i64 1 monotonic ret void } @@ -138,6 +149,8 @@ define void @dec3(i8* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: dec3: ; CHECK: decb +; SLOW_INC-LABEL: dec3: +; SLOW_INC-NOT: decb %0 = atomicrmw sub i8* %p, i8 1 monotonic ret void } @@ -163,6 +176,8 @@ define void @dec2(i16* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: dec2: ; CHECK: decw +; SLOW_INC-LABEL: dec2: +; SLOW_INC-NOT: decw %0 = atomicrmw sub i16* %p, i16 1 monotonic ret void } @@ -189,6 +204,8 @@ define void @dec1(i32* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: dec1: ; CHECK: decl +; SLOW_INC-LABEL: dec1: +; SLOW_INC-NOT: decl %0 = atomicrmw sub i32* %p, i32 1 monotonic ret void } diff --git a/test/CodeGen/X86/atomic_idempotent.ll b/test/CodeGen/X86/atomic_idempotent.ll new file mode 100644 index 0000000..1afc535 --- /dev/null +++ 
b/test/CodeGen/X86/atomic_idempotent.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64 +; RUN: llc < %s -march=x86 -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32 + +; On x86, an atomic rmw operation that does not modify the value in memory +; (such as atomic add 0) can be replaced by an mfence followed by a mov. +; This is explained (with the motivation for such an optimization) in +; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf + +define i8 @add8(i8* %p) { +; CHECK-LABEL: add8 +; CHECK: mfence +; CHECK: movb + %1 = atomicrmw add i8* %p, i8 0 monotonic + ret i8 %1 +} + +define i16 @or16(i16* %p) { +; CHECK-LABEL: or16 +; CHECK: mfence +; CHECK: movw + %1 = atomicrmw or i16* %p, i16 0 acquire + ret i16 %1 +} + +define i32 @xor32(i32* %p) { +; CHECK-LABEL: xor32 +; CHECK: mfence +; CHECK: movl + %1 = atomicrmw xor i32* %p, i32 0 release + ret i32 %1 +} + +define i64 @sub64(i64* %p) { +; CHECK-LABEL: sub64 +; X64: mfence +; X64: movq +; X32-NOT: mfence + %1 = atomicrmw sub i64* %p, i64 0 seq_cst + ret i64 %1 +} + +define i128 @or128(i128* %p) { +; CHECK-LABEL: or128 +; CHECK-NOT: mfence + %1 = atomicrmw or i128* %p, i128 0 monotonic + ret i128 %1 +} + +; For 'and', the idempotent value is (-1) +define i32 @and32 (i32* %p) { +; CHECK-LABEL: and32 +; CHECK: mfence +; CHECK: movl + %1 = atomicrmw and i32* %p, i32 -1 acq_rel + ret i32 %1 +} diff --git a/test/CodeGen/X86/atomic_mi.ll b/test/CodeGen/X86/atomic_mi.ll new file mode 100644 index 0000000..19e019e --- /dev/null +++ b/test/CodeGen/X86/atomic_mi.ll @@ -0,0 +1,525 @@ +; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix X64 +; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s --check-prefix X32 +; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC + +; This file checks that atomic 
(non-seq_cst) stores of immediate values are +; done in one mov instruction and not 2. More precisely, it makes sure that the +; immediate is not first copied uselessly into a register. + +; Similarily, it checks that a binary operation of an immediate with an atomic +; variable that is stored back in that variable is done as a single instruction. +; For example: x.store(42 + x.load(memory_order_acquire), memory_order_release) +; should be just an add instruction, instead of loading x into a register, doing +; an add and storing the result back. +; The binary operations supported are currently add, and, or, xor. +; sub is not supported because they are translated by an addition of the +; negated immediate. +; Finally, we also check the same kind of pattern for inc/dec + +; seq_cst stores are left as (lock) xchgl, but we try to check every other +; attribute at least once. + +; Please note that these operations do not require the lock prefix: only +; sequentially consistent stores require this kind of protection on X86. +; And even for seq_cst operations, llvm uses the xchg instruction which has +; an implicit lock prefix, so making it explicit is not required. + +define void @store_atomic_imm_8(i8* %p) { +; X64-LABEL: store_atomic_imm_8 +; X64: movb +; X64-NOT: movb +; X32-LABEL: store_atomic_imm_8 +; X32: movb +; X32-NOT: movb + store atomic i8 42, i8* %p release, align 1 + ret void +} + +define void @store_atomic_imm_16(i16* %p) { +; X64-LABEL: store_atomic_imm_16 +; X64: movw +; X64-NOT: movw +; X32-LABEL: store_atomic_imm_16 +; X32: movw +; X32-NOT: movw + store atomic i16 42, i16* %p monotonic, align 2 + ret void +} + +define void @store_atomic_imm_32(i32* %p) { +; X64-LABEL: store_atomic_imm_32 +; X64: movl +; X64-NOT: movl +; On 32 bits, there is an extra movl for each of those functions +; (probably for alignment reasons). 
+; X32-LABEL: store_atomic_imm_32 +; X32: movl 4(%esp), %eax +; X32: movl +; X32-NOT: movl + store atomic i32 42, i32* %p release, align 4 + ret void +} + +define void @store_atomic_imm_64(i64* %p) { +; X64-LABEL: store_atomic_imm_64 +; X64: movq +; X64-NOT: movq +; These are implemented with a CAS loop on 32 bit architectures, and thus +; cannot be optimized in the same way as the others. +; X32-LABEL: store_atomic_imm_64 +; X32: cmpxchg8b + store atomic i64 42, i64* %p release, align 8 + ret void +} + +; If an immediate is too big to fit in 32 bits, it cannot be store in one mov, +; even on X64, one must use movabsq that can only target a register. +define void @store_atomic_imm_64_big(i64* %p) { +; X64-LABEL: store_atomic_imm_64_big +; X64: movabsq +; X64: movq + store atomic i64 100000000000, i64* %p monotonic, align 8 + ret void +} + +; It would be incorrect to replace a lock xchgl by a movl +define void @store_atomic_imm_32_seq_cst(i32* %p) { +; X64-LABEL: store_atomic_imm_32_seq_cst +; X64: xchgl +; X32-LABEL: store_atomic_imm_32_seq_cst +; X32: xchgl + store atomic i32 42, i32* %p seq_cst, align 4 + ret void +} + +; ----- ADD ----- + +define void @add_8(i8* %p) { +; X64-LABEL: add_8 +; X64-NOT: lock +; X64: addb +; X64-NOT: movb +; X32-LABEL: add_8 +; X32-NOT: lock +; X32: addb +; X32-NOT: movb + %1 = load atomic i8* %p seq_cst, align 1 + %2 = add i8 %1, 2 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @add_16(i16* %p) { +; Currently the transformation is not done on 16 bit accesses, as the backend +; treat 16 bit arithmetic as expensive on X86/X86_64. 
+; X64-LABEL: add_16 +; X64-NOT: addw +; X32-LABEL: add_16 +; X32-NOT: addw + %1 = load atomic i16* %p acquire, align 2 + %2 = add i16 %1, 2 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @add_32(i32* %p) { +; X64-LABEL: add_32 +; X64-NOT: lock +; X64: addl +; X64-NOT: movl +; X32-LABEL: add_32 +; X32-NOT: lock +; X32: addl +; X32-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = add i32 %1, 2 + store atomic i32 %2, i32* %p monotonic, align 4 + ret void +} + +define void @add_64(i64* %p) { +; X64-LABEL: add_64 +; X64-NOT: lock +; X64: addq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'addq'. +; X32-LABEL: add_64 + %1 = load atomic i64* %p acquire, align 8 + %2 = add i64 %1, 2 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @add_32_seq_cst(i32* %p) { +; X64-LABEL: add_32_seq_cst +; X64: xchgl +; X32-LABEL: add_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = add i32 %1, 2 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + +; ----- AND ----- + +define void @and_8(i8* %p) { +; X64-LABEL: and_8 +; X64-NOT: lock +; X64: andb +; X64-NOT: movb +; X32-LABEL: and_8 +; X32-NOT: lock +; X32: andb +; X32-NOT: movb + %1 = load atomic i8* %p monotonic, align 1 + %2 = and i8 %1, 2 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @and_16(i16* %p) { +; Currently the transformation is not done on 16 bit accesses, as the backend +; treat 16 bit arithmetic as expensive on X86/X86_64. 
+; X64-LABEL: and_16 +; X64-NOT: andw +; X32-LABEL: and_16 +; X32-NOT: andw + %1 = load atomic i16* %p acquire, align 2 + %2 = and i16 %1, 2 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @and_32(i32* %p) { +; X64-LABEL: and_32 +; X64-NOT: lock +; X64: andl +; X64-NOT: movl +; X32-LABEL: and_32 +; X32-NOT: lock +; X32: andl +; X32-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = and i32 %1, 2 + store atomic i32 %2, i32* %p release, align 4 + ret void +} + +define void @and_64(i64* %p) { +; X64-LABEL: and_64 +; X64-NOT: lock +; X64: andq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'andq'. +; X32-LABEL: and_64 + %1 = load atomic i64* %p acquire, align 8 + %2 = and i64 %1, 2 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @and_32_seq_cst(i32* %p) { +; X64-LABEL: and_32_seq_cst +; X64: xchgl +; X32-LABEL: and_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = and i32 %1, 2 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + +; ----- OR ----- + +define void @or_8(i8* %p) { +; X64-LABEL: or_8 +; X64-NOT: lock +; X64: orb +; X64-NOT: movb +; X32-LABEL: or_8 +; X32-NOT: lock +; X32: orb +; X32-NOT: movb + %1 = load atomic i8* %p acquire, align 1 + %2 = or i8 %1, 2 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @or_16(i16* %p) { +; X64-LABEL: or_16 +; X64-NOT: orw +; X32-LABEL: or_16 +; X32-NOT: orw + %1 = load atomic i16* %p acquire, align 2 + %2 = or i16 %1, 2 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @or_32(i32* %p) { +; X64-LABEL: or_32 +; X64-NOT: lock +; X64: orl +; X64-NOT: movl +; X32-LABEL: or_32 +; X32-NOT: lock +; X32: orl +; X32-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = or i32 %1, 2 + store atomic i32 %2, i32* %p release, align 4 + ret void +} + +define void @or_64(i64* %p) { +; X64-LABEL: or_64 +; X64-NOT: lock +; X64: orq +; X64-NOT: movq +; We do not check 
X86-32 as it cannot do 'orq'. +; X32-LABEL: or_64 + %1 = load atomic i64* %p acquire, align 8 + %2 = or i64 %1, 2 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @or_32_seq_cst(i32* %p) { +; X64-LABEL: or_32_seq_cst +; X64: xchgl +; X32-LABEL: or_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = or i32 %1, 2 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + +; ----- XOR ----- + +define void @xor_8(i8* %p) { +; X64-LABEL: xor_8 +; X64-NOT: lock +; X64: xorb +; X64-NOT: movb +; X32-LABEL: xor_8 +; X32-NOT: lock +; X32: xorb +; X32-NOT: movb + %1 = load atomic i8* %p acquire, align 1 + %2 = xor i8 %1, 2 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @xor_16(i16* %p) { +; X64-LABEL: xor_16 +; X64-NOT: xorw +; X32-LABEL: xor_16 +; X32-NOT: xorw + %1 = load atomic i16* %p acquire, align 2 + %2 = xor i16 %1, 2 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @xor_32(i32* %p) { +; X64-LABEL: xor_32 +; X64-NOT: lock +; X64: xorl +; X64-NOT: movl +; X32-LABEL: xor_32 +; X32-NOT: lock +; X32: xorl +; X32-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = xor i32 %1, 2 + store atomic i32 %2, i32* %p release, align 4 + ret void +} + +define void @xor_64(i64* %p) { +; X64-LABEL: xor_64 +; X64-NOT: lock +; X64: xorq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'xorq'. 
+; X32-LABEL: xor_64 + %1 = load atomic i64* %p acquire, align 8 + %2 = xor i64 %1, 2 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @xor_32_seq_cst(i32* %p) { +; X64-LABEL: xor_32_seq_cst +; X64: xchgl +; X32-LABEL: xor_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = xor i32 %1, 2 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + +; ----- INC ----- + +define void @inc_8(i8* %p) { +; X64-LABEL: inc_8 +; X64-NOT: lock +; X64: incb +; X64-NOT: movb +; X32-LABEL: inc_8 +; X32-NOT: lock +; X32: incb +; X32-NOT: movb +; SLOW_INC-LABEL: inc_8 +; SLOW_INC-NOT: incb +; SLOW_INC-NOT: movb + %1 = load atomic i8* %p seq_cst, align 1 + %2 = add i8 %1, 1 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @inc_16(i16* %p) { +; Currently the transformation is not done on 16 bit accesses, as the backend +; treat 16 bit arithmetic as expensive on X86/X86_64. +; X64-LABEL: inc_16 +; X64-NOT: incw +; X32-LABEL: inc_16 +; X32-NOT: incw +; SLOW_INC-LABEL: inc_16 +; SLOW_INC-NOT: incw + %1 = load atomic i16* %p acquire, align 2 + %2 = add i16 %1, 1 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @inc_32(i32* %p) { +; X64-LABEL: inc_32 +; X64-NOT: lock +; X64: incl +; X64-NOT: movl +; X32-LABEL: inc_32 +; X32-NOT: lock +; X32: incl +; X32-NOT: movl +; SLOW_INC-LABEL: inc_32 +; SLOW_INC-NOT: incl +; SLOW_INC-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = add i32 %1, 1 + store atomic i32 %2, i32* %p monotonic, align 4 + ret void +} + +define void @inc_64(i64* %p) { +; X64-LABEL: inc_64 +; X64-NOT: lock +; X64: incq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'incq'. 
+; X32-LABEL: inc_64 +; SLOW_INC-LABEL: inc_64 +; SLOW_INC-NOT: incq +; SLOW_INC-NOT: movq + %1 = load atomic i64* %p acquire, align 8 + %2 = add i64 %1, 1 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @inc_32_seq_cst(i32* %p) { +; X64-LABEL: inc_32_seq_cst +; X64: xchgl +; X32-LABEL: inc_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = add i32 %1, 1 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + +; ----- DEC ----- + +define void @dec_8(i8* %p) { +; X64-LABEL: dec_8 +; X64-NOT: lock +; X64: decb +; X64-NOT: movb +; X32-LABEL: dec_8 +; X32-NOT: lock +; X32: decb +; X32-NOT: movb +; SLOW_INC-LABEL: dec_8 +; SLOW_INC-NOT: decb +; SLOW_INC-NOT: movb + %1 = load atomic i8* %p seq_cst, align 1 + %2 = sub i8 %1, 1 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @dec_16(i16* %p) { +; Currently the transformation is not done on 16 bit accesses, as the backend +; treat 16 bit arithmetic as expensive on X86/X86_64. +; X64-LABEL: dec_16 +; X64-NOT: decw +; X32-LABEL: dec_16 +; X32-NOT: decw +; SLOW_INC-LABEL: dec_16 +; SLOW_INC-NOT: decw + %1 = load atomic i16* %p acquire, align 2 + %2 = sub i16 %1, 1 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @dec_32(i32* %p) { +; X64-LABEL: dec_32 +; X64-NOT: lock +; X64: decl +; X64-NOT: movl +; X32-LABEL: dec_32 +; X32-NOT: lock +; X32: decl +; X32-NOT: movl +; SLOW_INC-LABEL: dec_32 +; SLOW_INC-NOT: decl +; SLOW_INC-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = sub i32 %1, 1 + store atomic i32 %2, i32* %p monotonic, align 4 + ret void +} + +define void @dec_64(i64* %p) { +; X64-LABEL: dec_64 +; X64-NOT: lock +; X64: decq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'decq'. 
+; X32-LABEL: dec_64 +; SLOW_INC-LABEL: dec_64 +; SLOW_INC-NOT: decq +; SLOW_INC-NOT: movq + %1 = load atomic i64* %p acquire, align 8 + %2 = sub i64 %1, 1 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @dec_32_seq_cst(i32* %p) { +; X64-LABEL: dec_32_seq_cst +; X64: xchgl +; X32-LABEL: dec_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = sub i32 %1, 1 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll index 1fd9085..02ea173 100644 --- a/test/CodeGen/X86/avx-basic.ll +++ b/test/CodeGen/X86/avx-basic.ll @@ -51,46 +51,6 @@ entry: ret <4 x i64> %shuffle } -;;; -;;; Check that some 256-bit vectors are xformed into 128 ops -; CHECK: _A -; CHECK: vshufpd $1 -; CHECK-NEXT: vextractf128 $1 -; CHECK-NEXT: vshufpd $1 -; CHECK-NEXT: vinsertf128 $1 -define <4 x i64> @A(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6> - ret <4 x i64> %shuffle -} - -; CHECK: _B -; CHECK: vshufpd $1, %ymm -define <4 x i64> @B(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 undef, i32 undef, i32 6> - ret <4 x i64> %shuffle -} - -; CHECK: movlhps -; CHECK-NEXT: vextractf128 $1 -; CHECK-NEXT: movlhps -; CHECK-NEXT: vinsertf128 $1 -define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 undef, i32 0, i32 undef, i32 6> - ret <4 x i64> %shuffle -} - -; CHECK: vpshufd $-96 -; CHECK: vpshufd $-6 -; CHECK: vinsertf128 $1 -define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 10, i32 10, i32 11, i32 11> - ret <8 x i32> %shuffle 
-} - ;;; Don't crash on movd ; CHECK: _VMOVZQI2PQI ; CHECK: vmovd (% diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll deleted file mode 100644 index d2a22d7..0000000 --- a/test/CodeGen/X86/avx-blend.ll +++ /dev/null @@ -1,202 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; AVX128 tests: - -;CHECK-LABEL: vsel_float: -; select mask is <i1 true, i1 false, i1 true, i1 false>. -; Big endian representation is 0101 = 5. -; '1' means takes the first argument, '0' means takes the second argument. -; This is the opposite of the intel syntax, thus we expect -; the inverted mask: 1010 = 10. -; According to the ABI: -; v1 is in xmm0 => first argument is xmm0. -; v2 is in xmm1 => second argument is xmm1. -; result is in xmm0 => destination argument. -;CHECK: vblendps $10, %xmm1, %xmm0, %xmm0 -;CHECK: ret -define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2 - ret <4 x float> %vsel -} - - -;CHECK-LABEL: vsel_i32: -;CHECK: vblendps $10, %xmm1, %xmm0, %xmm0 -;CHECK: ret -define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2 - ret <4 x i32> %vsel -} - - -;CHECK-LABEL: vsel_double: -;CHECK: vmovsd -;CHECK: ret -define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) { - %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2 - ret <2 x double> %vsel -} - - -;CHECK-LABEL: vsel_i64: -;CHECK: vmovsd -;CHECK: ret -define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) { - %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2 - ret <2 x i64> %vsel -} - - -;CHECK-LABEL: vsel_i8: -;CHECK: vpblendvb -;CHECK: ret -define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { - %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, 
i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2 - ret <16 x i8> %vsel -} - - -; AVX256 tests: - - -;CHECK-LABEL: vsel_float8: -;CHECK-NOT: vinsertf128 -; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false> -; which translates into the boolean mask (big endian representation): -; 00010001 = 17. -; '1' means takes the first argument, '0' means takes the second argument. -; This is the opposite of the intel syntax, thus we expect -; the inverted mask: 11101110 = 238. -;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0 -;CHECK: ret -define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) { - %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2 - ret <8 x float> %vsel -} - -;CHECK-LABEL: vsel_i328: -;CHECK-NOT: vinsertf128 -;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0 -;CHECK-NEXT: ret -define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) { - %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2 - ret <8 x i32> %vsel -} - -;CHECK-LABEL: vsel_double8: -; select mask is 2x: 0001 => intel mask: ~0001 = 14 -; ABI: -; v1 is in ymm0 and ymm1. -; v2 is in ymm2 and ymm3. -; result is in ymm0 and ymm1. -; Compute the low part: res.low = blend v1.low, v2.low, blendmask -;CHECK: vblendpd $14, %ymm2, %ymm0, %ymm0 -; Compute the high part. 
-;CHECK: vblendpd $14, %ymm3, %ymm1, %ymm1 -;CHECK: ret -define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) { - %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2 - ret <8 x double> %vsel -} - -;CHECK-LABEL: vsel_i648: -;CHECK: vblendpd $14, %ymm2, %ymm0, %ymm0 -;CHECK: vblendpd $14, %ymm3, %ymm1, %ymm1 -;CHECK: ret -define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) { - %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2 - ret <8 x i64> %vsel -} - -;CHECK-LABEL: vsel_double4: -;CHECK-NOT: vinsertf128 -;CHECK: vblendpd $10 -;CHECK-NEXT: ret -define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2 - ret <4 x double> %vsel -} - -;; TEST blend + compares -; CHECK: testa -define <2 x double> @testa(<2 x double> %x, <2 x double> %y) { - ; CHECK: vcmplepd - ; CHECK: vblendvpd - %max_is_x = fcmp oge <2 x double> %x, %y - %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y - ret <2 x double> %max -} - -; CHECK: testb -define <2 x double> @testb(<2 x double> %x, <2 x double> %y) { - ; CHECK: vcmpnlepd - ; CHECK: vblendvpd - %min_is_x = fcmp ult <2 x double> %x, %y - %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y - ret <2 x double> %min -} - -; If we can figure out a blend has a constant mask, we should emit the -; blend instruction with an immediate mask -define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) { -; CHECK-LABEL: constant_blendvpd_avx: -; CHECK-NOT: mov -; CHECK: vblendpd -; CHECK: ret - %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab - ret <4 x double> %1 -} - -define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) { -; 
CHECK-LABEL: constant_blendvps_avx: -; CHECK-NOT: mov -; CHECK: vblendps -; CHECK: ret - %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd - ret <8 x float> %1 -} - -declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) -declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) - -;; 4 tests for shufflevectors that optimize to blend + immediate -; CHECK-LABEL: @blend_shufflevector_4xfloat -define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) { -; Equivalent select mask is <i1 true, i1 false, i1 true, i1 false>. -; Big endian representation is 0101 = 5. -; '1' means takes the first argument, '0' means takes the second argument. -; This is the opposite of the intel syntax, thus we expect -; Inverted mask: 1010 = 10. -; According to the ABI: -; a is in xmm0 => first argument is xmm0. -; b is in xmm1 => second argument is xmm1. -; Result is in xmm0 => destination argument. 
-; CHECK: vblendps $10, %xmm1, %xmm0, %xmm0 -; CHECK: ret - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x float> %1 -} - -; CHECK-LABEL: @blend_shufflevector_8xfloat -define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) { -; CHECK: vblendps $190, %ymm1, %ymm0, %ymm0 -; CHECK: ret - %1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15> - ret <8 x float> %1 -} - -; CHECK-LABEL: @blend_shufflevector_4xdouble -define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) { -; CHECK: vblendpd $2, %ymm1, %ymm0, %ymm0 -; CHECK: ret - %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> - ret <4 x double> %1 -} - -; CHECK-LABEL: @blend_shufflevector_4xi64 -define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) { -; CHECK: vblendpd $13, %ymm1, %ymm0, %ymm0 -; CHECK: ret - %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> - ret <4 x i64> %1 -} diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll index 3e051bf..70ec124 100644 --- a/test/CodeGen/X86/avx-intel-ocl.ll +++ b/test/CodeGen/X86/avx-intel-ocl.ll @@ -89,23 +89,23 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload ; X64-LABEL: test_prolog_epilog -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups 
{{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill ; X64: call -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind 
{ %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b) ret <16 x float> %c diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll new file mode 100644 index 0000000..d2b44cd --- /dev/null +++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s + +define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) { + ; CHECK: vblendpd + %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1] + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone + + +define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) { + ; CHECK: vblendps + %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1] + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone + + +define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) { + ; CHECK: vdpps + %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1] + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone + + diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index ce31161..ef3e83f 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -455,21 +455,21 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) { ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { - ; CHECK: vpslldq - %res = call <2 x i64> 
@llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { - ; CHECK: vpslldq - %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} +
+
+; Calls llvm.x86.sse2.psll.dq on %a0 with the constant 7 and returns the result.
+; The FileCheck directive in the body expects a vpslldq line whose text after
+; the '#' marker lists all 16 bytes of xmm0 in their original order (0..15),
+; i.e. no byte is moved or replaced by zero. The sibling test
+; test_x86_sse2_psll_dq_bs passes the same constant but expects zeroed bytes.
+; NOTE(review): presumably this variant takes its count in bits, so 7 amounts
+; to zero whole bytes -- confirm against the intrinsic's definition.
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+ ; The {{.*#+}} part is a FileCheck regex: any text, then one or more '#'.
+ ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+; Declaration of the intrinsic called above: takes (<2 x i64>, i32), yields <2 x i64>.
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+; Calls llvm.x86.sse2.psll.dq.bs on %a0 with the constant 7 and returns the result.
+; The FileCheck directive in the body expects a vpslldq line whose text after
+; the '#' marker shows seven `zero` entries followed by bytes 0 through 8 of
+; xmm0 (16 entries in total).
+; NOTE(review): this reads as a shift by seven whole bytes, so the ".bs"
+; variant presumably takes a byte count -- confirm against the intrinsic's definition.
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+ ; The {{.*#+}} part is a FileCheck regex: any text, then one or more '#'.
+ ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone @@ -551,21 +551,21 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) { ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { - ; CHECK: vpsrldq - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { - ; CHECK: vpsrldq - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} +
+
+; Calls llvm.x86.sse2.psrl.dq on %a0 with the constant 7 and returns the result.
+; The FileCheck directive in the body expects a vpsrldq line whose text after
+; the '#' marker lists all 16 bytes of xmm0 in their original order (0..15),
+; i.e. no byte is moved or replaced by zero. The sibling test
+; test_x86_sse2_psrl_dq_bs passes the same constant but expects zeroed bytes.
+; NOTE(review): presumably this variant takes its count in bits, so 7 amounts
+; to zero whole bytes -- confirm against the intrinsic's definition.
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+ ; The {{.*#+}} part is a FileCheck regex: any text, then one or more '#'.
+ ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+; Declaration of the intrinsic called above: takes (<2 x i64>, i32), yields <2 x i64>.
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+; Calls llvm.x86.sse2.psrl.dq.bs on %a0 with the constant 7 and returns the result.
+; The FileCheck directive in the body expects a vpsrldq line whose text after
+; the '#' marker shows bytes 7 through 15 of xmm0 followed by seven `zero`
+; entries (16 entries in total).
+; NOTE(review): this reads as a shift by seven whole bytes, so the ".bs"
+; variant presumably takes a byte count -- confirm against the intrinsic's definition.
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+ ; The {{.*#+}} part is a FileCheck regex: any text, then one or more '#'.
+ ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone @@ -818,18 +818,18 @@ declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK: vblendpd - %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res } -declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vblendps - %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { @@ -850,35 +850,35 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) { ; CHECK: vdppd - %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res } -declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone +declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind 
readnone define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vdpps - %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vinsertps - %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK: vmpsadbw - %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1] + %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } -declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone +declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) { @@ -899,10 +899,10 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK: vpblendw - %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1] + %res = call <8 x i16> 
@llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } -declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone +declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) { @@ -1770,18 +1770,18 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) { ; CHECK: vblendpd - %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1] + %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1] ret <4 x double> %res } -declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone +declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) { ; CHECK: vblendps - %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1] + %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1] ret <8 x float> %res } -declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone +declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { @@ -1950,10 +1950,10 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) { ; CHECK: vdpps - %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1] + %res = 
call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1] ret <8 x float> %res } -declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone +declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) { @@ -2309,7 +2309,7 @@ declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) noun define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) { ; CHECK: vpermilpd - %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 7) ; <<2 x double>> [#uses=1] + %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1] ret <2 x double> %res } declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnone @@ -2324,7 +2324,7 @@ declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind rea define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) { - ; CHECK: vpshufd + ; CHECK: vpermilps %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } diff --git a/test/CodeGen/X86/avx-movdup.ll b/test/CodeGen/X86/avx-movdup.ll deleted file mode 100644 index 42d84de..0000000 --- a/test/CodeGen/X86/avx-movdup.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: vmovsldup -define <8 x float> @movdupA(<8 x float> %src) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> - ret <8 x float> %shuffle.i -} - -; CHECK: vmovshdup -define <8 x float> @movdupB(<8 x float> %src) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, 
i32 5, i32 5, i32 7, i32 7> - ret <8 x float> %shuffle.i -} - -; CHECK: vmovsldup -define <4 x i64> @movdupC(<4 x i64> %src) nounwind uwtable readnone ssp { -entry: - %0 = bitcast <4 x i64> %src to <8 x float> - %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> - %1 = bitcast <8 x float> %shuffle.i to <4 x i64> - ret <4 x i64> %1 -} - -; CHECK: vmovshdup -define <4 x i64> @movdupD(<4 x i64> %src) nounwind uwtable readnone ssp { -entry: - %0 = bitcast <4 x i64> %src to <8 x float> - %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> - %1 = bitcast <8 x float> %shuffle.i to <4 x i64> - ret <4 x i64> %1 -} - diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll deleted file mode 100644 index fb2287f..0000000 --- a/test/CodeGen/X86/avx-sext.ll +++ /dev/null @@ -1,199 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSSE3 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=pentium4 | FileCheck %s -check-prefix=SSE2 - -define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { -; AVX: sext_8i16_to_8i32 -; AVX: vpmovsxwd - - %B = sext <8 x i16> %A to <8 x i32> - ret <8 x i32>%B -} - -define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { -; AVX: sext_4i32_to_4i64 -; AVX: vpmovsxdq - - %B = sext <4 x i32> %A to <4 x i64> - ret <4 x i64>%B -} - -; AVX: load_sext_test1 -; AVX: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test1 -; SSSE3: movq -; SSSE3: punpcklwd %xmm{{.*}}, %xmm{{.*}} -; SSSE3: psrad $16 -; SSSE3: ret - -; SSE2: load_sext_test1 -; SSE2: movq -; SSE2: punpcklwd %xmm{{.*}}, %xmm{{.*}} -; SSE2: psrad $16 -; SSE2: ret -define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) { - %X = 
load <4 x i16>* %ptr - %Y = sext <4 x i16> %X to <4 x i32> - ret <4 x i32>%Y -} - -; AVX: load_sext_test2 -; AVX: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test2 -; SSSE3: movd -; SSSE3: pshufb -; SSSE3: psrad $24 -; SSSE3: ret - -; SSE2: load_sext_test2 -; SSE2: movl -; SSE2: psrad $24 -; SSE2: ret -define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) { - %X = load <4 x i8>* %ptr - %Y = sext <4 x i8> %X to <4 x i32> - ret <4 x i32>%Y -} - -; AVX: load_sext_test3 -; AVX: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test3 -; SSSE3: movsbq -; SSSE3: movsbq -; SSSE3: punpcklqdq -; SSSE3: ret - -; SSE2: load_sext_test3 -; SSE2: movsbq -; SSE2: movsbq -; SSE2: punpcklqdq -; SSE2: ret -define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) { - %X = load <2 x i8>* %ptr - %Y = sext <2 x i8> %X to <2 x i64> - ret <2 x i64>%Y -} - -; AVX: load_sext_test4 -; AVX: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test4 -; SSSE3: movswq -; SSSE3: movswq -; SSSE3: punpcklqdq -; SSSE3: ret - -; SSE2: load_sext_test4 -; SSE2: movswq -; SSE2: movswq -; SSE2: punpcklqdq -; SSE2: ret -define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) { - %X = load <2 x i16>* %ptr - %Y = sext <2 x i16> %X to <2 x i64> - ret <2 x i64>%Y -} - -; AVX: load_sext_test5 -; AVX: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test5 -; SSSE3: movslq -; SSSE3: movslq -; SSSE3: punpcklqdq -; SSSE3: ret - -; SSE2: load_sext_test5 -; SSE2: movslq -; SSE2: movslq -; SSE2: punpcklqdq -; SSE2: ret -define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) { - %X = load <2 x i32>* %ptr - %Y = sext <2 x i32> %X to <2 x i64> - ret <2 x i64>%Y -} - -; AVX: load_sext_test6 -; AVX: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test6 -; SSSE3: movq -; SSSE3: punpcklbw -; SSSE3: psraw $8 -; SSSE3: ret - -; SSE2: load_sext_test6 -; SSE2: movq -; SSE2: punpcklbw -; SSE2: psraw $8 -; SSE2: ret -define <8 x i16> 
@load_sext_test6(<8 x i8> *%ptr) { - %X = load <8 x i8>* %ptr - %Y = sext <8 x i8> %X to <8 x i16> - ret <8 x i16>%Y -} - -; AVX: sext_4i1_to_4i64 -; AVX: vpslld $31 -; AVX: vpsrad $31 -; AVX: vpmovsxdq -; AVX: vpmovsxdq -; AVX: ret -define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { - %extmask = sext <4 x i1> %mask to <4 x i64> - ret <4 x i64> %extmask -} - -; AVX-LABEL: sext_16i8_to_16i16 -; AVX: vpmovsxbw -; AVX: vmovhlps -; AVX: vpmovsxbw -; AVX: ret -define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) { - %X = load <16 x i8>* %ptr - %Y = sext <16 x i8> %X to <16 x i16> - ret <16 x i16> %Y -} - -; AVX: sext_4i8_to_4i64 -; AVX: vpslld $24 -; AVX: vpsrad $24 -; AVX: vpmovsxdq -; AVX: vpmovsxdq -; AVX: ret -define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { - %extmask = sext <4 x i8> %mask to <4 x i64> - ret <4 x i64> %extmask -} - -; AVX: sext_4i8_to_4i64 -; AVX: vpmovsxbd -; AVX: vpmovsxdq -; AVX: vpmovsxdq -; AVX: ret -define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { - %X = load <4 x i8>* %ptr - %Y = sext <4 x i8> %X to <4 x i64> - ret <4 x i64>%Y -} - -; AVX: sext_4i16_to_4i64 -; AVX: vpmovsxwd -; AVX: vpmovsxdq -; AVX: vpmovsxdq -; AVX: ret -define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { - %X = load <4 x i16>* %ptr - %Y = sext <4 x i16> %X to <4 x i64> - ret <4 x i64>%Y -} diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll deleted file mode 100644 index 4a996d7..0000000 --- a/test/CodeGen/X86/avx-shuffle.ll +++ /dev/null @@ -1,336 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; PR11102 -define <4 x float> @test1(<4 x float> %a) nounwind { - %b = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 5, i32 undef, i32 undef> - ret <4 x float> %b -; CHECK-LABEL: test1: -;; TODO: This test could be improved by removing the xor instruction and -;; having vinsertps zero out the needed elements. 
-; CHECK: vxorps -; CHECK: vinsertps -} - -; rdar://10538417 -define <3 x i64> @test2(<2 x i64> %v) nounwind readnone { -; CHECK-LABEL: test2: -; CHECK: vinsertf128 - %1 = shufflevector <2 x i64> %v, <2 x i64> %v, <3 x i32> <i32 0, i32 1, i32 undef> - %2 = shufflevector <3 x i64> zeroinitializer, <3 x i64> %1, <3 x i32> <i32 3, i32 4, i32 2> - ret <3 x i64> %2 -; CHECK: ret -} - -define <4 x i64> @test3(<4 x i64> %a, <4 x i64> %b) nounwind { - %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 undef> - ret <4 x i64> %c -; CHECK-LABEL: test3: -; CHECK: vblendpd -; CHECK: ret -} - -define <8 x float> @test4(float %a) nounwind { - %b = insertelement <8 x float> zeroinitializer, float %a, i32 0 - ret <8 x float> %b -; CHECK-LABEL: test4: -; CHECK: vinsertf128 -} - -; rdar://10594409 -define <8 x float> @test5(float* nocapture %f) nounwind uwtable readonly ssp { -entry: - %0 = bitcast float* %f to <4 x float>* - %1 = load <4 x float>* %0, align 16 -; CHECK: test5 -; CHECK: vmovaps -; CHECK-NOT: vxorps -; CHECK-NOT: vinsertf128 - %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> - ret <8 x float> %shuffle.i -} - -define <4 x double> @test6(double* nocapture %d) nounwind uwtable readonly ssp { -entry: - %0 = bitcast double* %d to <2 x double>* - %1 = load <2 x double>* %0, align 16 -; CHECK: test6 -; CHECK: vmovaps -; CHECK-NOT: vxorps -; CHECK-NOT: vinsertf128 - %shuffle.i = shufflevector <2 x double> %1, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2> - ret <4 x double> %shuffle.i -} - -define <16 x i16> @test7(<4 x i16> %a) nounwind { -; CHECK: test7 - %b = shufflevector <4 x i16> %a, <4 x i16> undef, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef> -; CHECK: ret - ret <16 x i16> %b -} - -; CHECK: test8 -define void @test8() { -entry: - %0 = load <16 x i64> addrspace(1)* null, align 128 - %1 = shufflevector <16 x i64> <i64 undef, i64 undef, i64 0, i64 undef, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 undef, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> %0, <16 x i32> <i32 17, i32 18, i32 2, i32 undef, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 26> - %2 = shufflevector <16 x i64> %1, <16 x i64> %0, <16 x i32> <i32 0, i32 1, i32 2, i32 30, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 22, i32 20, i32 15> - store <16 x i64> %2, <16 x i64> addrspace(1)* undef, align 128 -; CHECK: ret - ret void -} - -; Extract a value from a shufflevector.. -define i32 @test9(<4 x i32> %a) nounwind { -; CHECK: test9 -; CHECK: vpextrd - %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4> - %r = extractelement <8 x i32> %b, i32 2 -; CHECK: ret - ret i32 %r -} - -; Extract a value which is the result of an undef mask. 
-define i32 @test10(<4 x i32> %a) nounwind { -; CHECK: @test10 -; CHECK-NOT: {{^[^#]*[a-z]}} -; CHECK: ret - %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %r = extractelement <8 x i32> %b, i32 2 - ret i32 %r -} - -define <4 x float> @test11(<4 x float> %a) nounwind { -; CHECK: test11 -; CHECK: vpshufd $27 - %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> - ret <4 x float> %tmp1 -} - -define <4 x float> @test12(<4 x float>* %a) nounwind { -; CHECK: test12 -; CHECK: vpshufd - %tmp0 = load <4 x float>* %a - %tmp1 = shufflevector <4 x float> %tmp0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> - ret <4 x float> %tmp1 -} - -define <4 x i32> @test13(<4 x i32> %a) nounwind { -; CHECK: test13 -; CHECK: vpshufd $27 - %tmp1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test14(<4 x i32>* %a) nounwind { -; CHECK: test14 -; CHECK: vpshufd $27, ( - %tmp0 = load <4 x i32>* %a - %tmp1 = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> - ret <4 x i32> %tmp1 -} - -; CHECK: test15 -; CHECK: vpshufd $8 -; CHECK: ret -define <4 x i32> @test15(<2 x i32>%x) nounwind readnone { - %x1 = shufflevector <2 x i32> %x, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> - ret <4 x i32>%x1 -} - -; rdar://10974078 -define <8 x float> @test16(float* nocapture %f) nounwind uwtable readonly ssp { -entry: - %0 = bitcast float* %f to <4 x float>* - %1 = load <4 x float>* %0, align 8 -; CHECK: test16 -; CHECK: vmovups -; CHECK-NOT: vxorps -; CHECK-NOT: vinsertf128 - %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> - ret <8 x float> %shuffle.i -} - -; PR12413 -; CHECK: shuf1 -; CHECK: 
vpshufb -; CHECK: vpshufb -; CHECK: vpshufb -; CHECK: vpshufb -define <32 x i8> @shuf1(<32 x i8> %inval1, <32 x i8> %inval2) { -entry: - %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62> - ret <32 x i8> %0 -} - -; handle the case where only half of the 256-bits is splittable -; CHECK: shuf2 -; CHECK: vpshufb -; CHECK: vpshufb -; CHECK: vpextrb -; CHECK: vpextrb -define <32 x i8> @shuf2(<32 x i8> %inval1, <32 x i8> %inval2) { -entry: - %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 31, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62> - ret <32 x i8> %0 -} - -; CHECK: blend1 -; CHECK: vblendps -; CHECK: ret -define <4 x i32> @blend1(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline { - %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x i32> %t -} - -; CHECK: blend2 -; CHECK: vblendps -; CHECK: ret -define <4 x i32> @blend2(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline { - %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x i32> %t -} - -; CHECK: blend2a -; CHECK: vblendps -; CHECK: ret -define <4 x float> @blend2a(<4 x float> %a, <4 x float> %b) nounwind alwaysinline { - %t = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x float> %t -} - -; CHECK: blend3 -; CHECK-NOT: vblendps -; CHECK: ret -define <4 x i32> @blend3(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline { - %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 2, 
i32 7> - ret <4 x i32> %t -} - -; CHECK: blend4 -; CHECK: vblendpd -; CHECK: ret -define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline { - %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x i64> %t -} - -; CHECK: narrow -; CHECK: vpermilps -; CHECK: ret -define <16 x i16> @narrow(<16 x i16> %a) nounwind alwaysinline { - %t = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 undef, i32 14, i32 15, i32 undef, i32 undef> - ret <16 x i16> %t -} - -;CHECK-LABEL: test17: -;CHECK-NOT: vinsertf128 -;CHECK: ret -define <8 x float> @test17(<4 x float> %y) { - %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x float> %x -} - -; CHECK: test18 -; CHECK: vmovshdup -; CHECK: vblendps -; CHECK: ret -define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind { - %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> - ret <8 x float>%S -} - -; CHECK: test19 -; CHECK: vmovsldup -; CHECK: vblendps -; CHECK: ret -define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind { - %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> - ret <8 x float>%S -} - -; rdar://12684358 -; Make sure loads happen before stores. 
-; CHECK: swap8doubles -; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}} -; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}} -; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}} -; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}} -; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}} -; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}} -; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi) -; CHECK: vextractf128 -; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi) -; CHECK: vextractf128 -; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi) -; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi) -define void @swap8doubles(double* nocapture %A, double* nocapture %C) nounwind uwtable ssp { -entry: - %add.ptr = getelementptr inbounds double* %A, i64 2 - %v.i = bitcast double* %A to <2 x double>* - %0 = load <2 x double>* %v.i, align 1 - %shuffle.i.i = shufflevector <2 x double> %0, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2> - %v1.i = bitcast double* %add.ptr to <2 x double>* - %1 = load <2 x double>* %v1.i, align 1 - %2 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i, <2 x double> %1, i8 1) nounwind - %add.ptr1 = getelementptr inbounds double* %A, i64 6 - %add.ptr2 = getelementptr inbounds double* %A, i64 4 - %v.i27 = bitcast double* %add.ptr2 to <2 x double>* - %3 = load <2 x double>* %v.i27, align 1 - %shuffle.i.i28 = shufflevector <2 x double> %3, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2> - %v1.i29 = bitcast double* %add.ptr1 to <2 x double>* - %4 = load <2 x double>* %v1.i29, align 1 - %5 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i28, <2 x double> %4, i8 1) nounwind - %6 = bitcast double* %C to <4 x double>* - %7 = load <4 x double>* %6, align 32 - %add.ptr5 = getelementptr inbounds double* %C, i64 4 - %8 = bitcast double* %add.ptr5 to <4 x double>* - %9 = load <4 x double>* %8, align 32 - %shuffle.i26 = 
shufflevector <4 x double> %7, <4 x double> undef, <2 x i32> <i32 0, i32 1> - %10 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %7, i8 1) - %shuffle.i = shufflevector <4 x double> %9, <4 x double> undef, <2 x i32> <i32 0, i32 1> - %11 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %9, i8 1) - store <2 x double> %shuffle.i26, <2 x double>* %v.i, align 16 - store <2 x double> %10, <2 x double>* %v1.i, align 16 - store <2 x double> %shuffle.i, <2 x double>* %v.i27, align 16 - store <2 x double> %11, <2 x double>* %v1.i29, align 16 - store <4 x double> %2, <4 x double>* %6, align 32 - store <4 x double> %5, <4 x double>* %8, align 32 - ret void -} -declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone -declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone - -; this test case just should not fail -define void @test20() { - %a0 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double 0.000000e+00, i32 2 - store <3 x double> %a0, <3 x double>* undef, align 1 - %a1 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double undef, i32 2 - store <3 x double> %a1, <3 x double>* undef, align 1 - ret void -} - -define <2 x i64> @test_insert_64_zext(<2 x i64> %i) { -; CHECK-LABEL: test_insert_64_zext -; CHECK-NOT: xor -; CHECK: vmovq - %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2> - ret <2 x i64> %1 -} - -;; Ensure we don't use insertps from non v4x32 vectors. -;; On SSE4.1 it works because bigger vectors use more than 1 register. -;; On AVX they get passed in a single register. -;; FIXME: We could probably optimize this case, if we're only using the -;; first 4 indices. 
-define <4 x i32> @insert_from_diff_size(<8 x i32> %x) { -; CHECK-LABEL: insert_from_diff_size: -; CHECK-NOT: insertps -; CHECK: ret - %vecext = extractelement <8 x i32> %x, i32 0 - %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 - %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 - %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 - %a.0 = extractelement <8 x i32> %x, i32 0 - %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a.0, i32 3 - ret <4 x i32> %vecinit3 -} diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll index b1b2f8b..98c1645 100644 --- a/test/CodeGen/X86/avx-splat.ll +++ b/test/CodeGen/X86/avx-splat.ll @@ -1,9 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s -; CHECK: vpunpcklbw %xmm -; CHECK-NEXT: vpunpckhbw %xmm -; CHECK-NEXT: vpshufd $85 +; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; CHECK-NEXT: vinsertf128 $1 define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp { entry: @@ -11,8 +9,7 @@ entry: ret <32 x i8> %shuffle } -; CHECK: vpunpckhwd %xmm -; CHECK-NEXT: vpshufd $85 +; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11] ; CHECK-NEXT: vinsertf128 $1 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp { entry: @@ -21,7 +18,7 @@ entry: } ; CHECK: vmovq -; CHECK-NEXT: vmovlhps %xmm +; CHECK-NEXT: vunpcklpd %xmm ; CHECK-NEXT: vinsertf128 $1 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp { entry: @@ -32,7 +29,7 @@ entry: ret <4 x i64> %vecinit6.i } -; CHECK: vpermilpd $0 +; CHECK: vunpcklpd %xmm ; CHECK-NEXT: vinsertf128 $1 define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp { entry: @@ -72,7 +69,7 @@ __load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_ex ret <8 x float> %load_broadcast12281250 } -; CHECK: vpshufd $0 +; CHECK: vpermilps $4 ; CHECK-NEXT: vinsertf128 $1 define <8 x float> @funcF(i32 %val) nounwind 
{ %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6 @@ -81,7 +78,7 @@ define <8 x float> @funcF(i32 %val) nounwind { ret <8 x float> %tmp } -; CHECK: vpshufd $0 +; CHECK: vpermilps $0 ; CHECK-NEXT: vinsertf128 $1 define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp { entry: @@ -90,7 +87,7 @@ entry: } ; CHECK: vextractf128 $1 -; CHECK-NEXT: vpshufd +; CHECK-NEXT: vpermilps $85 ; CHECK-NEXT: vinsertf128 $1 define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp { entry: diff --git a/test/CodeGen/X86/avx-vmovddup.ll b/test/CodeGen/X86/avx-vmovddup.ll deleted file mode 100644 index 1c56fe2..0000000 --- a/test/CodeGen/X86/avx-vmovddup.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: vmovddup %ymm -define <4 x i64> @A(<4 x i64> %a) { - %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> - ret <4 x i64> %c -} - -; CHECK: vmovddup (% -define <4 x i64> @B(<4 x i64>* %ptr) { - %a = load <4 x i64>* %ptr - %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> - ret <4 x i64> %c -} diff --git a/test/CodeGen/X86/avx-vperm2f128.ll b/test/CodeGen/X86/avx-vperm2f128.ll deleted file mode 100644 index c20775b..0000000 --- a/test/CodeGen/X86/avx-vperm2f128.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: _A -; CHECK: vperm2f128 $1 -define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> - ret <8 x float> %shuffle -} - -; CHECK: _B -; CHECK: vblendps $240 -define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
i32 12, i32 13, i32 14, i32 15> - ret <8 x float> %shuffle -} - -; CHECK: _C -; CHECK: vperm2f128 $0 -define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> - ret <8 x float> %shuffle -} - -; CHECK: _D -; CHECK: vperm2f128 $17 -define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> - ret <8 x float> %shuffle -} - -; CHECK: _E -; CHECK: vperm2f128 $17 -define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> - ret <32 x i8> %shuffle -} - -; CHECK: _E2 -; CHECK: vperm2f128 $3 -define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x i64> %shuffle -} - -;;;; Cases with undef indicies mixed in the mask - -; CHECK: _F -; CHECK: vperm2f128 $33 -define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11> - ret <8 x float> %shuffle -} - -;;;; Cases we must not select vperm2f128 - -; CHECK: _G -; CHECK-NOT: vperm2f128 -define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 
undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15> - ret <8 x float> %shuffle -} diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll new file mode 100644 index 0000000..a103405 --- /dev/null +++ b/test/CodeGen/X86/avx-vperm2x128.ll @@ -0,0 +1,202 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 + +define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: A: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> + ret <8 x float> %shuffle +} + +define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: B: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> + ret <8 x float> %shuffle +} + +define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: C: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + ret <8 x float> %shuffle +} + +define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: D: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> + 
ret <8 x float> %shuffle +} + +define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: E: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <32 x i8> %shuffle +} + +define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: E2: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i64> %shuffle +} + +define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: Ei: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: Ei: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> + %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 
28, i32 29, i32 30, i32 31> + ret <32 x i8> %shuffle +} + +define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E2i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: E2i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> + %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i64> %shuffle +} + +define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E3i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: E3i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15> + ret <8 x i32> %shuffle +} + +define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E4i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: E4i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, 
%xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <16 x i16> %shuffle +} + +define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E5i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovaps (%rsi), %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: E5i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +entry: + %c = load <16 x i16>* %a + %d = load <16 x i16>* %b + %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <16 x i16> %shuffle +} + +;;;; Cases with undef indicies mixed in the mask + +define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: F: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1,0,1] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11> + ret <8 x float> %shuffle +} + +;;;; Cases we must not select vperm2f128 + +define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; 
AVX1-LABEL: G: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: G: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX2-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15> + ret <8 x float> %shuffle +} diff --git a/test/CodeGen/X86/avx-vpermil.ll b/test/CodeGen/X86/avx-vpermil.ll deleted file mode 100644 index b7f8d72..0000000 --- a/test/CodeGen/X86/avx-vpermil.ll +++ /dev/null @@ -1,54 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: vpermilps -define <8 x float> @funcA(<8 x float> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5> - ret <8 x float> %shuffle -} - -; CHECK: vpermilpd -define <4 x double> @funcB(<4 x double> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3> - ret <4 x double> %shuffle -} - -; CHECK: vpermilps -define <8 x i32> @funcC(<8 x i32> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5> - ret <8 x i32> %shuffle -} - -; CHECK: vpermilpd -define <4 x i64> @funcD(<4 x i64> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3> - ret <4 x i64> %shuffle -} - -; CHECK: vpermilpd -define <4 x i64> 
@funcQ(<4 x i64>* %a) nounwind uwtable readnone ssp { -entry: - %a2 = load <4 x i64>* %a - %shuffle = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3> - ret <4 x i64> %shuffle -} - -; vpermil should match masks like this: <u,3,1,2,4,u,5,6>. Check that the -; target specific mask was correctly generated. -; CHECK: vpermilps $-100 -define <8 x float> @funcE(<8 x float> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 8, i32 3, i32 1, i32 2, i32 4, i32 8, i32 5, i32 6> - ret <8 x float> %shuffle -} - -; CHECK: palignr $8 -; CHECK: palignr $8 -define <8 x float> @funcF(<8 x float> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> - ret <8 x float> %shuffle -} diff --git a/test/CodeGen/X86/avx-vshufp.ll b/test/CodeGen/X86/avx-vshufp.ll deleted file mode 100644 index ad3dbc1..0000000 --- a/test/CodeGen/X86/avx-vshufp.ll +++ /dev/null @@ -1,157 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: vshufps $-53, %ymm -define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15> - ret <8 x float> %shuffle -} - -; CHECK: vshufps $-53, (%{{.*}}), %ymm -define <8 x float> @A2(<8 x float>* %a, <8 x float>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <8 x float>* %a - %b2 = load <8 x float>* %b - %shuffle = shufflevector <8 x float> %a2, <8 x float> %b2, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15> - ret <8 x float> %shuffle -} - -; CHECK: vshufps $-53, %ymm -define <8 x i32> @A3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x i32> %a, <8 x 
i32> %b, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15> - ret <8 x i32> %shuffle -} - -; CHECK: vshufps $-53, (%{{.*}}), %ymm -define <8 x i32> @A4(<8 x i32>* %a, <8 x i32>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <8 x i32>* %a - %b2 = load <8 x i32>* %b - %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b2, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15> - ret <8 x i32> %shuffle -} - -; CHECK: vblendpd $10, %ymm -define <4 x double> @B(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x double> %shuffle -} - -; CHECK: vblendpd $10, (%{{.*}}), %ymm -define <4 x double> @B2(<4 x double>* %a, <4 x double>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <4 x double>* %a - %b2 = load <4 x double>* %b - %shuffle = shufflevector <4 x double> %a2, <4 x double> %b2, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x double> %shuffle -} - -; CHECK: vblendpd $10, %ymm -define <4 x i64> @B3(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x i64> %shuffle -} - -; CHECK: vblendpd $10, (%{{.*}}), %ymm -define <4 x i64> @B4(<4 x i64>* %a, <4 x i64>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <4 x i64>* %a - %b2 = load <4 x i64>* %b - %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b2, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x i64> %shuffle -} - -; CHECK: vshufps $-53, %ymm -define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 undef, i32 undef, i32 11, i32 undef, i32 6, i32 12, i32 undef> - ret <8 x float> %shuffle -} - -; CHECK: vblendpd $2, %ymm -define <4 x double> @D(<4 x double> %a, <4 x double> %b) nounwind 
uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 undef> - ret <4 x double> %shuffle -} - -; CHECK: vshufps $-55, %ymm -define <8 x float> @E(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 10, i32 0, i32 3, i32 13, i32 14, i32 4, i32 7> - ret <8 x float> %shuffle -} - -; CHECK: vshufpd $8, %ymm -define <4 x double> @F(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 7> - ret <4 x double> %shuffle -} - -; CHECK: vshufps $-53, %xmm -define <4 x float> @A128(<4 x float> %a, <4 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 4, i32 7> - ret <4 x float> %shuffle -} - -; CHECK: vshufps $-53, (%{{.*}}), %xmm -define <4 x float> @A2128(<4 x float>* %a, <4 x float>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <4 x float>* %a - %b2 = load <4 x float>* %b - %shuffle = shufflevector <4 x float> %a2, <4 x float> %b2, <4 x i32> <i32 3, i32 2, i32 4, i32 7> - ret <4 x float> %shuffle -} - -; CHECK: vshufps $-53, %xmm -define <4 x i32> @A3128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 4, i32 7> - ret <4 x i32> %shuffle -} - -; CHECK: vshufps $-53, (%{{.*}}), %xmm -define <4 x i32> @A4128(<4 x i32>* %a, <4 x i32>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <4 x i32>* %a - %b2 = load <4 x i32>* %b - %shuffle = shufflevector <4 x i32> %a2, <4 x i32> %b2, <4 x i32> <i32 3, i32 2, i32 4, i32 7> - ret <4 x i32> %shuffle -} - -; CHECK: vshufpd $1, %xmm -define <2 x double> @B128(<2 x double> %a, <2 x double> %b) nounwind uwtable readnone ssp { -entry: - %shuffle 
= shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2> - ret <2 x double> %shuffle -} - -; CHECK: vshufpd $1, (%{{.*}}), %xmm -define <2 x double> @B2128(<2 x double>* %a, <2 x double>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <2 x double>* %a - %b2 = load <2 x double>* %b - %shuffle = shufflevector <2 x double> %a2, <2 x double> %b2, <2 x i32> <i32 1, i32 2> - ret <2 x double> %shuffle -} - -; CHECK: vshufpd $1, %xmm -define <2 x i64> @B3128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2> - ret <2 x i64> %shuffle -} - -; CHECK: vshufpd $1, (%{{.*}}), %xmm -define <2 x i64> @B4128(<2 x i64>* %a, <2 x i64>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <2 x i64>* %a - %b2 = load <2 x i64>* %b - %shuffle = shufflevector <2 x i64> %a2, <2 x i64> %b2, <2 x i32> <i32 1, i32 2> - ret <2 x i64> %shuffle -} diff --git a/test/CodeGen/X86/avx-zext.ll b/test/CodeGen/X86/avx-zext.ll deleted file mode 100644 index 7511746..0000000 --- a/test/CodeGen/X86/avx-zext.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { -;CHECK-LABEL: zext_8i16_to_8i32: -;CHECK: vpunpckhwd -;CHECK: ret - - %B = zext <8 x i16> %A to <8 x i32> - ret <8 x i32>%B -} - -define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { -;CHECK-LABEL: zext_4i32_to_4i64: -;CHECK: vpunpckhdq -;CHECK: ret - - %B = zext <4 x i32> %A to <4 x i64> - ret <4 x i64>%B -} - -define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) { -;CHECK-LABEL: zext_8i8_to_8i32: -;CHECK: vpunpckhwd -;CHECK: vpmovzxwd -;CHECK: vinsertf128 -;CHECK: ret - %t = zext <8 x i8> %z to <8 x i32> - ret <8 x i32> %t -} - -; PR17654 -define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) { -; CHECK-LABEL: zext_16i8_to_16i16: -; CHECK: vpxor 
-; CHECK: vpunpckhbw -; CHECK: vpunpcklbw -; CHECK: vinsertf128 -; CHECK: ret - %t = zext <16 x i8> %z to <16 x i16> - ret <16 x i16> %t -} diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll index 6069c14..cba6d98 100644 --- a/test/CodeGen/X86/avx.ll +++ b/test/CodeGen/X86/avx.ll @@ -60,7 +60,7 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa ; X32: movl 8(%esp), %ecx ; CHECK-NOT: mov ;; Try to match a bit more of the instr, since we need the load's offset. -; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), % +; CHECK: vinsertps $-64, 12(%{{...}},%{{...}}), % ; CHECK-NEXT: ret %1 = getelementptr inbounds <4 x float>* %pb, i64 %index %2 = load <4 x float>* %1, align 16 diff --git a/test/CodeGen/X86/avx1-stack-reload-folding.ll b/test/CodeGen/X86/avx1-stack-reload-folding.ll new file mode 100644 index 0000000..2e669b0 --- /dev/null +++ b/test/CodeGen/X86/avx1-stack-reload-folding.ll @@ -0,0 +1,68 @@ +; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests - we use the 'big vectors' pattern to guarantee spilling to stack.
+;
+; Many of these tests are primarily to check memory folding with specific instructions. Using a basic
+; load/cvt/store pattern to test for this would mean that it wouldn't be the memory folding code that's
+; being tested - the load-execute version of the instruction from the tables would be matched instead.
+
+define void @stack_fold_vmulpd(<64 x double>* %a, <64 x double>* %b, <64 x double>* %c) { ; stores (a+b)*(a-b), computed on <64 x double>, to %c
+ ;CHECK-LABEL: stack_fold_vmulpd
+ ;CHECK: vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ; The directive above expects a vmulpd whose memory operand is an %rsp-relative slot marked as a 32-byte folded reload.
+ %1 = load <64 x double>* %a
+ %2 = load <64 x double>* %b
+ %3 = fadd <64 x double> %1, %2 ; sum of the two loaded vectors
+ %4 = fsub <64 x double> %1, %2 ; difference of the same two vectors
+ %5 = fmul <64 x double> %3, %4 ; the only multiply in the function, i.e. the operation the expected vmulpd comes from
+ store <64 x double> %5, <64 x double>* %c
+ ret void
+}
+
+define void @stack_fold_cvtdq2ps(<128 x i32>* %a, <128 x i32>* %b, <128 x float>* %c) { ; stores float(a&b) + float(a^b), computed on 128-element vectors, to %c
+ ;CHECK-LABEL: stack_fold_cvtdq2ps
+ ;CHECK: vcvtdq2ps {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ; The directive above expects a vcvtdq2ps reading its source from an %rsp-relative slot marked as a 32-byte folded reload.
+ %1 = load <128 x i32>* %a
+ %2 = load <128 x i32>* %b
+ %3 = and <128 x i32> %1, %2 ; first integer value to convert
+ %4 = xor <128 x i32> %1, %2 ; second integer value to convert, from the same two loads
+ %5 = sitofp <128 x i32> %3 to <128 x float> ; signed int -> float conversion
+ %6 = sitofp <128 x i32> %4 to <128 x float> ; second conversion of the same kind
+ %7 = fadd <128 x float> %5, %6 ; combines both converted values so each one feeds the stored result
+ store <128 x float> %7, <128 x float>* %c
+ ret void
+}
+
+define void @stack_fold_cvttpd2dq(<64 x double>* %a, <64 x double>* %b, <64 x i32>* %c) #0 { ; NOTE(review): no definition of attribute group #0 is visible in this file's hunk - confirm it is intended
+ ;CHECK-LABEL: stack_fold_cvttpd2dq
+ ;CHECK: vcvttpd2dqy {{[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ; Stores int(a+b) | int(a-b) to %c; the directive above expects a vcvttpd2dqy with an %rsp-relative source (32-byte folded reload) and an xmm destination.
+ %1 = load <64 x double>* %a
+ %2 = load <64 x double>* %b
+ %3 = fadd <64 x double> %1, %2 ; first double value to convert
+ %4 = fsub <64 x double> %1, %2 ; second double value to convert, from the same two loads
+ %5 = fptosi <64 x double> %3 to <64 x i32> ; double -> signed i32 conversion
+ %6 = fptosi <64 x double> %4 to <64 x i32> ; second conversion of the same kind
+ %7 = or <64 x i32> %5, %6 ; combines both converted values so each one feeds the stored result
+ store <64 x i32> %7, <64 x i32>* %c
+ ret void
+}
+
+define void @stack_fold_cvttps2dq(<128 x float>* %a, <128 x float>* %b, <128 x i32>* %c) #0 { ; NOTE(review): no definition of attribute group #0 is visible in this file's hunk - confirm it is intended
+ ;CHECK-LABEL: stack_fold_cvttps2dq
+ ;CHECK: vcvttps2dq {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ; Stores int(a+b) | int(a-b) to %c; the directive above expects a vcvttps2dq reading from an %rsp-relative slot marked as a 32-byte folded reload.
+ %1 = load <128 x float>* %a
+ %2 = load <128 x float>* %b
+ %3 = fadd <128 x float> %1, %2 ; first float value to convert
+ %4 = fsub <128 x float> %1, %2 ; second float value to convert, from the same two loads
+ %5 = fptosi <128 x float> %3 to <128 x i32> ; float -> signed i32 conversion
+ %6 = fptosi <128 x float> %4 to <128 x i32> ; second conversion of the same kind
+ %7 = or <128 x i32> %5, %6 ; combines both converted values so each one feeds the stored result
+ store <128 x i32> %7, <128 x i32>* %c
+ ret void
+}
diff --git a/test/CodeGen/X86/avx2-blend.ll b/test/CodeGen/X86/avx2-blend.ll deleted file mode 100644 index b02442b..0000000 --- a/test/CodeGen/X86/avx2-blend.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s - -define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) { -; CHECK-LABEL: constant_pblendvb_avx2: -; CHECK: vmovdqa -; CHECK: vpblendvb - %1 = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd - ret <32 x i8> %1 -} - -declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll new file mode 100644 index 0000000..ac2c73b --- /dev/null +++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=core-avx2 -mattr=avx2 | FileCheck %s + +define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) { + ; CHECK: vpblendw + %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone + + +define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { + ; CHECK: vpblendd + %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone + + +define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { + ; CHECK: vpblendd + %res = call 
<8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone + + +define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) { + ; CHECK: vmpsadbw + %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone + diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index ab3d591..84b22b7 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -158,21 +158,21 @@ define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) { ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) { - ; CHECK: vpslldq - %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) { - ; CHECK: vpslldq - %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} +
+
+define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) { ; calls llvm.x86.avx2.psll.dq on %a0 with immediate 7 and returns the result
+ ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+ %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res ; NOTE(review): the expected shuffle lists bytes 0-31 in order (no movement); presumably the immediate is a bit count and 7 bits is under one byte - confirm
+}
+declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) { ; calls llvm.x86.avx2.psll.dq.bs on %a0 with immediate 7 and returns the result
+ ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+ %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res ; expected shuffle: each 16-byte half gets 7 zero bytes followed by that half's first 9 source bytes
+}
declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone @@ -254,21 +254,21 @@ define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) { ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) { - ; CHECK: vpsrldq - %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) { - ; CHECK: vpsrldq - %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} +
+
+define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) { ; calls llvm.x86.avx2.psrl.dq on %a0 with immediate 7 and returns the result
+ ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+ %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res ; NOTE(review): the expected shuffle lists bytes 0-31 in order (no movement); presumably the immediate is a bit count and 7 bits is under one byte - confirm
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) { ; calls llvm.x86.avx2.psrl.dq.bs on %a0 with immediate 7 and returns the result
+ ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+ %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res ; expected shuffle: each 16-byte half keeps its source bytes 7-15 and is then filled with 7 zero bytes
+}
declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone @@ -475,10 +475,10 @@ declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK: vmpsadbw - %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1] + %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } -declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone +declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) { @@ -499,10 +499,10 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK: vpblendw - %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1] + %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } -declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone +declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) { @@ -706,18 +706,18 @@ declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind re define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK: vpblendd - %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1] + %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone +declare <4 x i32> 
@llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK: vpblendd - %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1] + %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone +declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) { diff --git a/test/CodeGen/X86/avx2-palignr.ll b/test/CodeGen/X86/avx2-palignr.ll deleted file mode 100644 index 83573dc..0000000 --- a/test/CodeGen/X86/avx2-palignr.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s - -define <8 x i32> @test1(<8 x i32> %A, <8 x i32> %B) nounwind { -; CHECK-LABEL: test1: -; CHECK: vpalignr $4 - %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12> - ret <8 x i32> %C -} - -define <8 x i32> @test2(<8 x i32> %A, <8 x i32> %B) nounwind { -; CHECK-LABEL: test2: -; CHECK: vpalignr $4 - %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 undef, i32 12> - ret <8 x i32> %C -} - -define <8 x i32> @test3(<8 x i32> %A, <8 x i32> %B) nounwind { -; CHECK-LABEL: test3: -; CHECK: vpalignr $4 - %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 undef, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12> - ret <8 x i32> %C -} -; -define <8 x i32> @test4(<8 x i32> %A, <8 x i32> %B) nounwind { -; CHECK-LABEL: test4: -; CHECK: vpalignr $8 - %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 10, i32 11, i32 undef, i32 1, i32 14, i32 15, i32 4, i32 5> - ret <8 x i32> %C -} - -define <16 x i16> @test5(<16 x i16> 
%A, <16 x i16> %B) nounwind { -; CHECK-LABEL: test5: -; CHECK: vpalignr $6 - %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 3, i32 4, i32 undef, i32 6, i32 7, i32 16, i32 17, i32 18, i32 11, i32 12, i32 13, i32 undef, i32 15, i32 24, i32 25, i32 26> - ret <16 x i16> %C -} - -define <16 x i16> @test6(<16 x i16> %A, <16 x i16> %B) nounwind { -; CHECK-LABEL: test6: -; CHECK: vpalignr $6 - %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 11, i32 12, i32 13, i32 undef, i32 15, i32 24, i32 25, i32 26> - ret <16 x i16> %C -} - -define <16 x i16> @test7(<16 x i16> %A, <16 x i16> %B) nounwind { -; CHECK-LABEL: test7: -; CHECK: vpalignr $6 - %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <16 x i16> %C -} - -define <32 x i8> @test8(<32 x i8> %A, <32 x i8> %B) nounwind { -; CHECK-LABEL: test8: -; CHECK: vpalignr $5 - %C = shufflevector <32 x i8> %A, <32 x i8> %B, <32 x i32> <i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52> - ret <32 x i8> %C -} diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll deleted file mode 100644 index 185b989..0000000 --- a/test/CodeGen/X86/avx2-shuffle.ll +++ /dev/null @@ -1,127 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s - -; Make sure that we don't match this shuffle using the vpblendw YMM instruction. -; The mask for the vpblendw instruction needs to be identical for both halves -; of the YMM. Need to use two vpblendw instructions. 
- -; CHECK: vpblendw_test1 -; mask = 10010110,b = 150,d -; CHECK: vpblendw $150, %ymm -; CHECK: ret -define <16 x i16> @vpblendw_test1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline { - %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 23, - i32 8, i32 25, i32 26, i32 11, i32 28, i32 13, i32 14, i32 31> - ret <16 x i16> %t -} - -; CHECK: vpblendw_test2 -; mask1 = 00010110 = 22 -; mask2 = 10000000 = 128 -; CHECK: vpblendw $128, %xmm -; CHECK: vpblendw $22, %xmm -; CHECK: vinserti128 -; CHECK: ret -define <16 x i16> @vpblendw_test2(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline { - %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, - i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31> - ret <16 x i16> %t -} - -; CHECK: blend_test1 -; CHECK: vpblendd -; CHECK: ret -define <8 x i32> @blend_test1(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline { - %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7> - ret <8 x i32> %t -} - -; CHECK: blend_test2 -; CHECK: vpblendd -; CHECK: ret -define <8 x i32> @blend_test2(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline { - %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7> - ret <8 x i32> %t -} - - -; CHECK: blend_test3 -; CHECK: vblendps -; CHECK: ret -define <8 x float> @blend_test3(<8 x float> %a, <8 x float> %b) nounwind alwaysinline { - %t = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7> - ret <8 x float> %t -} - -; CHECK: blend_test4 -; CHECK: vblendpd -; CHECK: ret -define <4 x i64> @blend_test4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline { - %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3> - ret <4 x i64> %t -} - -;; 2 tests for 
shufflevectors that optimize to blend + immediate -; CHECK-LABEL: @blend_test5 -; CHECK: vpblendd $10, %xmm1, %xmm0, %xmm0 -; CHECK: ret -define <4 x i32> @blend_test5(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x i32> %1 -} - -; CHECK-LABEL: @blend_test6 -; CHECK: vpblendw $134, %ymm1, %ymm0, %ymm0 -; CHECK: ret -define <16 x i16> @blend_test6(<16 x i16> %a, <16 x i16> %b) { - %1 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 23, - i32 8, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 31> - ret <16 x i16> %1 -} - -; CHECK: vpshufhw $27, %ymm -define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12> - ret <16 x i16> %shuffle.i -} - -; CHECK: vpshuflw $27, %ymm -define <16 x i16> @vpshuflw(<16 x i16> %src1) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15> - ret <16 x i16> %shuffle.i -} - -; CHECK: vpshufb_test -; CHECK: vpshufb {{.*\(%r.*}}, %ymm -; CHECK: ret -define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind { - %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, - i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18> - ret <32 x i8>%S -} - -; CHECK: vpshufb1_test -; CHECK: vpshufb {{.*\(%r.*}}, %ymm -; CHECK: ret -define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind { - %S = shufflevector <32 x 
i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, - i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18> - ret <32 x i8>%S -} - - -; CHECK: vpshufb2_test -; CHECK: vpshufb {{.*\(%r.*}}, %ymm -; CHECK: ret -define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind { - %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, - i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18> - ret <32 x i8>%S -} diff --git a/test/CodeGen/X86/avx2-unpack.ll b/test/CodeGen/X86/avx2-unpack.ll deleted file mode 100644 index 6d17443..0000000 --- a/test/CodeGen/X86/avx2-unpack.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s - -; CHECK: vpunpckhdq -define <8 x i32> @unpackhidq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> - ret <8 x i32> %shuffle.i -} - -; CHECK: vpunpckhqdq -define <4 x i64> @unpackhiqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> - ret <4 x i64> %shuffle.i -} - -; CHECK: vpunpckldq -define <8 x i32> @unpacklodq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> - ret <8 x i32> %shuffle.i -} - -; CHECK: vpunpcklqdq -define <4 x i64> @unpacklqdq1(<4 
x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> - ret <4 x i64> %shuffle.i -} - -; CHECK: vpunpckhwd -define <16 x i16> @unpackhwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> - ret <16 x i16> %shuffle.i -} - -; CHECK: vpunpcklwd -define <16 x i16> @unpacklwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> - ret <16 x i16> %shuffle.i -} - -; CHECK: vpunpckhbw -define <32 x i8> @unpackhbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> - ret <32 x i8> %shuffle.i -} - -; CHECK: vpunpcklbw -define <32 x i8> @unpacklbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> - ret <32 x i8> %shuffle.i -} - -; CHECK: vpunpckhdq -define <8 x i32> @unpackhidq1_undef(<8 x i32> %src1) nounwind uwtable readnone ssp { 
-entry: - %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> - ret <8 x i32> %shuffle.i -} - -; CHECK: vpunpckhqdq -define <4 x i64> @unpackhiqdq1_undef(<4 x i64> %src1) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> - ret <4 x i64> %shuffle.i -} - -; CHECK: vpunpckhwd -define <16 x i16> @unpackhwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> - ret <16 x i16> %shuffle.i -} - -; CHECK: vpunpcklwd -define <16 x i16> @unpacklwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> - ret <16 x i16> %shuffle.i -} - diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll index 66f586d..924c06e 100644 --- a/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/test/CodeGen/X86/avx2-vbroadcast.ll @@ -317,7 +317,7 @@ define <4 x double> @_inreg4xdouble(<4 x double> %a) { } ;CHECK-LABEL: _inreg2xdouble: -;CHECK: vpbroadcastq +;CHECK: vunpcklpd ;CHECK: ret define <2 x double> @_inreg2xdouble(<2 x double> %a) { %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer diff --git a/test/CodeGen/X86/avx2-vperm2i128.ll b/test/CodeGen/X86/avx2-vperm2i128.ll deleted file mode 100644 index 1937db5..0000000 --- a/test/CodeGen/X86/avx2-vperm2i128.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s - -; CHECK: vperm2i128 $17 -define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) 
nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> - %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> - ret <32 x i8> %shuffle -} - -; CHECK: vperm2i128 $3 -define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> - %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x i64> %shuffle -} - -; CHECK: vperm2i128 $49 -define <8 x i32> @E3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> - %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15> - ret <8 x i32> %shuffle -} - -; CHECK: vperm2i128 $2 -define <16 x i16> @E4(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> - %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - ret <16 x i16> %shuffle -} - -; CHECK: vperm2i128 $2, (% -define <16 x i16> @E5(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp { -entry: 
- %c = load <16 x i16>* %a - %d = load <16 x i16>* %b - %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> - %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - ret <16 x i16> %shuffle -} diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll index 4d1c9f7..c43da9c 100644 --- a/test/CodeGen/X86/avx512-arith.ll +++ b/test/CodeGen/X86/avx512-arith.ll @@ -1,189 +1,217 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -; CHECK-LABEL: addpd512 -; CHECK: vaddpd -; CHECK: ret define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) { +; CHECK-LABEL: addpd512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %add.i = fadd <8 x double> %x, %y ret <8 x double> %add.i } -; CHECK-LABEL: addpd512fold -; CHECK: vaddpd LCP{{.*}}(%rip) -; CHECK: ret define <8 x double> @addpd512fold(<8 x double> %y) { +; CHECK-LABEL: addpd512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %add.i = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.800000e+00, double 2.300000e+00, double 1.200000e+00> ret <8 x double> %add.i } -; CHECK-LABEL: addps512 -; CHECK: vaddps -; CHECK: ret define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) { +; CHECK-LABEL: addps512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %add.i = fadd <16 x float> %x, %y ret <16 x float> %add.i } -; CHECK-LABEL: addps512fold -; CHECK: vaddps LCP{{.*}}(%rip) -; CHECK: ret define <16 x float> @addps512fold(<16 x 
float> %y) { +; CHECK-LABEL: addps512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %add.i = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 4.500000e+00, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000> ret <16 x float> %add.i } -; CHECK-LABEL: subpd512 -; CHECK: vsubpd -; CHECK: ret define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) { +; CHECK-LABEL: subpd512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsubpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %sub.i = fsub <8 x double> %x, %y ret <8 x double> %sub.i } -; CHECK-LABEL: @subpd512fold -; CHECK: vsubpd (% -; CHECK: ret define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) { +; CHECK-LABEL: subpd512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsubpd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %tmp2 = load <8 x double>* %x, align 8 %sub.i = fsub <8 x double> %y, %tmp2 ret <8 x double> %sub.i } -; CHECK-LABEL: @subps512 -; CHECK: vsubps -; CHECK: ret define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) { +; CHECK-LABEL: subps512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsubps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %sub.i = fsub <16 x float> %x, %y ret <16 x float> %sub.i } -; CHECK-LABEL: subps512fold -; CHECK: vsubps (% -; CHECK: ret define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) { +; CHECK-LABEL: subps512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsubps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %tmp2 = load <16 x float>* %x, align 4 %sub.i = fsub <16 x float> %y, %tmp2 ret <16 x float> %sub.i } -; CHECK-LABEL: imulq512 -; CHECK: vpmuludq -; CHECK: vpmuludq 
-; CHECK: ret define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) { +; CHECK-LABEL: imulq512: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 +; CHECK-NEXT: vpsrlq $32, %zmm0, %zmm3 +; CHECK-NEXT: vpmuludq %zmm3, %zmm1, %zmm3 +; CHECK-NEXT: vpsllq $32, %zmm3, %zmm3 +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; CHECK-NEXT: vpsrlq $32, %zmm1, %zmm1 +; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq %z = mul <8 x i64>%x, %y ret <8 x i64>%z } -; CHECK-LABEL: mulpd512 -; CHECK: vmulpd -; CHECK: ret define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) { +; CHECK-LABEL: mulpd512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmulpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %mul.i = fmul <8 x double> %x, %y ret <8 x double> %mul.i } -; CHECK-LABEL: mulpd512fold -; CHECK: vmulpd LCP{{.*}}(%rip) -; CHECK: ret define <8 x double> @mulpd512fold(<8 x double> %y) { +; CHECK-LABEL: mulpd512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %mul.i = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00> ret <8 x double> %mul.i } -; CHECK-LABEL: mulps512 -; CHECK: vmulps -; CHECK: ret define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) { +; CHECK-LABEL: mulps512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmulps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %mul.i = fmul <16 x float> %x, %y ret <16 x float> %mul.i } -; CHECK-LABEL: mulps512fold -; CHECK: vmulps LCP{{.*}}(%rip) -; CHECK: ret define <16 x float> @mulps512fold(<16 x float> %y) { +; CHECK-LABEL: mulps512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %mul.i = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, 
float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000> ret <16 x float> %mul.i } -; CHECK-LABEL: divpd512 -; CHECK: vdivpd -; CHECK: ret define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) { +; CHECK-LABEL: divpd512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %div.i = fdiv <8 x double> %x, %y ret <8 x double> %div.i } -; CHECK-LABEL: divpd512fold -; CHECK: vdivpd LCP{{.*}}(%rip) -; CHECK: ret define <8 x double> @divpd512fold(<8 x double> %y) { +; CHECK-LABEL: divpd512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %div.i = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00> ret <8 x double> %div.i } -; CHECK-LABEL: divps512 -; CHECK: vdivps -; CHECK: ret define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) { +; CHECK-LABEL: divps512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %div.i = fdiv <16 x float> %x, %y ret <16 x float> %div.i } -; CHECK-LABEL: divps512fold -; CHECK: vdivps LCP{{.*}}(%rip) -; CHECK: ret define <16 x float> @divps512fold(<16 x float> %y) { +; CHECK-LABEL: divps512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %div.i = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 
0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000> ret <16 x float> %div.i } -; CHECK-LABEL: vpaddq_test -; CHECK: vpaddq %zmm -; CHECK: ret define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { +; CHECK-LABEL: vpaddq_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = add <8 x i64> %i, %j ret <8 x i64> %x } -; CHECK-LABEL: vpaddq_fold_test -; CHECK: vpaddq (% -; CHECK: ret define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind { +; CHECK-LABEL: vpaddq_fold_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %tmp = load <8 x i64>* %j, align 4 %x = add <8 x i64> %i, %tmp ret <8 x i64> %x } -; CHECK-LABEL: vpaddq_broadcast_test -; CHECK: vpaddq LCP{{.*}}(%rip){1to8} -; CHECK: ret define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind { +; CHECK-LABEL: vpaddq_broadcast_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = add <8 x i64> %i, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> ret <8 x i64> %x } -; CHECK-LABEL: vpaddq_broadcast2_test -; CHECK: vpaddq (%rdi){1to8} -; CHECK: ret define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind { +; CHECK-LABEL: vpaddq_broadcast2_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq %tmp = load i64* %j %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0 %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1 @@ -197,55 +225,67 @@ define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind { ret <8 x i64> %x } -; CHECK-LABEL: vpaddd_test -; CHECK: vpaddd %zmm -; CHECK: ret define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { +; CHECK-LABEL: vpaddd_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq 
%x = add <16 x i32> %i, %j ret <16 x i32> %x } -; CHECK-LABEL: vpaddd_fold_test -; CHECK: vpaddd (% -; CHECK: ret define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind { +; CHECK-LABEL: vpaddd_fold_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %tmp = load <16 x i32>* %j, align 4 %x = add <16 x i32> %i, %tmp ret <16 x i32> %x } -; CHECK-LABEL: vpaddd_broadcast_test -; CHECK: vpaddd LCP{{.*}}(%rip){1to16} -; CHECK: ret define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind { +; CHECK-LABEL: vpaddd_broadcast_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ret <16 x i32> %x } -; CHECK-LABEL: vpaddd_mask_test -; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }} -; CHECK: ret define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_mask_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %j %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i ret <16 x i32> %r } -; CHECK-LABEL: vpaddd_maskz_test -; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z} }} -; CHECK: ret define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_maskz_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %j %r = select <16 x i1> %mask, <16 x 
i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %r } -; CHECK-LABEL: vpaddd_mask_fold_test -; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }} -; CHECK: ret define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_mask_fold_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %j = load <16 x i32>* %j.ptr %x = add <16 x i32> %i, %j @@ -253,20 +293,26 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 ret <16 x i32> %r } -; CHECK-LABEL: vpaddd_mask_broadcast_test -; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }} -; CHECK: ret define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_mask_broadcast_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i ret <16 x i32> %r } -; CHECK-LABEL: vpaddd_maskz_fold_test -; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z} -; CHECK: ret define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_maskz_fold_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %j = load <16 
x i32>* %j.ptr %x = add <16 x i32> %i, %j @@ -274,125 +320,141 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 ret <16 x i32> %r } -; CHECK-LABEL: vpaddd_maskz_broadcast_test -; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z} -; CHECK: ret define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_maskz_broadcast_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %r } -; CHECK-LABEL: vpsubq_test -; CHECK: vpsubq %zmm -; CHECK: ret define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { +; CHECK-LABEL: vpsubq_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = sub <8 x i64> %i, %j ret <8 x i64> %x } -; CHECK-LABEL: vpsubd_test -; CHECK: vpsubd -; CHECK: ret define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { +; CHECK-LABEL: vpsubd_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = sub <16 x i32> %i, %j ret <16 x i32> %x } -; CHECK-LABEL: vpmulld_test -; CHECK: vpmulld %zmm -; CHECK: ret define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) { +; CHECK-LABEL: vpmulld_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = mul <16 x i32> %i, %j ret <16 x i32> %x } -; CHECK-LABEL: sqrtA -; CHECK: vsqrtss {{.*}} encoding: [0x62 -; CHECK: ret declare float @sqrtf(float) readnone define float @sqrtA(float %a) nounwind uwtable readnone ssp { +; 
CHECK-LABEL: sqrtA: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq entry: %conv1 = tail call float @sqrtf(float %a) nounwind readnone ret float %conv1 } -; CHECK-LABEL: sqrtB -; CHECK: vsqrtsd {{.*}}## encoding: [0x62 -; CHECK: ret declare double @sqrt(double) readnone define double @sqrtB(double %a) nounwind uwtable readnone ssp { +; CHECK-LABEL: sqrtB: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq entry: %call = tail call double @sqrt(double %a) nounwind readnone ret double %call } -; CHECK-LABEL: sqrtC -; CHECK: vsqrtss {{.*}}## encoding: [0x62 -; CHECK: ret declare float @llvm.sqrt.f32(float) define float @sqrtC(float %a) nounwind { +; CHECK-LABEL: sqrtC: +; CHECK: ## BB#0: +; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq %b = call float @llvm.sqrt.f32(float %a) ret float %b } -; CHECK-LABEL: sqrtD -; CHECK: vsqrtps {{.*}} -; CHECK: ret declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) define <16 x float> @sqrtD(<16 x float> %a) nounwind { +; CHECK-LABEL: sqrtD: +; CHECK: ## BB#0: +; CHECK-NEXT: vsqrtps %zmm0, %zmm0 +; CHECK-NEXT: retq %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a) ret <16 x float> %b } -; CHECK-LABEL: sqrtE -; CHECK: vsqrtpd {{.*}} -; CHECK: ret declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) define <8 x double> @sqrtE(<8 x double> %a) nounwind { +; CHECK-LABEL: sqrtE: +; CHECK: ## BB#0: +; CHECK-NEXT: vsqrtpd %zmm0, %zmm0 +; CHECK-NEXT: retq %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a) ret <8 x double> %b } -; CHECK-LABEL: fadd_broadcast -; CHECK: LCP{{.*}}(%rip){1to16}, %zmm0, %zmm0 -; CHECK: ret define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind { +; CHECK-LABEL: fadd_broadcast: +; CHECK: ## BB#0: +; CHECK-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 
0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000> ret <16 x float> %b } -; CHECK-LABEL: addq_broadcast -; CHECK: vpaddq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0 -; CHECK: ret define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind { +; CHECK-LABEL: addq_broadcast: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> ret <8 x i64> %b } -; CHECK-LABEL: orq_broadcast -; CHECK: vporq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0 -; CHECK: ret define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind { +; CHECK-LABEL: orq_broadcast: +; CHECK: ## BB#0: +; CHECK-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> ret <8 x i64> %b } -; CHECK-LABEL: andd512fold -; CHECK: vpandd (% -; CHECK: ret define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { +; CHECK-LABEL: andd512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpandd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %a = load <16 x i32>* %x, align 4 %b = and <16 x i32> %y, %a ret <16 x i32> %b } -; CHECK-LABEL: andqbrst -; CHECK: vpandq (%rdi){1to8}, %zmm -; CHECK: ret define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) { +; CHECK-LABEL: andqbrst: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %a = load i64* %ap, align 8 %b = insertelement <8 x i64> undef, i64 %a, i32 0 diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll index b5a2aa8..9e9ad31 100644 --- a/test/CodeGen/X86/avx512-build-vector.ll +++ b/test/CodeGen/X86/avx512-build-vector.ll 
@@ -1,30 +1,43 @@ ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -; CHECK-LABEL: test1 -; CHECK: vpxord -; CHECK: ret define <16 x i32> @test1(i32* %x) { +; CHECK-LABEL: test1: +; CHECK: ## BB#0: +; CHECK-NEXT: vmovd (%rdi), %xmm0 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7] +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %y = load i32* %x, align 4 %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4 ret <16 x i32>%res } -; CHECK-LABEL: test2 -; CHECK: vpaddd LCP{{.*}}(%rip){1to16} -; CHECK: ret define <16 x i32> @test2(<16 x i32> %x) { +; CHECK-LABEL: test2: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq %res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x ret <16 x i32>%res } -; CHECK-LABEL: test3 -; CHECK: vinsertf128 -; CHECK: vinsertf64x4 -; CHECK: ret define <16 x float> @test3(<4 x float> %a) { +; CHECK-LABEL: test3: +; CHECK: ## BB#0: +; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vmovss %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vmovss %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,0],xmm0[0,1] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = extractelement <4 x float> %a, i32 2 %c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 
5 %b1 = extractelement <4 x float> %a, i32 0 %c1 = insertelement <16 x float> %c, float %b1, i32 6 ret <16 x float>%c1 -}
\ No newline at end of file +} diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll index 47e50a9..6e0d185 100644 --- a/test/CodeGen/X86/avx512-cmp.ll +++ b/test/CodeGen/X86/avx512-cmp.ll @@ -28,10 +28,9 @@ l2: ret float %c1 } +; FIXME: Can use vcmpeqss and extract from the mask here in AVX512. ; CHECK-LABEL: test3 -; CHECK: vcmpeqss -; CHECK: kmov -; CHECK: ret +; CHECK: vucomiss {{.*}}encoding: [0x62 define i32 @test3(float %a, float %b) { %cmp10.i = fcmp oeq float %a, %b @@ -86,3 +85,17 @@ define i32 @test8(i32 %a1, i32 %a2, i32 %a3) { %res = select i1 %tmp5, i32 1, i32 %a3 ret i32 %res } + +; CHECK-LABEL: test9 +; CHECK: testb +; CHECK-NOT: kmov +; CHECK: ret +define i32 @test9(i64 %a) { + %b = and i64 %a, 1 + %cmp10.i = icmp eq i64 %b, 0 + br i1 %cmp10.i, label %A, label %B +A: + ret i32 6 +B: + ret i32 7 +} diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index f5cda96..2b672a7 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -255,3 +255,56 @@ define double @uitofp03(i32 %a) nounwind { %b = uitofp i32 %a to double ret double %b } + +; CHECK-LABEL: @sitofp_16i1_float +; CHECK: vpbroadcastd +; CHECK: vcvtdq2ps +define <16 x float> @sitofp_16i1_float(<16 x i32> %a) { + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = sitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +; CHECK-LABEL: @sitofp_16i8_float +; CHECK: vpmovsxbd +; CHECK: vcvtdq2ps +define <16 x float> @sitofp_16i8_float(<16 x i8> %a) { + %1 = sitofp <16 x i8> %a to <16 x float> + ret <16 x float> %1 +} + +; CHECK-LABEL: @sitofp_16i16_float +; CHECK: vpmovsxwd +; CHECK: vcvtdq2ps +define <16 x float> @sitofp_16i16_float(<16 x i16> %a) { + %1 = sitofp <16 x i16> %a to <16 x float> + ret <16 x float> %1 +} + +; CHECK-LABEL: @sitofp_8i16_double +; CHECK: vpmovsxwd +; CHECK: vcvtdq2pd +define <8 x double> @sitofp_8i16_double(<8 x i16> %a) { + %1 = sitofp <8 x i16> %a to <8 x double> + ret <8 x double> 
%1 +} + +; CHECK-LABEL: sitofp_8i8_double +; CHECK: vpmovzxwd +; CHECK: vpslld +; CHECK: vpsrad +; CHECK: vcvtdq2pd +define <8 x double> @sitofp_8i8_double(<8 x i8> %a) { + %1 = sitofp <8 x i8> %a to <8 x double> + ret <8 x double> %1 +} + + +; CHECK-LABEL: @sitofp_8i1_double +; CHECK: vpbroadcastq +; CHECK: vcvtdq2pd +define <8 x double> @sitofp_8i1_double(<8 x double> %a) { + %cmpres = fcmp ogt <8 x double> %a, zeroinitializer + %1 = sitofp <8 x i1> %cmpres to <8 x double> + ret <8 x double> %1 +} diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll index ce3d759..366d324 100644 --- a/test/CodeGen/X86/avx512-fma-intrinsics.ll +++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll @@ -1,97 +1,113 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s define <16 x float> @test_x86_vfmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmadd_ps_z ; CHECK: vfmadd213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone +declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmadd_pd_z ; CHECK: vfmadd213pd %zmm - %res = call <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind 
ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone + +define <8 x double> @test_mask_fmadd_pd(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; CHECK-LABEL: test_mask_fmadd_pd: +; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) define <16 x float> @test_x86_vfmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmsubps_z ; CHECK: vfmsub213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone +declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmsubpd_z ; CHECK: vfmsub213pd %zmm - %res = call <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone +declare <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone define <16 x float> 
@test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfnmadd_ps_z ; CHECK: vfnmadd213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone +declare <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfnmadd_pd_z ; CHECK: vfnmadd213pd %zmm - %res = call <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone +declare <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfnmsubps_z ; CHECK: vfnmsub213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone +declare <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, 
i32) nounwind readnone define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfnmsubpd_z ; CHECK: vfnmsub213pd %zmm - %res = call <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone +declare <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmaddsubps_z ; CHECK: vfmaddsub213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone + +define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; CHECK-LABEL: test_mask_fmaddsub_ps: +; CHECK: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa6,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmaddsubpd_z ; CHECK: vfmaddsub213pd %zmm - %res = call <8 x double> 
@llvm.x86.fma.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone +declare <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmsubaddps_z ; CHECK: vfmsubadd213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone +declare <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmsubaddpd_z ; CHECK: vfmsubadd213pd %zmm - %res = call <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone +declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index 
b360c71..eba895e 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL --check-prefix=CHECK %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=CHECK %s ;CHECK-LABEL: test1: ;CHECK: vinsertps @@ -12,9 +13,11 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { } ;CHECK-LABEL: test2: -;CHECK: vinsertf32x4 -;CHECK: vextractf32x4 -;CHECK: vinsertf32x4 +;KNL: vinsertf32x4 $0 +;SKX: vinsertf64x2 $0 +;CHECK: vextractf32x4 $3 +;KNL: vinsertf32x4 $3 +;SKX: vinsertf64x2 $3 ;CHECK: ret define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { %rrr = load double* %br @@ -24,8 +27,8 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { } ;CHECK-LABEL: test3: -;CHECK: vextractf32x4 -;CHECK: vinsertf32x4 +;CHECK: vextractf32x4 $1 +;CHECK: vinsertf32x4 $0 ;CHECK: ret define <16 x float> @test3(<16 x float> %x) nounwind { %eee = extractelement <16 x float> %x, i32 4 @@ -34,8 +37,9 @@ define <16 x float> @test3(<16 x float> %x) nounwind { } ;CHECK-LABEL: test4: -;CHECK: vextracti32x4 -;CHECK: vinserti32x4 +;CHECK: vextracti32x4 $2 +;KNL: vinserti32x4 $0 +;SKX: vinserti64x2 $0 ;CHECK: ret define <8 x i64> @test4(<8 x i64> %x) nounwind { %eee = extractelement <8 x i64> %x, i32 4 @@ -186,12 +190,13 @@ define i16 @test16(i1 *%addr, i16 %a) { ;CHECK-LABEL: test17 ;CHECK: kshiftlw ;CHECK: kshiftrw -;CHECK: korw +;KNL: korw +;SKX: korb ;CHECK: ret define i8 @test17(i1 *%addr, i8 %a) { %x = load i1 * %addr, align 128 %a1 = bitcast i8 %a to <8 x i1> - %x1 = insertelement <8 x i1> %a1, i1 %x, i32 10 + %x1 = insertelement <8 x i1> %a1, i1 %x, i32 4 %x2 = bitcast <8 x i1>%x1 to i8 ret i8 %x2 } diff --git 
a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 18cfcfe..691d1fb 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -60,20 +60,6 @@ define <8 x double> @test_rcp_pd_512(<8 x double> %a0) { } declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone -define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) { - ; CHECK: vrcp28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0] - %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1] - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone - -define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) { - ; CHECK: vrcp28pd {sae}, {{.*}}encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0] - %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) ; <<8 x double>> [#uses=1] - ret <8 x double> %res -} -declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone - declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32) define <8 x double> @test7(<8 x double> %a) { @@ -97,13 +83,6 @@ define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) { } declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone -define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) { - ; CHECK: vrsqrt28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0] - %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1] - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone - define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) 
{ ; CHECK: vrsqrt14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4f,0xc0] %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1] @@ -111,13 +90,6 @@ define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) { } declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone -define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) { - ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0] - %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone - define <4 x float> @test_rcp14_ss(<4 x float> %a0) { ; CHECK: vrcp14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4d,0xc0] %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1] @@ -125,26 +97,19 @@ define <4 x float> @test_rcp14_ss(<4 x float> %a0) { } declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone -define <4 x float> @test_rcp28_ss(<4 x float> %a0) { - ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0] - %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone - define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) { ; CHECK: vsqrtpd - %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1] + %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, 
i32 4) ; <<8 x double>> [#uses=1] ret <8 x double> %res } -declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>) nounwind readnone +declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) { ; CHECK: vsqrtps - %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] + %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) ; <<16 x float>> [#uses=1] ret <16 x float> %res } -declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone +declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vsqrtss {{.*}}encoding: [0x62 @@ -611,3 +576,515 @@ define <8 x i64> @test_vmovntdqa(i8 *%x) { } declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*) + +define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_valign_q: +; CHECK: valignq $2, %zmm1, %zmm0, %zmm0 + %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i8 2, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) { +; CHECK-LABEL: test_mask_valign_q: +; CHECK: valignq $2, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i8 2, <8 x i64> %src, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i8, <8 x i64>, i8) + +define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_maskz_valign_d: +; CHECK: valignd $5, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x03,0xc1,0x05] + %res = call <16 x i32> 
@llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i8 5, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i8, <16 x i32>, i16) + +define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) { + ; CHECK-LABEL: test_mask_store_ss + ; CHECK: vmovss %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x11,0x07] + call void @llvm.x86.avx512.mask.store.ss(i8* %ptr, <4 x float> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 ) + +define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) { +; CHECK-LABEL: test_pcmpeq_d +; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_d +; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16) + +define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_pcmpeq_q +; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_q +; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8) + +define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) { +; CHECK-LABEL: test_pcmpgt_d +; CHECK: vpcmpgtd %zmm1, %zmm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) + ret i16 %res +} + 
+define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_d +; CHECK: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16) + +define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_pcmpgt_q +; CHECK: vpcmpgtq %zmm1, %zmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_q +; CHECK: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8) + +define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: test_cmp_d_512 +; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ## + %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltd %zmm1, %zmm0, %k0 ## + %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpled %zmm1, %zmm0, %k0 ## + %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 ## + %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 ## + %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 ## + %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnled %zmm1, %zmm0, %k0 ## + %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmpordd %zmm1, %zmm0, %k0 ## + %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { +; CHECK-LABEL: test_mask_cmp_d_512 +; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltd %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpled %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnled %zmm1,
%zmm0, %k0 {%k1} ## + %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmpordd %zmm1, %zmm0, %k0 {%k1} ## + %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone + +define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: test_ucmp_d_512 +; CHECK: vpcmpequd %zmm1, %zmm0, %k0 ## + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltud %zmm1, %zmm0, %k0 ## + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpleud %zmm1, %zmm0, %k0 ## + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 ## + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 ## + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 ## + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 ## + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6,
i32 6 +; CHECK: vpcmpordud %zmm1, %zmm0, %k0 ## + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { +; CHECK-LABEL: test_mask_ucmp_d_512 +; CHECK: vpcmpequd %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmpordud %zmm1, %zmm0, %k0 {%k1} ## + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6,
i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone + +define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) { +; CHECK-LABEL: test_cmp_q_512 +; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltq %zmm1, %zmm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleq %zmm1, %zmm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordq %zmm1, %zmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_cmp_q_512 +; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i8
@llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltq %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordq %zmm1, %zmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) { +; CHECK-LABEL: test_ucmp_q_512 +; CHECK: vpcmpequq %zmm1, %zmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 ## + %res1 =
call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmporduq %zmm1, %zmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_ucmp_q_512 +; CHECK: vpcmpequq %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask) + %vec2
= insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmporduq %zmm1, %zmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone + +define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) { +; CHECK-LABEL: test_mask_vextractf32x4: +; CHECK: vextractf32x4 $2, %zmm1, %xmm0 {%k1} + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i8 2, <4 x float> %b, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i8, <4 x float>, i8) + +define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) { +; CHECK-LABEL: test_mask_vextracti64x4: +; CHECK: vextracti64x4 $2, %zmm1, %ymm0 {%k1} + %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i8 2, <4 x i64> %b, i8 %mask) + ret <4 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i8, <4 x i64>, i8) + 
+define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) { +; CHECK-LABEL: test_maskz_vextracti32x4: +; CHECK: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z} + %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i8 2, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i8, <4 x i32>, i8) + +define <4 x double> @test_vextractf64x4(<8 x double> %a) { +; CHECK-LABEL: test_vextractf64x4: +; CHECK: vextractf64x4 $2, %zmm0, %ymm0 ## + %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i8 2, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i8, <4 x double>, i8) + +define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) { + ; CHECK-LABEL: test_x86_avx512_pslli_d + ; CHECK: vpslld + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_pslli_d + ; CHECK: vpslld $7, %zmm0, %zmm1 {%k1} + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_pslli_d + ; CHECK: vpslld $7, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) { + ; CHECK-LABEL: test_x86_avx512_pslli_q + ; CHECK: vpsllq + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> 
zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_pslli_q + ; CHECK: vpsllq $7, %zmm0, %zmm1 {%k1} + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_pslli_q + ; CHECK: vpsllq $7, %zmm0, %zmm0 {%k1} {z} + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) { + ; CHECK-LABEL: test_x86_avx512_psrli_d + ; CHECK: vpsrld + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_psrli_d + ; CHECK: vpsrld $7, %zmm0, %zmm1 {%k1} + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_psrli_d + ; CHECK: vpsrld $7, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) { + ; CHECK-LABEL: test_x86_avx512_psrli_q + ; CHECK: vpsrlq + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> 
%res +} + +define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_psrli_q + ; CHECK: vpsrlq $7, %zmm0, %zmm1 {%k1} + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_psrli_q + ; CHECK: vpsrlq $7, %zmm0, %zmm0 {%k1} {z} + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) { + ; CHECK-LABEL: test_x86_avx512_psrai_d + ; CHECK: vpsrad + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_psrai_d + ; CHECK: vpsrad $7, %zmm0, %zmm1 {%k1} + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_psrai_d + ; CHECK: vpsrad $7, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) { + ; CHECK-LABEL: test_x86_avx512_psrai_q + ; CHECK: vpsraq + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> 
@test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_psrai_q + ; CHECK: vpsraq $7, %zmm0, %zmm1 {%k1} + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_psrai_q + ; CHECK: vpsraq $7, %zmm0, %zmm0 {%k1} {z} + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index dd33ffd..35d3348 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1,12 +1,14 @@ -; RUN: llc < %s -march=x86-64 -mcpu=knl | FileCheck %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s define i16 @mask16(i16 %x) { %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> %ret = bitcast <16 x i1> %m1 to i16 ret i16 %ret -; CHECK: mask16 -; CHECK: knotw +; CHECK-LABEL: mask16 +; CHECK: kmovw +; CHECK-NEXT: knotw +; CHECK-NEXT: kmovw ; CHECK: ret } @@ -15,8 +17,38 @@ define i8 @mask8(i8 %x) { %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> %ret = bitcast <8 x i1> %m1 to i8 ret i8 %ret -; CHECK: mask8 -; CHECK: knotw +; CHECK-LABEL: mask8 +; CHECK: kmovw +; CHECK-NEXT: knotw +; CHECK-NEXT: kmovw +; CHECK: ret +} + +define void @mask16_mem(i16* %ptr) { + %x = load i16* %ptr, align 4 + %m0 = bitcast i16 %x to <16 x i1> + %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <16 x 
i1> %m1 to i16 + store i16 %ret, i16* %ptr, align 4 + ret void +; CHECK-LABEL: mask16_mem +; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}} +; CHECK-NEXT: knotw +; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]]) +; CHECK: ret +} + +define void @mask8_mem(i8* %ptr) { + %x = load i8* %ptr, align 4 + %m0 = bitcast i8 %x to <8 x i1> + %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <8 x i1> %m1 to i8 + store i8 %ret, i8* %ptr, align 4 + ret void +; CHECK-LABEL: mask8_mem +; CHECK: kmovw ([[ARG1]]), %k{{[0-7]}} +; CHECK-NEXT: knotw +; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]]) ; CHECK: ret } diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll index 009802f..93875e8 100644 --- a/test/CodeGen/X86/avx512-mov.ll +++ b/test/CodeGen/X86/avx512-mov.ll @@ -153,31 +153,295 @@ define void @test18(i8 * %addr, <8 x i64> %data) { ret void } -; CHECK-LABEL: store_i1_1 -; CHECK: movb -; CHECK: movb +; CHECK-LABEL: test19 +; CHECK: vmovdqu32 +; CHECK: ret +define void @test19(i8 * %addr, <16 x i32> %data) { + %vaddr = bitcast i8* %addr to <16 x i32>* + store <16 x i32>%data, <16 x i32>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test20 +; CHECK: vmovdqa32 +; CHECK: ret +define void @test20(i8 * %addr, <16 x i32> %data) { + %vaddr = bitcast i8* %addr to <16 x i32>* + store <16 x i32>%data, <16 x i32>* %vaddr, align 64 + ret void +} + +; CHECK-LABEL: test21 +; CHECK: vmovdqa64 ; CHECK: ret -define void @store_i1_1() { - store i1 true, i1 addrspace(3)* undef, align 128 - store i1 false, i1 addrspace(2)* undef, align 128 +define <8 x i64> @test21(i8 * %addr) { + %vaddr = bitcast i8* %addr to <8 x i64>* + %res = load <8 x i64>* %vaddr, align 64 + ret <8 x i64>%res +} + +; CHECK-LABEL: test22 +; CHECK: vmovdqu64 +; CHECK: ret +define void @test22(i8 * %addr, <8 x i64> %data) { + %vaddr = bitcast i8* %addr to <8 x i64>* + store <8 x i64>%data, <8 x i64>* %vaddr, align 1 ret void } -; CHECK-LABEL: store_i1_2 -; CHECK: 
movb +; CHECK-LABEL: test23 +; CHECK: vmovdqu64 ; CHECK: ret -define void @store_i1_2(i64 %a, i64 %b) { - %res = icmp eq i64 %a, %b - store i1 %res, i1 addrspace(3)* undef, align 128 +define <8 x i64> @test23(i8 * %addr) { + %vaddr = bitcast i8* %addr to <8 x i64>* + %res = load <8 x i64>* %vaddr, align 1 + ret <8 x i64>%res +} + +; CHECK-LABEL: test24 +; CHECK: vmovapd +; CHECK: ret +define void @test24(i8 * %addr, <8 x double> %data) { + %vaddr = bitcast i8* %addr to <8 x double>* + store <8 x double>%data, <8 x double>* %vaddr, align 64 ret void } -; CHECK-LABEL: store_i1_3 -; CHECK: kmovw +; CHECK-LABEL: test25 +; CHECK: vmovapd +; CHECK: ret +define <8 x double> @test25(i8 * %addr) { + %vaddr = bitcast i8* %addr to <8 x double>* + %res = load <8 x double>* %vaddr, align 64 + ret <8 x double>%res +} + +; CHECK-LABEL: test26 +; CHECK: vmovaps ; CHECK: ret -define void @store_i1_3(i16 %a) { - %a_vec = bitcast i16 %a to <16 x i1> - %res = extractelement <16 x i1> %a_vec, i32 4 - store i1 %res, i1 addrspace(3)* undef, align 128 +define void @test26(i8 * %addr, <16 x float> %data) { + %vaddr = bitcast i8* %addr to <16 x float>* + store <16 x float>%data, <16 x float>* %vaddr, align 64 ret void } + +; CHECK-LABEL: test27 +; CHECK: vmovaps +; CHECK: ret +define <16 x float> @test27(i8 * %addr) { + %vaddr = bitcast i8* %addr to <16 x float>* + %res = load <16 x float>* %vaddr, align 64 + ret <16 x float>%res +} + +; CHECK-LABEL: test28 +; CHECK: vmovupd +; CHECK: ret +define void @test28(i8 * %addr, <8 x double> %data) { + %vaddr = bitcast i8* %addr to <8 x double>* + store <8 x double>%data, <8 x double>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test29 +; CHECK: vmovupd +; CHECK: ret +define <8 x double> @test29(i8 * %addr) { + %vaddr = bitcast i8* %addr to <8 x double>* + %res = load <8 x double>* %vaddr, align 1 + ret <8 x double>%res +} + +; CHECK-LABEL: test30 +; CHECK: vmovups +; CHECK: ret +define void @test30(i8 * %addr, <16 x float> %data) { + %vaddr = 
bitcast i8* %addr to <16 x float>* + store <16 x float>%data, <16 x float>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test31 +; CHECK: vmovups +; CHECK: ret +define <16 x float> @test31(i8 * %addr) { + %vaddr = bitcast i8* %addr to <16 x float>* + %res = load <16 x float>* %vaddr, align 1 + ret <16 x float>%res +} + +; CHECK-LABEL: test32 +; CHECK: vmovdqa32{{.*{%k[1-7]} }} +; CHECK: ret +define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old + ret <16 x i32>%res +} + +; CHECK-LABEL: test33 +; CHECK: vmovdqu32{{.*{%k[1-7]} }} +; CHECK: ret +define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old + ret <16 x i32>%res +} + +; CHECK-LABEL: test34 +; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) { + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer + ret <16 x i32>%res +} + +; CHECK-LABEL: test35 +; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) { + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i32>* + %r = load <16 x i32>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer + ret <16 x i32>%res +} + +; CHECK-LABEL: test36 +; CHECK: vmovdqa64{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { + %mask = icmp ne <8 x 
i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 x i64>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old + ret <8 x i64>%res +} + +; CHECK-LABEL: test37 +; CHECK: vmovdqu64{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 x i64>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old + ret <8 x i64>%res +} + +; CHECK-LABEL: test38 +; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) { + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 x i64>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer + ret <8 x i64>%res +} + +; CHECK-LABEL: test39 +; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) { + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i64>* + %r = load <8 x i64>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer + ret <8 x i64>%res +} + +; CHECK-LABEL: test40 +; CHECK: vmovaps{{.*{%k[1-7]} }} +; CHECK: ret +define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old + ret <16 x float>%res +} + +; CHECK-LABEL: test41 +; CHECK: vmovups{{.*{%k[1-7]} }} +; CHECK: ret +define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>* %vaddr, align 1 + %res = 
select <16 x i1> %mask, <16 x float> %r, <16 x float> %old + ret <16 x float>%res +} + +; CHECK-LABEL: test42 +; CHECK: vmovaps{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) { + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>* %vaddr, align 64 + %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer + ret <16 x float>%res +} + +; CHECK-LABEL: test43 +; CHECK: vmovups{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) { + %mask = fcmp one <16 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x float>* + %r = load <16 x float>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer + ret <16 x float>%res +} + +; CHECK-LABEL: test44 +; CHECK: vmovapd{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old + ret <8 x double>%res +} + +; CHECK-LABEL: test45 +; CHECK: vmovupd{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old + ret <8 x double>%res +} + +; CHECK-LABEL: test46 +; CHECK: vmovapd{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) { + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>* %vaddr, align 64 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x 
double> zeroinitializer + ret <8 x double>%res +} + +; CHECK-LABEL: test47 +; CHECK: vmovupd{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x double> @test47(i8 * %addr, <8 x double> %mask1) { + %mask = fcmp one <8 x double> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x double>* + %r = load <8 x double>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer + ret <8 x double>%res +} diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll index 83f4698..0dbf286 100644 --- a/test/CodeGen/X86/avx512-select.ll +++ b/test/CodeGen/X86/avx512-select.ll @@ -39,3 +39,56 @@ define double @select03(double %a, double %b, double %c, double %eps) { %cond = select i1 %cmp, double %c, double %b ret double %cond } + +; CHECK-LABEL: @select04 +; CHECK: vmovaps %zmm3, %zmm1 +; CHECK-NEXT: ret +; PR20677 +define <16 x double> @select04(<16 x double> %a, <16 x double> %b) { + %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b + ret <16 x double> %sel +} + +; CHECK-LABEL: select05 +; CHECK: kmovw %esi, %k0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: korw %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +define i8 @select05(i8 %a.0, i8 %m) { + %mask = bitcast i8 %m to <8 x i1> + %a = bitcast i8 %a.0 to <8 x i1> + %r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a + %res = bitcast <8 x i1> %r to i8 + ret i8 %res; +} + +; CHECK-LABEL: select06 +; CHECK: kmovw %esi, %k0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kandw %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +define i8 @select06(i8 %a.0, i8 %m) { + %mask = bitcast i8 %m to <8 x i1> + %a = bitcast i8 %a.0 to <8 x i1> + %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer + %res = bitcast <8 x i1> %r to i8 + ret i8 %res; +} + +; 
CHECK-LABEL: select07 +; CHECK-DAG: kmovw %edx, %k0 +; CHECK-DAG: kmovw %edi, %k1 +; CHECK-DAG: kmovw %esi, %k2 +; CHECK: kandw %k0, %k1, %k1 +; CHECK-NEXT: knotw %k0, %k0 +; CHECK-NEXT: kandw %k0, %k2, %k0 +; CHECK-NEXT: korw %k0, %k1, %k0 +; CHECK-NEXT: kmovw %k0, %eax +define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) { + %mask = bitcast i8 %m to <8 x i1> + %a = bitcast i8 %a.0 to <8 x i1> + %b = bitcast i8 %b.0 to <8 x i1> + %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> %b + %res = bitcast <8 x i1> %r to i8 + ret i8 %res; +} diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll deleted file mode 100644 index b99e89a..0000000 --- a/test/CodeGen/X86/avx512-shuffle.ll +++ /dev/null @@ -1,314 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s -; CHECK: LCP -; CHECK: .long 2 -; CHECK: .long 5 -; CHECK: .long 0 -; CHECK: .long 0 -; CHECK: .long 7 -; CHECK: .long 0 -; CHECK: .long 10 -; CHECK: .long 1 -; CHECK: .long 0 -; CHECK: .long 5 -; CHECK: .long 0 -; CHECK: .long 4 -; CHECK: .long 7 -; CHECK: .long 0 -; CHECK: .long 10 -; CHECK: .long 1 -; CHECK-LABEL: test1: -; CHECK: vpermps -; CHECK: ret -define <16 x float> @test1(<16 x float> %a) nounwind { - %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1> - ret <16 x float> %c -} - -; CHECK-LABEL: test2: -; CHECK: vpermd -; CHECK: ret -define <16 x i32> @test2(<16 x i32> %a) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1> - ret <16 x i32> %c -} - -; CHECK-LABEL: test3: -; CHECK: vpermq -; CHECK: ret -define <8 x i64> @test3(<8 x i64> %a) nounwind { - %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 
undef, i32 7, i32 undef, i32 3, i32 1> - ret <8 x i64> %c -} - -; CHECK-LABEL: test4: -; CHECK: vpermpd -; CHECK: ret -define <8 x double> @test4(<8 x double> %a) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x double> %c -} - -; CHECK-LABEL: test5: -; CHECK: vpermt2pd -; CHECK: ret -define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5> - ret <8 x double> %c -} - -; The reg variant of vpermt2 with a writemask -; CHECK-LABEL: test5m: -; CHECK: vpermt2pd {{.* {%k[1-7]} {z}}} -define <8 x double> @test5m(<8 x double> %a, <8 x double> %b, i8 %mask) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5> - %m = bitcast i8 %mask to <8 x i1> - %res = select <8 x i1> %m, <8 x double> %c, <8 x double> zeroinitializer - ret <8 x double> %res -} - -; CHECK-LABEL: test6: -; CHECK: vpermq $30 -; CHECK: ret -define <8 x i64> @test6(<8 x i64> %a) nounwind { - %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4> - ret <8 x i64> %c -} - -; CHECK-LABEL: test7: -; CHECK: vpermt2q -; CHECK: ret -define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind { - %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5> - ret <8 x i64> %c -} - -; The reg variant of vpermt2 with a writemask -; CHECK-LABEL: test7m: -; CHECK: vpermt2q {{.* {%k[1-7]} {z}}} -define <8 x i64> @test7m(<8 x i64> %a, <8 x i64> %b, i8 %mask) nounwind { - %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5> - %m = bitcast i8 %mask to <8 x i1> - %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> 
zeroinitializer - ret <8 x i64> %res -} - -; The mem variant of vpermt2 with a writemask -; CHECK-LABEL: test7mm: -; CHECK: vpermt2q {{\(.*\).* {%k[1-7]} {z}}} -define <8 x i64> @test7mm(<8 x i64> %a, <8 x i64> *%pb, i8 %mask) nounwind { - %b = load <8 x i64>* %pb - %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5> - %m = bitcast i8 %mask to <8 x i1> - %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer - ret <8 x i64> %res -} - -; CHECK-LABEL: test8: -; CHECK: vpermt2d -; CHECK: ret -define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> - ret <16 x i32> %c -} - -; The reg variant of vpermt2 with a writemask -; CHECK-LABEL: test8m: -; CHECK: vpermt2d {{.* {%k[1-7]} {z}}} -define <16 x i32> @test8m(<16 x i32> %a, <16 x i32> %b, i16 %mask) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> - %m = bitcast i16 %mask to <16 x i1> - %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer - ret <16 x i32> %res -} - -; The mem variant of vpermt2 with a writemask -; CHECK-LABEL: test8mm: -; CHECK: vpermt2d {{\(.*\).* {%k[1-7]} {z}}} -define <16 x i32> @test8mm(<16 x i32> %a, <16 x i32> *%pb, i16 %mask) nounwind { - %b = load <16 x i32> * %pb - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> - %m = bitcast i16 %mask to <16 x i1> - %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer - ret <16 x i32> %res -} - -; CHECK-LABEL: test9: -; CHECK: vpermt2ps -; CHECK: ret -define <16 x float> 
@test9(<16 x float> %a, <16 x float> %b) nounwind { - %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> - ret <16 x float> %c -} - -; The reg variant of vpermt2 with a writemask -; CHECK-LABEL: test9m: -; CHECK: vpermt2ps {{.*}} {%k{{.}}} {z} -define <16 x float> @test9m(<16 x float> %a, <16 x float> %b, i16 %mask) nounwind { - %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> - %m = bitcast i16 %mask to <16 x i1> - %res = select <16 x i1> %m, <16 x float> %c, <16 x float> zeroinitializer - ret <16 x float> %res -} - -; CHECK-LABEL: test10: -; CHECK: vpermt2ps ( -; CHECK: ret -define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind { - %c = load <16 x float>* %b - %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> - ret <16 x float> %d -} - -; CHECK-LABEL: test11: -; CHECK: vpermt2d -; CHECK: ret -define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind { - %c = load <16 x i32>* %b - %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> - ret <16 x i32> %d -} - -; CHECK-LABEL: test12 -; CHECK: vmovlhps {{.*}}## encoding: [0x62 -; CHECK: ret -define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind { - %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - ret <4 x i32> %c -} - -; CHECK-LABEL: test13 -; CHECK: vpermilps $-79, %zmm -; CHECK: ret -define <16 x float> @test13(<16 x float> %a) { - %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 
1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> - ret <16 x float> %b -} - -; CHECK-LABEL: test14 -; CHECK: vpermilpd $-53, %zmm -; CHECK: ret -define <8 x double> @test14(<8 x double> %a) { - %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7> - ret <8 x double> %b -} - -; CHECK-LABEL: test15 -; CHECK: vpshufd $-79, %zmm -; CHECK: ret -define <16 x i32> @test15(<16 x i32> %a) { - %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> - ret <16 x i32> %b -} -; CHECK-LABEL: test16 -; CHECK: valignq $2, %zmm0, %zmm1 -; CHECK: ret -define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> - ret <8 x double> %c -} - -; CHECK-LABEL: test17 -; CHECK: vshufpd $19, %zmm1, %zmm0 -; CHECK: ret -define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef> - ret <8 x double> %c -} - -; CHECK-LABEL: test18 -; CHECK: vpunpckhdq %zmm -; CHECK: ret -define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) { - %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15, i32 18, i32 26, i32 19, i32 27, i32 22, i32 30, i32 23, i32 31> - ret <16 x i32> %b -} - -; CHECK-LABEL: test19 -; CHECK: vpunpckldq %zmm -; CHECK: ret -define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) { - %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29> - ret <16 x i32> %b -} - -; 
CHECK-LABEL: test20 -; CHECK: vpunpckhqdq %zmm -; CHECK: ret -define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) { - %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15> - ret <8 x i64> %b -} - -; CHECK-LABEL: test21 -; CHECK: vunpcklps %zmm -; CHECK: ret -define <16 x float> @test21(<16 x float> %a, <16 x float> %c) { - %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29> - ret <16 x float> %b -} - -; CHECK-LABEL: test22 -; CHECK: vmovhlps {{.*}}## encoding: [0x62 -; CHECK: ret -define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind { - %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7> - ret <4 x i32> %c -} - -; CHECK-LABEL: @test23 -; CHECK: vshufps $-112, %zmm -; CHECK: ret -define <16 x float> @test23(<16 x float> %a, <16 x float> %c) { - %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30> - ret <16 x float> %b -} - -; CHECK-LABEL: @test24 -; CHECK: vpermt2d -; CHECK: ret -define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test25 -; CHECK: vshufps $52 -; CHECK: ret -define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 undef, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test26 -; CHECK: vmovshdup -; CHECK: ret -define 
<16 x i32> @test26(<16 x i32> %a) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test27 -; CHECK: ret -define <16 x i32> @test27(<4 x i32>%a) { - %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <16 x i32> %res -} - -; CHECK-LABEL: @test28 -; CHECK: vinserti64x4 $1 -; CHECK: ret -define <16 x i32> @test28(<16 x i32>%x, <16 x i32>%y) { - %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, - i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> - ret <16 x i32> %res -} - -; CHECK-LABEL: @test29 -; CHECK: vinserti64x4 $0 -; CHECK: ret -define <16 x i32> @test29(<16 x i32>%x, <16 x i32>%y) { - %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, - i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> - ret <16 x i32> %res -} - diff --git a/test/CodeGen/X86/avx512-trunc-ext.ll b/test/CodeGen/X86/avx512-trunc-ext.ll index 5e097be..91ef5d5 100644 --- a/test/CodeGen/X86/avx512-trunc-ext.ll +++ b/test/CodeGen/X86/avx512-trunc-ext.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s ; CHECK-LABEL: trunc_16x32_to_16x8 ; CHECK: vpmovdb @@ -118,6 +119,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) { ; CHECK-LABEL: sext_8i1_8i32 ; CHECK: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z} +; SKX: vpmovm2d ; CHECK: ret define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { %x = icmp slt <8 x i32> %a1, 
%a2 @@ -135,9 +137,8 @@ define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) { } ; CHECK-LABEL: trunc_i32_to_i1 -; CHECK: andl -; CHECK: kmov -; CHECK: kortest +; CHECK: testb +; CHECK: setne ; CKECK: orl ; CHECK: ret define i16 @trunc_i32_to_i1(i32 %a) { @@ -146,3 +147,30 @@ define i16 @trunc_i32_to_i1(i32 %a) { %res = bitcast <16 x i1> %maskv to i16 ret i16 %res } + +; CHECK-LABEL: sext_8i1_8i16 +; SKX: vpmovm2w +; CHECK: ret +define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind { + %x = icmp slt <8 x i32> %a1, %a2 + %y = sext <8 x i1> %x to <8 x i16> + ret <8 x i16> %y +} + +; CHECK-LABEL: sext_16i1_16i32 +; SKX: vpmovm2d +; CHECK: ret +define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind { + %x = icmp slt <16 x i32> %a1, %a2 + %y = sext <16 x i1> %x to <16 x i32> + ret <16 x i32> %y +} + +; CHECK-LABEL: sext_8i1_8i64 +; SKX: vpmovm2q +; CHECK: ret +define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind { + %x = icmp slt <8 x i32> %a1, %a2 + %y = sext <8 x i1> %x to <8 x i64> + ret <8 x i64> %y +} diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll index 9c6db11..0b0e0fc 100644 --- a/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/test/CodeGen/X86/avx512-vbroadcast.ll @@ -1,59 +1,72 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -;CHECK-LABEL: _inreg16xi32: -;CHECK: vpbroadcastd {{.*}}, %zmm -;CHECK: ret define <16 x i32> @_inreg16xi32(i32 %a) { +; CHECK-LABEL: _inreg16xi32: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastd %edi, %zmm0 +; CHECK-NEXT: retq %b = insertelement <16 x i32> undef, i32 %a, i32 0 %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer ret <16 x i32> %c } -;CHECK-LABEL: _inreg8xi64: -;CHECK: vpbroadcastq {{.*}}, %zmm -;CHECK: ret define <8 x i64> @_inreg8xi64(i64 %a) { +; CHECK-LABEL: _inreg8xi64: +; CHECK: ## 
BB#0: +; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 +; CHECK-NEXT: retq %b = insertelement <8 x i64> undef, i64 %a, i32 0 %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer ret <8 x i64> %c } -;CHECK-LABEL: _inreg16xfloat: -;CHECK: vbroadcastss {{.*}}, %zmm -;CHECK: ret define <16 x float> @_inreg16xfloat(float %a) { +; CHECK-LABEL: _inreg16xfloat: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 +; CHECK-NEXT: retq %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %c } -;CHECK-LABEL: _inreg8xdouble: -;CHECK: vbroadcastsd {{.*}}, %zmm -;CHECK: ret define <8 x double> @_inreg8xdouble(double %a) { +; CHECK-LABEL: _inreg8xdouble: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: retq %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer ret <8 x double> %c } -;CHECK-LABEL: _xmm16xi32 -;CHECK: vpbroadcastd -;CHECK: ret define <16 x i32> @_xmm16xi32(<16 x i32> %a) { +; CHECK-LABEL: _xmm16xi32: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 +; CHECK-NEXT: retq %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer ret <16 x i32> %b } -;CHECK-LABEL: _xmm16xfloat -;CHECK: vbroadcastss {{.*}}## encoding: [0x62 -;CHECK: ret define <16 x float> @_xmm16xfloat(<16 x float> %a) { +; CHECK-LABEL: _xmm16xfloat: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 +; CHECK-NEXT: retq %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %b } define <16 x i32> @test_vbroadcast() { - ; CHECK: vpbroadcastd +; CHECK-LABEL: test_vbroadcast: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vcmpunordps %zmm0, %zmm0, %k1 +; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; CHECK-NEXT: knotw %k1, %k1 +; CHECK-NEXT: 
vmovdqu32 %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq entry: %0 = sext <16 x i1> zeroinitializer to <16 x i32> %1 = fcmp uno <16 x float> undef, zeroinitializer @@ -62,3 +75,108 @@ entry: ret <16 x i32> %3 } +; We implement the set1 intrinsics with vector initializers. Verify that the +; IR generated will produce broadcasts at the end. +define <8 x double> @test_set1_pd(double %d) #2 { +; CHECK-LABEL: test_set1_pd: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: retq +entry: + %vecinit.i = insertelement <8 x double> undef, double %d, i32 0 + %vecinit1.i = insertelement <8 x double> %vecinit.i, double %d, i32 1 + %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %d, i32 2 + %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %d, i32 3 + %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %d, i32 4 + %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %d, i32 5 + %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %d, i32 6 + %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %d, i32 7 + ret <8 x double> %vecinit7.i +} + +define <8 x i64> @test_set1_epi64(i64 %d) #2 { +; CHECK-LABEL: test_set1_epi64: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 +; CHECK-NEXT: retq +entry: + %vecinit.i = insertelement <8 x i64> undef, i64 %d, i32 0 + %vecinit1.i = insertelement <8 x i64> %vecinit.i, i64 %d, i32 1 + %vecinit2.i = insertelement <8 x i64> %vecinit1.i, i64 %d, i32 2 + %vecinit3.i = insertelement <8 x i64> %vecinit2.i, i64 %d, i32 3 + %vecinit4.i = insertelement <8 x i64> %vecinit3.i, i64 %d, i32 4 + %vecinit5.i = insertelement <8 x i64> %vecinit4.i, i64 %d, i32 5 + %vecinit6.i = insertelement <8 x i64> %vecinit5.i, i64 %d, i32 6 + %vecinit7.i = insertelement <8 x i64> %vecinit6.i, i64 %d, i32 7 + ret <8 x i64> %vecinit7.i +} + +define <16 x float> @test_set1_ps(float %f) #2 { +; CHECK-LABEL: test_set1_ps: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: 
vbroadcastss %xmm0, %zmm0 +; CHECK-NEXT: retq +entry: + %vecinit.i = insertelement <16 x float> undef, float %f, i32 0 + %vecinit1.i = insertelement <16 x float> %vecinit.i, float %f, i32 1 + %vecinit2.i = insertelement <16 x float> %vecinit1.i, float %f, i32 2 + %vecinit3.i = insertelement <16 x float> %vecinit2.i, float %f, i32 3 + %vecinit4.i = insertelement <16 x float> %vecinit3.i, float %f, i32 4 + %vecinit5.i = insertelement <16 x float> %vecinit4.i, float %f, i32 5 + %vecinit6.i = insertelement <16 x float> %vecinit5.i, float %f, i32 6 + %vecinit7.i = insertelement <16 x float> %vecinit6.i, float %f, i32 7 + %vecinit8.i = insertelement <16 x float> %vecinit7.i, float %f, i32 8 + %vecinit9.i = insertelement <16 x float> %vecinit8.i, float %f, i32 9 + %vecinit10.i = insertelement <16 x float> %vecinit9.i, float %f, i32 10 + %vecinit11.i = insertelement <16 x float> %vecinit10.i, float %f, i32 11 + %vecinit12.i = insertelement <16 x float> %vecinit11.i, float %f, i32 12 + %vecinit13.i = insertelement <16 x float> %vecinit12.i, float %f, i32 13 + %vecinit14.i = insertelement <16 x float> %vecinit13.i, float %f, i32 14 + %vecinit15.i = insertelement <16 x float> %vecinit14.i, float %f, i32 15 + ret <16 x float> %vecinit15.i +} + +define <16 x i32> @test_set1_epi32(i32 %f) #2 { +; CHECK-LABEL: test_set1_epi32: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd %edi, %zmm0 +; CHECK-NEXT: retq +entry: + %vecinit.i = insertelement <16 x i32> undef, i32 %f, i32 0 + %vecinit1.i = insertelement <16 x i32> %vecinit.i, i32 %f, i32 1 + %vecinit2.i = insertelement <16 x i32> %vecinit1.i, i32 %f, i32 2 + %vecinit3.i = insertelement <16 x i32> %vecinit2.i, i32 %f, i32 3 + %vecinit4.i = insertelement <16 x i32> %vecinit3.i, i32 %f, i32 4 + %vecinit5.i = insertelement <16 x i32> %vecinit4.i, i32 %f, i32 5 + %vecinit6.i = insertelement <16 x i32> %vecinit5.i, i32 %f, i32 6 + %vecinit7.i = insertelement <16 x i32> %vecinit6.i, i32 %f, i32 7 + %vecinit8.i = insertelement 
<16 x i32> %vecinit7.i, i32 %f, i32 8 + %vecinit9.i = insertelement <16 x i32> %vecinit8.i, i32 %f, i32 9 + %vecinit10.i = insertelement <16 x i32> %vecinit9.i, i32 %f, i32 10 + %vecinit11.i = insertelement <16 x i32> %vecinit10.i, i32 %f, i32 11 + %vecinit12.i = insertelement <16 x i32> %vecinit11.i, i32 %f, i32 12 + %vecinit13.i = insertelement <16 x i32> %vecinit12.i, i32 %f, i32 13 + %vecinit14.i = insertelement <16 x i32> %vecinit13.i, i32 %f, i32 14 + %vecinit15.i = insertelement <16 x i32> %vecinit14.i, i32 %f, i32 15 + ret <16 x i32> %vecinit15.i +} + +; We implement the scalar broadcast intrinsics with vector initializers. +; Verify that the IR generated will produce the broadcast at the end. +define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) { +; CHECK-LABEL: test_mm512_broadcastsd_pd: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: retq +entry: + %0 = extractelement <2 x double> %a, i32 0 + %vecinit.i = insertelement <8 x double> undef, double %0, i32 0 + %vecinit1.i = insertelement <8 x double> %vecinit.i, double %0, i32 1 + %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %0, i32 2 + %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %0, i32 3 + %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %0, i32 4 + %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %0, i32 5 + %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %0, i32 6 + %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %0, i32 7 + ret <8 x double> %vecinit7.i +} diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index d762f00..c71e60e 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1,145 +1,176 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -; CHECK-LABEL: test1 -; CHECK: vcmpleps -; CHECK: vmovups -; CHECK: ret define <16 x float> @test1(<16 x float> %x, <16 x float> %y) 
nounwind { +; CHECK-LABEL: test1: +; CHECK: ## BB#0: +; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %mask = fcmp ole <16 x float> %x, %y %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y ret <16 x float> %max } -; CHECK-LABEL: test2 -; CHECK: vcmplepd -; CHECK: vmovupd -; CHECK: ret define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind { +; CHECK-LABEL: test2: +; CHECK: ## BB#0: +; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %mask = fcmp ole <8 x double> %x, %y %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y ret <8 x double> %max } -; CHECK-LABEL: test3 -; CHECK: vpcmpeqd (%rdi) -; CHECK: vmovdqu32 -; CHECK: ret define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind { +; CHECK-LABEL: test3: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %y = load <16 x i32>* %yp, align 4 %mask = icmp eq <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 ret <16 x i32> %max } -; CHECK-LABEL: @test4_unsigned -; CHECK: vpcmpnltud -; CHECK: vmovdqu32 -; CHECK: ret define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind { +; CHECK-LABEL: test4_unsigned: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %mask = icmp uge <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y ret <16 x i32> %max } -; CHECK-LABEL: test5 -; CHECK: vpcmpeqq {{.*}}%k1 -; CHECK: vmovdqu64 {{.*}}%k1 -; CHECK: ret define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind { +; CHECK-LABEL: test5: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; 
CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %mask = icmp eq <8 x i64> %x, %y %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y ret <8 x i64> %max } -; CHECK-LABEL: test6_unsigned -; CHECK: vpcmpnleuq {{.*}}%k1 -; CHECK: vmovdqu64 {{.*}}%k1 -; CHECK: ret define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y) nounwind { +; CHECK-LABEL: test6_unsigned: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %mask = icmp ugt <8 x i64> %x, %y %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y ret <8 x i64> %max } -; CHECK-LABEL: test7 -; CHECK: xor -; CHECK: vcmpltps -; CHECK: vblendvps -; CHECK: ret define <4 x float> @test7(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test7: +; CHECK: ## BB#0: +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpltps %xmm2, %xmm0, %xmm2 +; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %mask = fcmp olt <4 x float> %a, zeroinitializer %c = select <4 x i1>%mask, <4 x float>%a, <4 x float>%b ret <4 x float>%c } -; CHECK-LABEL: test8 -; CHECK: xor -; CHECK: vcmpltpd -; CHECK: vblendvpd -; CHECK: ret define <2 x double> @test8(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test8: +; CHECK: ## BB#0: +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2 +; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq %mask = fcmp olt <2 x double> %a, zeroinitializer %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b ret <2 x double>%c } -; CHECK-LABEL: test9 -; CHECK: vpcmpeqd -; CHECK: vpblendmd -; CHECK: ret define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind { +; CHECK-LABEL: test9: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: YMM1<def> YMM1<kill> ZMM1<def> +; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<def> +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: 
vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<kill> +; CHECK-NEXT: retq %mask = icmp eq <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y ret <8 x i32> %max } -; CHECK-LABEL: test10 -; CHECK: vcmpeqps -; CHECK: vblendmps -; CHECK: ret define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind { +; CHECK-LABEL: test10: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: YMM1<def> YMM1<kill> ZMM1<def> +; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<def> +; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1 +; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<kill> +; CHECK-NEXT: retq %mask = fcmp oeq <8 x float> %x, %y %max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y ret <8 x float> %max } -; CHECK-LABEL: test11_unsigned -; CHECK: vpmaxud -; CHECK: ret define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind { +; CHECK-LABEL: test11_unsigned: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq %mask = icmp ugt <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y ret <8 x i32> %max } -; CHECK-LABEL: test12 -; CHECK: vpcmpeqq %zmm2, %zmm0, [[LO:%k[0-7]]] -; CHECK: vpcmpeqq %zmm3, %zmm1, [[HI:%k[0-7]]] -; CHECK: kunpckbw [[LO]], [[HI]], {{%k[0-7]}} define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind { +; CHECK-LABEL: test12: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 +; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; CHECK-NEXT: kunpckbw %k0, %k1, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: AX<def> AX<kill> EAX<kill> +; CHECK-NEXT: retq %res = icmp eq <16 x i64> %a, %b %res1 = bitcast <16 x i1> %res to i16 ret i16 %res1 } -; CHECK-LABEL: test13 -; CHECK: vcmpeqps %zmm -; CHECK: vpbroadcastd -; CHECK: ret define <16 x i32> @test13(<16 x float>%a, <16 x float>%b) +; CHECK-LABEL: test13: +; CHECK: ## BB#0: +; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1 +; CHECK-NEXT: 
vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; CHECK-NEXT: retq { %cmpvector_i = fcmp oeq <16 x float> %a, %b %conv = zext <16 x i1> %cmpvector_i to <16 x i32> ret <16 x i32> %conv } -; CHECK-LABEL: test14 -; CHECK: vpcmp -; CHECK-NOT: vpcmp -; CHECK: vmovdqu32 {{.*}}{%k1} {z} -; CHECK: ret define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) { +; CHECK-LABEL: test14: +; CHECK: ## BB#0: +; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm1 +; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 +; CHECK-NEXT: knotw %k0, %k0 +; CHECK-NEXT: knotw %k0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq %sub_r = sub <16 x i32> %a, %b %cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a %sext.i3.i = sext <16 x i1> %cmp.i2.i to <16 x i32> @@ -148,12 +179,15 @@ define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) { ret <16 x i32>%res } -; CHECK-LABEL: test15 -; CHECK: vpcmpgtq -; CHECK-NOT: vpcmp -; CHECK: vmovdqu64 {{.*}}{%k1} {z} -; CHECK: ret define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) { +; CHECK-LABEL: test15: +; CHECK: ## BB#0: +; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm1 +; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 +; CHECK-NEXT: knotw %k0, %k0 +; CHECK-NEXT: knotw %k0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq %sub_r = sub <8 x i64> %a, %b %cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a %sext.i3.i = sext <8 x i1> %cmp.i2.i to <8 x i64> @@ -162,3 +196,181 @@ define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) { ret <8 x i64>%res } +define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y) nounwind { +; CHECK-LABEL: test16: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %mask = icmp sge <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y + ret <16 x i32> %max +} + +define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind { +; CHECK-LABEL: test17: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpgtd 
(%rdi), %zmm0, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %y = load <16 x i32>* %y.ptr, align 4 + %mask = icmp sgt <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind { +; CHECK-LABEL: test18: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %y = load <16 x i32>* %y.ptr, align 4 + %mask = icmp sle <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind { +; CHECK-LABEL: test19: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %y = load <16 x i32>* %y.ptr, align 4 + %mask = icmp ule <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) nounwind { +; CHECK-LABEL: test20: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp eq <16 x i32> %x1, %y1 + %mask0 = icmp eq <16 x i32> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y + ret <16 x i32> %max +} + +define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) nounwind { +; CHECK-LABEL: test21: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1} +; CHECK-NEXT: 
vmovdqa64 %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp sge <8 x i64> %x1, %y1 + %mask0 = icmp sle <8 x i64> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} + +define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind { +; CHECK-LABEL: test22: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp sgt <8 x i64> %x1, %y1 + %y = load <8 x i64>* %y.ptr, align 4 + %mask0 = icmp sgt <8 x i64> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} + +define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind { +; CHECK-LABEL: test23: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 +; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp sge <16 x i32> %x1, %y1 + %y = load <16 x i32>* %y.ptr, align 4 + %mask0 = icmp ule <16 x i32> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind { +; CHECK-LABEL: test24: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> 
zeroinitializer + %mask = icmp eq <8 x i64> %x, %y + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} + +define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind { +; CHECK-LABEL: test25: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer + %mask = icmp sle <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind { +; CHECK-LABEL: test26: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 +; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp sge <16 x i32> %x1, %y1 + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer + %mask0 = icmp sgt <16 x i32> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind { +; CHECK-LABEL: test27: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1 +; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp sge <8 x i64> %x1, %y1 + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x 
i32> zeroinitializer + %mask0 = icmp sle <8 x i64> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} diff --git a/test/CodeGen/X86/avx512-zext-load-crash.ll b/test/CodeGen/X86/avx512-zext-load-crash.ll deleted file mode 100644 index 07ded13..0000000 --- a/test/CodeGen/X86/avx512-zext-load-crash.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s - -define <8 x i16> @test_zext_load() { - ; CHECK: vmovq -entry: - %0 = load <2 x i16> ** undef, align 8 - %1 = getelementptr inbounds <2 x i16>* %0, i64 1 - %2 = load <2 x i16>* %0, align 1 - %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %4 = load <2 x i16>* %1, align 1 - %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %6 -} diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll new file mode 100644 index 0000000..bbc418c --- /dev/null +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -0,0 +1,305 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw --show-mc-encoding| FileCheck %s + +define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) { +; CHECK-LABEL: test_pcmpeq_b +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ## + %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) + ret i64 %res +} + +define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_b +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ## + %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) + ret 
i64 %res +} + +declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64) + +define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_pcmpeq_w +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ## + %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1) + ret i32 %res +} + +define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_w +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ## + %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32) + +define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) { +; CHECK-LABEL: test_pcmpgt_b +; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 ## + %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) + ret i64 %res +} + +define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_b +; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ## + %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) + ret i64 %res +} + +declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64) + +define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_pcmpgt_w +; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 ## + %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1) + ret i32 %res +} + +define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_w +; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ## + %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32) + +define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { +; CHECK_LABEL: test_cmp_b_512 +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ## + %res0 = call 
i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 +; CHECK: vpcmpltb %zmm1, %zmm0, %k0 ## + %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1) + %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 +; CHECK: vpcmpleb %zmm1, %zmm0, %k0 ## + %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1) + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 +; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 ## + %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1) + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 +; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 ## + %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1) + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 +; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 ## + %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1) + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 +; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 ## + %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1) + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 +; CHECK: vpcmpordb %zmm1, %zmm0, %k0 ## + %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1) + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 + ret <8 x i64> %vec7 +} + +define <8 x i64> @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { +; CHECK_LABEL: test_mask_cmp_b_512 +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 +; CHECK: vpcmpltb %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) + 
%vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 +; CHECK: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask) + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 +; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask) + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 +; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask) + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 +; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask) + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 +; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} ## + %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask) + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 +; CHECK: vpcmpordb %zmm1, %zmm0, %k0 {%k1} ## + %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask) + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 + ret <8 x i64> %vec7 +} + +declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone + +define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { +; CHECK_LABEL: test_ucmp_b_512 +; CHECK: vpcmpequb %zmm1, %zmm0, %k0 ## + %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 +; CHECK: vpcmpltub %zmm1, %zmm0, %k0 ## + %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1) + %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 +; CHECK: vpcmpleub %zmm1, %zmm0, %k0 ## + %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> 
%a0, <64 x i8> %a1, i32 2, i64 -1) + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 +; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 ## + %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1) + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 +; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 ## + %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1) + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 +; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 ## + %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1) + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 +; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 ## + %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1) + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 +; CHECK: vpcmpordub %zmm1, %zmm0, %k0 ## + %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1) + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 + ret <8 x i64> %vec7 +} + +define <8 x i64> @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { +; CHECK_LABEL: test_mask_ucmp_b_512 +; CHECK: vpcmpequb %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 +; CHECK: vpcmpltub %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) + %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 +; CHECK: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask) + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 +; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 
%mask) + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 +; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask) + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 +; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask) + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 +; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ## + %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask) + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 +; CHECK: vpcmpordub %zmm1, %zmm0, %k0 {%k1} ## + %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask) + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 + ret <8 x i64> %vec7 +} + +declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone + +define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) { +; CHECK_LABEL: test_cmp_w_512 +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ## + %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltw %zmm1, %zmm0, %k0 ## + %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmplew %zmm1, %zmm0, %k0 ## + %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 ## + %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 ## + %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x 
i16> %a0, <32 x i16> %a1, i32 4, i32 -1) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 ## + %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 ## + %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordw %zmm1, %zmm0, %k0 ## + %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +define <8 x i32> @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { +; CHECK_LABEL: test_mask_cmp_w_512 +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltw %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmplew %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 
5, i32 %mask) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} ## + %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordw %zmm1, %zmm0, %k0 {%k1} ## + %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone + +define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) { +; CHECK_LABEL: test_ucmp_w_512 +; CHECK: vpcmpequw %zmm1, %zmm0, %k0 ## + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 ## + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 ## + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 ## + %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 ## + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 ## + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 ## + %res6 = call i32 
@llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmporduw %zmm1, %zmm0, %k0 ## + %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +define <8 x i32> @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { +; CHECK_LABEL: test_mask_ucmp_w_512 +; CHECK: vpcmpequw %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ## + %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmporduw %zmm1, %zmm0, %k0 {%k1} ## + %res7 = 
call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone diff --git a/test/CodeGen/X86/avx512bw-mask-op.ll b/test/CodeGen/X86/avx512bw-mask-op.ll new file mode 100644 index 0000000..9d7630c --- /dev/null +++ b/test/CodeGen/X86/avx512bw-mask-op.ll @@ -0,0 +1,99 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s + +define i32 @mask32(i32 %x) { + %m0 = bitcast i32 %x to <32 x i1> + %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <32 x i1> %m1 to i32 + ret i32 %ret +; CHECK-LABEL: mask32 +; CHECK: kmovd +; CHECK-NEXT: knotd +; CHECK-NEXT: kmovd +; CHECK_NEXT: ret +} + +define i64 @mask64(i64 %x) { + %m0 = bitcast i64 %x to <64 x i1> + %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <64 x i1> %m1 to i64 + ret i64 %ret +; CHECK-LABEL: mask64 +; CHECK: kmovq +; CHECK-NEXT: knotq +; CHECK-NEXT: kmovq +; CHECK_NEXT: ret +} + +define void @mask32_mem(i32* %ptr) { + %x = load i32* %ptr, align 4 + %m0 = bitcast i32 %x to <32 x i1> + %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, 
i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <32 x i1> %m1 to i32 + store i32 %ret, i32* %ptr, align 4 + ret void +; CHECK-LABEL: mask32_mem +; CHECK: kmovd ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}} +; CHECK-NEXT: knotd +; CHECK-NEXT: kmovd %k{{[0-7]}}, ([[ARG1]]) +; CHECK_NEXT: ret +} + +define void @mask64_mem(i64* %ptr) { + %x = load i64* %ptr, align 4 + %m0 = bitcast i64 %x to <64 x i1> + %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <64 x i1> %m1 to i64 + store i64 %ret, i64* %ptr, align 4 + ret void +; CHECK-LABEL: mask64_mem +; CHECK: kmovq ([[ARG1]]), %k{{[0-7]}} +; CHECK-NEXT: knotq +; CHECK-NEXT: kmovq %k{{[0-7]}}, ([[ARG1]]) +; CHECK_NEXT: ret +} + +define i32 @mand32(i32 %x, i32 %y) { + %ma = bitcast i32 %x to <32 x i1> + %mb = bitcast i32 %y to <32 x i1> + %mc = and <32 x i1> %ma, %mb + %md = xor <32 x i1> %ma, %mb + %me = or <32 x i1> %mc, %md + %ret = bitcast <32 x i1> %me to i32 +; CHECK: kandd +; CHECK: kxord +; CHECK: kord + ret i32 %ret +} + +define i64 @mand64(i64 %x, i64 %y) { + %ma = bitcast i64 %x to <64 x i1> + %mb = bitcast i64 %y to <64 x i1> + %mc = and <64 x i1> %ma, %mb + %md = xor <64 x i1> %ma, %mb + %me = or <64 x i1> %mc, %md + %ret = bitcast <64 x i1> %me to i64 +; CHECK: kandq +; CHECK: kxorq +; CHECK: korq + ret i64 %ret +} diff --git a/test/CodeGen/X86/avx512bw-mov.ll b/test/CodeGen/X86/avx512bw-mov.ll new file mode 100644 index 0000000..2ff6d28 --- /dev/null +++ 
b/test/CodeGen/X86/avx512bw-mov.ll @@ -0,0 +1,81 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s + +; CHECK-LABEL: test1 +; CHECK: vmovdqu8 +; CHECK: ret +define <64 x i8> @test1(i8 * %addr) { + %vaddr = bitcast i8* %addr to <64 x i8>* + %res = load <64 x i8>* %vaddr, align 1 + ret <64 x i8>%res +} + +; CHECK-LABEL: test2 +; CHECK: vmovdqu8 +; CHECK: ret +define void @test2(i8 * %addr, <64 x i8> %data) { + %vaddr = bitcast i8* %addr to <64 x i8>* + store <64 x i8>%data, <64 x i8>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test3 +; CHECK: vmovdqu8{{.*{%k[1-7]}}} +; CHECK: ret +define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) { + %mask = icmp ne <64 x i8> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <64 x i8>* + %r = load <64 x i8>* %vaddr, align 1 + %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> %old + ret <64 x i8>%res +} + +; CHECK-LABEL: test4 +; CHECK: vmovdqu8{{.*{%k[1-7]} {z}}} +; CHECK: ret +define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) { + %mask = icmp ne <64 x i8> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <64 x i8>* + %r = load <64 x i8>* %vaddr, align 1 + %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> zeroinitializer + ret <64 x i8>%res +} + +; CHECK-LABEL: test5 +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test5(i8 * %addr) { + %vaddr = bitcast i8* %addr to <32 x i16>* + %res = load <32 x i16>* %vaddr, align 1 + ret <32 x i16>%res +} + +; CHECK-LABEL: test6 +; CHECK: vmovdqu16 +; CHECK: ret +define void @test6(i8 * %addr, <32 x i16> %data) { + %vaddr = bitcast i8* %addr to <32 x i16>* + store <32 x i16>%data, <32 x i16>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test7 +; CHECK: vmovdqu16{{.*{%k[1-7]}}} +; CHECK: ret +define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) { + %mask = icmp ne <32 x i16> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <32 x i16>* + %r = load <32 x i16>* 
%vaddr, align 1 + %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> %old + ret <32 x i16>%res +} + +; CHECK-LABEL: test8 +; CHECK: vmovdqu16{{.*{%k[1-7]} {z}}} +; CHECK: ret +define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) { + %mask = icmp ne <32 x i16> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <32 x i16>* + %r = load <32 x i16>* %vaddr, align 1 + %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> zeroinitializer + ret <32 x i16>%res +} diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll new file mode 100644 index 0000000..d2b1724 --- /dev/null +++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll @@ -0,0 +1,135 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s + +; CHECK-LABEL: test1 +; CHECK: vpcmpeqb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind { + %mask = icmp eq <64 x i8> %x, %y + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y + ret <64 x i8> %max +} + +; CHECK-LABEL: test2 +; CHECK: vpcmpgtb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y) nounwind { + %mask = icmp sgt <64 x i8> %x, %y + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y + ret <64 x i8> %max +} + +; CHECK-LABEL: @test3 +; CHECK: vpcmplew {{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind { + %mask = icmp sge <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y + ret <32 x i16> %max +} + +; CHECK-LABEL: test4 +; CHECK: vpcmpnleub {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y) nounwind { + %mask = icmp ugt <64 x i8> %x, %y + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y + ret <64 x i8> %max +} + +; CHECK-LABEL: test5 +; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; 
CHECK: ret +define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwind { + %y = load <32 x i16>* %yp, align 4 + %mask = icmp eq <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test6 +; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind { + %y = load <32 x i16>* %y.ptr, align 4 + %mask = icmp sgt <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test7 +; CHECK: vpcmplew (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind { + %y = load <32 x i16>* %y.ptr, align 4 + %mask = icmp sle <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test8 +; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind { + %y = load <32 x i16>* %y.ptr, align 4 + %mask = icmp ule <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test9 +; CHECK: vpcmpeqw %zmm{{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16> %y1) nounwind { + %mask1 = icmp eq <32 x i16> %x1, %y1 + %mask0 = icmp eq <32 x i16> %x, %y + %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %y + ret <32 x i16> %max +} + +; CHECK-LABEL: @test10 +; CHECK: vpcmpleb %zmm{{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y1) nounwind { + %mask1 = icmp sge <64 x i8> %x1, %y1 + %mask0 = icmp 
sle <64 x i8> %x, %y + %mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %x1 + ret <64 x i8> %max +} + +; CHECK-LABEL: @test11 +; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i8> %y1) nounwind { + %mask1 = icmp sgt <64 x i8> %x1, %y1 + %y = load <64 x i8>* %y.ptr, align 4 + %mask0 = icmp sgt <64 x i8> %x, %y + %mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %x1 + ret <64 x i8> %max +} + +; CHECK-LABEL: @test12 +; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 x i16> %y1) nounwind { + %mask1 = icmp sge <32 x i16> %x1, %y1 + %y = load <32 x i16>* %y.ptr, align 4 + %mask0 = icmp ule <32 x i16> %x, %y + %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll new file mode 100644 index 0000000..45f8d6d --- /dev/null +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -0,0 +1,613 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s + +; 256-bit + +define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_pcmpeq_b_256 +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ## + %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1) + ret i32 %res +} + +define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_b_256 +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## + %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> 
%b, i32 %mask) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32) + +define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_pcmpeq_w_256 +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_w_256 +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16) + +define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_pcmpgt_b_256 +; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ## + %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1) + ret i32 %res +} + +define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_b_256 +; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## + %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32) + +define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_pcmpgt_w_256 +; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_w_256 +; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16) + +define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { +; CHECK_LABEL: 
test_cmp_b_256 +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ## + %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ## + %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ## + %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ## + %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ## + %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ## + %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ## + %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ## + %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { +; CHECK_LABEL: test_mask_cmp_b_256 +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i32 
@llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone + +define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { +; CHECK_LABEL: test_ucmp_b_256 +; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ## + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ## + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleub 
%ymm1, %ymm0, %k0 ## + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ## + %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ## + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ## + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ## + %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ## + %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { +; CHECK_LABEL: test_mask_ucmp_b_256 +; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i32 
@llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone + +define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { +; CHECK-LABEL: test_cmp_w_256 +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ## + %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ## + %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmplew %ymm1, %ymm0, %k0 ## + %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ## + %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpneqw
%ymm1, %ymm0, %k0 ## + %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ## + %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ## + %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ## + %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) { +; CHECK-LABEL: test_mask_cmp_w_256 +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ## + %res5
= call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone + +define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { +; CHECK-LABEL: test_ucmp_w_256 +; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ## + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ## + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ## + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ## + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ## + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ## + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK:
vpcmpnleuw %ymm1, %ymm0, %k0 ## + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ## + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) { +; CHECK-LABEL: test_mask_ucmp_w_256 +; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +;
CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone + +; 128-bit + +define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_pcmpeq_b_128 +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_b_128 +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16) + +define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_pcmpeq_w_128 +; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_w_128 +; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8) + +define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_pcmpgt_b_128 +; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_b_128 +; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, 
<16 x i8> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16) + +define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_pcmpgt_w_128 +; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_w_128 +; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8) + +define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { +; CHECK-LABEL: test_cmp_b_128 +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ## + %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ## + %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ## + %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ## + %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ## + %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ## + %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnleb %xmm1,
%xmm0, %k0 ## + %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ## + %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { +; CHECK-LABEL: test_mask_cmp_b_128 +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ## + %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ## + %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call
i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone + +define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { +; CHECK-LABEL: test_ucmp_b_128 +; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ## + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ## + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ## + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ## + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ## + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ## + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ## + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ## + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16>
@test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { +; CHECK-LABEL: test_mask_ucmp_b_128 +; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ## + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ## + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone + +define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: test_cmp_w_128 +; CHECK: vpcmpeqw %xmm1,
%xmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmplew %xmm1, %xmm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_cmp_w_128 +; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask) + %vec1 =
insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: test_ucmp_w_128 +; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1) + %vec2 =
insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_ucmp_w_128 +; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequw %xmm1, %xmm0,
%k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone diff --git a/test/CodeGen/X86/avx512bwvl-mov.ll b/test/CodeGen/X86/avx512bwvl-mov.ll new file mode 100644 index 0000000..835844f --- /dev/null +++ b/test/CodeGen/X86/avx512bwvl-mov.ll @@ -0,0 +1,162 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s + +; CHECK-LABEL: test_256_1 +; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62 +; CHECK: ret +define <32 x i8> @test_256_1(i8 * %addr) { + %vaddr = bitcast i8* %addr to <32 x i8>* + %res = load <32 x i8>* %vaddr, align 1 + ret <32 x i8>%res +} + +; CHECK-LABEL: test_256_2 +; CHECK: vmovdqu8{{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_256_2(i8 * %addr, <32 x i8> %data) { + %vaddr = bitcast i8* %addr to <32 x i8>* + store <32 x i8>%data, <32 x i8>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_256_3 +; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62 +; CHECK: ret +define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) { + %mask = icmp ne <32 x i8> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <32 x i8>* + %r = 
load <32 x i8>* %vaddr, align 1 + %res = select <32 x i1> %mask, <32 x i8> %r, <32 x i8> %old + ret <32 x i8>%res +} + +; CHECK-LABEL: test_256_4 +; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62 +; CHECK: ret +define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) { + %mask = icmp ne <32 x i8> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <32 x i8>* + %r = load <32 x i8>* %vaddr, align 1 + %res = select <32 x i1> %mask, <32 x i8> %r, <32 x i8> zeroinitializer + ret <32 x i8>%res +} + +; CHECK-LABEL: test_256_5 +; CHECK: vmovdqu16{{.*}} ## encoding: [0x62 +; CHECK: ret +define <16 x i16> @test_256_5(i8 * %addr) { + %vaddr = bitcast i8* %addr to <16 x i16>* + %res = load <16 x i16>* %vaddr, align 1 + ret <16 x i16>%res +} + +; CHECK-LABEL: test_256_6 +; CHECK: vmovdqu16{{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_256_6(i8 * %addr, <16 x i16> %data) { + %vaddr = bitcast i8* %addr to <16 x i16>* + store <16 x i16>%data, <16 x i16>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_256_7 +; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62 +; CHECK: ret +define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) { + %mask = icmp ne <16 x i16> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i16>* + %r = load <16 x i16>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x i16> %r, <16 x i16> %old + ret <16 x i16>%res +} + +; CHECK-LABEL: test_256_8 +; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62 +; CHECK: ret +define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) { + %mask = icmp ne <16 x i16> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i16>* + %r = load <16 x i16>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x i16> %r, <16 x i16> zeroinitializer + ret <16 x i16>%res +} + +; CHECK-LABEL: test_128_1 +; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62 +; CHECK: ret +define <16 x i8> @test_128_1(i8 * %addr) { + %vaddr = bitcast i8* %addr to <16 x i8>* + %res = 
load <16 x i8>* %vaddr, align 1 + ret <16 x i8>%res +} + +; CHECK-LABEL: test_128_2 +; CHECK: vmovdqu8{{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_128_2(i8 * %addr, <16 x i8> %data) { + %vaddr = bitcast i8* %addr to <16 x i8>* + store <16 x i8>%data, <16 x i8>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_128_3 +; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62 +; CHECK: ret +define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) { + %mask = icmp ne <16 x i8> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i8>* + %r = load <16 x i8>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x i8> %r, <16 x i8> %old + ret <16 x i8>%res +} + +; CHECK-LABEL: test_128_4 +; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62 +; CHECK: ret +define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) { + %mask = icmp ne <16 x i8> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <16 x i8>* + %r = load <16 x i8>* %vaddr, align 1 + %res = select <16 x i1> %mask, <16 x i8> %r, <16 x i8> zeroinitializer + ret <16 x i8>%res +} + +; CHECK-LABEL: test_128_5 +; CHECK: vmovdqu16{{.*}} ## encoding: [0x62 +; CHECK: ret +define <8 x i16> @test_128_5(i8 * %addr) { + %vaddr = bitcast i8* %addr to <8 x i16>* + %res = load <8 x i16>* %vaddr, align 1 + ret <8 x i16>%res +} + +; CHECK-LABEL: test_128_6 +; CHECK: vmovdqu16{{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_128_6(i8 * %addr, <8 x i16> %data) { + %vaddr = bitcast i8* %addr to <8 x i16>* + store <8 x i16>%data, <8 x i16>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_128_7 +; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62 +; CHECK: ret +define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) { + %mask = icmp ne <8 x i16> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i16>* + %r = load <8 x i16>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i16> %r, <8 x i16> %old + ret <8 x i16>%res +} + +; CHECK-LABEL: 
test_128_8 +; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62 +; CHECK: ret +define <8 x i16> @test_128_8(i8 * %addr, <8 x i16> %mask1) { + %mask = icmp ne <8 x i16> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i16>* + %r = load <8 x i16>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i16> %r, <8 x i16> zeroinitializer + ret <8 x i16>%res +} + diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll new file mode 100644 index 0000000..2d13a16 --- /dev/null +++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll @@ -0,0 +1,269 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s + +; CHECK-LABEL: test256_1 +; CHECK: vpcmpeqb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind { + %mask = icmp eq <32 x i8> %x, %y + %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y + ret <32 x i8> %max +} + +; CHECK-LABEL: test256_2 +; CHECK: vpcmpgtb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind { + %mask = icmp sgt <32 x i8> %x, %y + %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 + ret <32 x i8> %max +} + +; CHECK-LABEL: @test256_3 +; CHECK: vpcmplew {{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounwind { + %mask = icmp sge <16 x i16> %x, %y + %max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y + ret <16 x i16> %max +} + +; CHECK-LABEL: test256_4 +; CHECK: vpcmpnleub {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind { + %mask = icmp ugt <32 x i8> %x, %y + %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 + ret <32 x i8> %max +} + +; CHECK-LABEL: test256_5 +; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> 
@test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nounwind { + %y = load <16 x i16>* %yp, align 4 + %mask = icmp eq <16 x i16> %x, %y + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1 + ret <16 x i16> %max +} + +; CHECK-LABEL: @test256_6 +; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind { + %y = load <16 x i16>* %y.ptr, align 4 + %mask = icmp sgt <16 x i16> %x, %y + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1 + ret <16 x i16> %max +} + +; CHECK-LABEL: @test256_7 +; CHECK: vpcmplew (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind { + %y = load <16 x i16>* %y.ptr, align 4 + %mask = icmp sle <16 x i16> %x, %y + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1 + ret <16 x i16> %max +} + +; CHECK-LABEL: @test256_8 +; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind { + %y = load <16 x i16>* %y.ptr, align 4 + %mask = icmp ule <16 x i16> %x, %y + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1 + ret <16 x i16> %max +} + +; CHECK-LABEL: @test256_9 +; CHECK: vpcmpeqw %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x i16> %y1) nounwind { + %mask1 = icmp eq <16 x i16> %x1, %y1 + %mask0 = icmp eq <16 x i16> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y + ret <16 x i16> %max +} + +; CHECK-LABEL: @test256_10 +; CHECK: vpcmpleb %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8> %y1) nounwind { + %mask1 = icmp sge <32 x i8> %x1, %y1 + 
%mask0 = icmp sle <32 x i8> %x, %y + %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer + %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 + ret <32 x i8> %max +} + +; CHECK-LABEL: @test256_11 +; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 x i8> %y1) nounwind { + %mask1 = icmp sgt <32 x i8> %x1, %y1 + %y = load <32 x i8>* %y.ptr, align 4 + %mask0 = icmp sgt <32 x i8> %x, %y + %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer + %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 + ret <32 x i8> %max +} + +; CHECK-LABEL: @test256_12 +; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, <16 x i16> %y1) nounwind { + %mask1 = icmp sge <16 x i16> %x1, %y1 + %y = load <16 x i16>* %y.ptr, align 4 + %mask0 = icmp ule <16 x i16> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1 + ret <16 x i16> %max +} + +; CHECK-LABEL: test128_1 +; CHECK: vpcmpeqb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind { + %mask = icmp eq <16 x i8> %x, %y + %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y + ret <16 x i8> %max +} + +; CHECK-LABEL: test128_2 +; CHECK: vpcmpgtb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind { + %mask = icmp sgt <16 x i8> %x, %y + %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 + ret <16 x i8> %max +} + +; CHECK-LABEL: @test128_3 +; CHECK: vpcmplew {{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind { + %mask = icmp sge <8 x i16> %x, %y 
+ %max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y + ret <8 x i16> %max +} + +; CHECK-LABEL: test128_4 +; CHECK: vpcmpnleub {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind { + %mask = icmp ugt <16 x i8> %x, %y + %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 + ret <16 x i8> %max +} + +; CHECK-LABEL: test128_5 +; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwind { + %y = load <8 x i16>* %yp, align 4 + %mask = icmp eq <8 x i16> %x, %y + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1 + ret <8 x i16> %max +} + +; CHECK-LABEL: @test128_6 +; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind { + %y = load <8 x i16>* %y.ptr, align 4 + %mask = icmp sgt <8 x i16> %x, %y + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1 + ret <8 x i16> %max +} + +; CHECK-LABEL: @test128_7 +; CHECK: vpcmplew (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind { + %y = load <8 x i16>* %y.ptr, align 4 + %mask = icmp sle <8 x i16> %x, %y + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1 + ret <8 x i16> %max +} + +; CHECK-LABEL: @test128_8 +; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind { + %y = load <8 x i16>* %y.ptr, align 4 + %mask = icmp ule <8 x i16> %x, %y + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1 + ret <8 x i16> %max +} + +; CHECK-LABEL: @test128_9 +; CHECK: vpcmpeqw %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> %y1) nounwind { + %mask1 = icmp 
eq <8 x i16> %x1, %y1 + %mask0 = icmp eq <8 x i16> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y + ret <8 x i16> %max +} + +; CHECK-LABEL: @test128_10 +; CHECK: vpcmpleb %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8> %y1) nounwind { + %mask1 = icmp sge <16 x i8> %x1, %y1 + %mask0 = icmp sle <16 x i8> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 + ret <16 x i8> %max +} + +; CHECK-LABEL: @test128_11 +; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 x i8> %y1) nounwind { + %mask1 = icmp sgt <16 x i8> %x1, %y1 + %y = load <16 x i8>* %y.ptr, align 4 + %mask0 = icmp sgt <16 x i8> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 + ret <16 x i8> %max +} + +; CHECK-LABEL: @test128_12 +; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8 x i16> %y1) nounwind { + %mask1 = icmp sge <8 x i16> %x1, %y1 + %y = load <8 x i16>* %y.ptr, align 4 + %mask0 = icmp ule <8 x i16> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1 + ret <8 x i16> %max +} diff --git a/test/CodeGen/X86/avx512dq-mask-op.ll b/test/CodeGen/X86/avx512dq-mask-op.ll new file mode 100644 index 0000000..32a2633 --- /dev/null +++ b/test/CodeGen/X86/avx512dq-mask-op.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s + +define i8 @mask8(i8 %x) { + %m0 = bitcast i8 %x to <8 x i1> + %m1 = xor <8 
x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <8 x i1> %m1 to i8 + ret i8 %ret +; CHECK: mask8 +; CHECK: knotb +; CHECK: ret +} + +define void @mask8_mem(i8* %ptr) { + %x = load i8* %ptr, align 4 + %m0 = bitcast i8 %x to <8 x i1> + %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <8 x i1> %m1 to i8 + store i8 %ret, i8* %ptr, align 4 + ret void +; CHECK-LABEL: mask8_mem +; CHECK: kmovb ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}} +; CHECK-NEXT: knotb +; CHECK-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]]) +; CHECK: ret +} + +define i8 @mand8(i8 %x, i8 %y) { + %ma = bitcast i8 %x to <8 x i1> + %mb = bitcast i8 %y to <8 x i1> + %mc = and <8 x i1> %ma, %mb + %md = xor <8 x i1> %ma, %mb + %me = or <8 x i1> %mc, %md + %ret = bitcast <8 x i1> %me to i8 +; CHECK: kandb +; CHECK: kxorb +; CHECK: korb + ret i8 %ret +} diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll new file mode 100644 index 0000000..0000ece --- /dev/null +++ b/test/CodeGen/X86/avx512er-intrinsics.ll @@ -0,0 +1,79 @@ +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=knl --show-mc-encoding| FileCheck %s + +define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) { + ; CHECK: vrsqrt28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0] + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) { + ; CHECK: kmovw + ; CHECK: vrsqrt28ps %zmm0, %zmm1 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8] + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) { + ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0] + %res = call <16 x float> 
@llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) { + ; CHECK: kmovw + ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0] + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 6, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) { + ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0] + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8) + ret <16 x float> %res +} + + +declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) { + ; CHECK: vrcp28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0] + %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) { + ; CHECK: vrcp28pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0] + %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <16 x float> @test_exp2_ps_512(<16 x float> %a0) { + ; CHECK: vexp2ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0] + %res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <8 
x double> @test_exp2_pd_512(<8 x double> %a0) { + ; CHECK: vexp2pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0] + %res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) { + ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0] + %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_rcp28_ss(<4 x float> %a0) { + ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0] + %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll new file mode 100644 index 0000000..fa19084 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -0,0 +1,613 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s + +; 256-bit + +define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: test_pcmpeq_d_256 +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_d_256 +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## + %res = call i8 
@llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8) + +define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: test_pcmpeq_q_256 +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_q_256 +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8) + +define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: test_pcmpgt_d_256 +; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_d_256 +; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8) + +define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: test_pcmpgt_q_256 +; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_q_256 +; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8) + +define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL:
test_cmp_d_256 +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpled %ymm1, %ymm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_cmp_d_256 +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x
i32> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: test_ucmp_d_256 +; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x
i32> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_ucmp_d_256 +; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone + +define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { +; CHECK-LABEL: test_cmp_q_256 +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltq
%ymm1, %ymm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_cmp_q_256 +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x
i64> %a0, <4 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { +; CHECK-LABEL: test_ucmp_q_256 +; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8
-1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_ucmp_q_256 +; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone + +; 128-bit + +define i8 @test_pcmpeq_d_128(<4 x i32> %a,
<4 x i32> %b) { +; CHECK-LABEL: test_pcmpeq_d_128 +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_d_128 +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8) + +define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_pcmpeq_q_128 +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_q_128 +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8) + +define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_pcmpgt_d_128 +; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_d_128 +; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8) + +define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_pcmpgt_q_128 +; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 
x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_q_128 +; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8) + +define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: test_cmp_d_128 +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpled %xmm1, %xmm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x
i32> %a1, i8 %mask) { ; one call per predicate immediate 0-7; mask operand is the %mask argument +; CHECK-LABEL: test_mask_cmp_d_128 +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; one call per predicate immediate 0-7; mask operand is the constant i8 -1 +; CHECK-LABEL: test_ucmp_d_128 +; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1,
i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { ; one call per predicate immediate 0-7; mask operand is the %mask argument +; CHECK-LABEL: test_mask_ucmp_d_128 +; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleud %xmm1,
%xmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone + +define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; one call per predicate immediate 0-7; mask operand is the constant i8 -1 +; CHECK-LABEL: test_cmp_q_128 +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordq
%xmm1, %xmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; one call per predicate immediate 0-7; mask operand is the %mask argument +; CHECK-LABEL: test_mask_cmp_q_128 +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64>
%a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; one call per predicate immediate 0-7; mask operand is the constant i8 -1 +; CHECK-LABEL: test_ucmp_q_128 +; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1) +
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; one call per predicate immediate 0-7; mask operand is the %mask argument +; CHECK-LABEL: test_mask_ucmp_q_128 +; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +;
CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone diff --git a/test/CodeGen/X86/avx512vl-mov.ll b/test/CodeGen/X86/avx512vl-mov.ll new file mode 100644 index 0000000..3224656 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-mov.ll @@ -0,0 +1,642 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s + +; CHECK-LABEL: test_256_1 +; CHECK: vmovdqu32 +; CHECK: ret +define <8 x i32> @test_256_1(i8 * %addr) { + %vaddr = bitcast i8* %addr to <8 x i32>* + %res = load <8 x i32>* %vaddr, align 1 + ret <8 x i32>%res +} + +; CHECK-LABEL: test_256_2 +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test_256_2(i8 * %addr) { + %vaddr = bitcast i8* %addr to <8 x i32>* + %res = load <8 x i32>* %vaddr, align 32 + ret <8 x i32>%res +} + +; CHECK-LABEL: test_256_3 +; CHECK: vmovdqa64 +; CHECK: ret +define void @test_256_3(i8 * %addr, <4 x i64> %data) { + %vaddr = bitcast i8* %addr to <4 x i64>* + store <4 x i64>%data, <4 x i64>* %vaddr, align 32 + ret void +} + +; CHECK-LABEL: test_256_4 +; CHECK: vmovdqu32 +; CHECK: ret +define void @test_256_4(i8 * %addr, <8 x i32> %data) { + %vaddr = bitcast i8* %addr to <8 x i32>* + store <8 x i32>%data, <8 x i32>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_256_5 +; CHECK: vmovdqa32 +; CHECK: ret +define void @test_256_5(i8 * %addr, <8 x i32> %data) { + %vaddr = bitcast i8* %addr to <8 x i32>* + store <8 x i32>%data, <8 x i32>* %vaddr, align 32 + ret void +} + +; CHECK-LABEL: test_256_6 +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test_256_6(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x i64>* + %res = load <4 x i64>* %vaddr, align 32 + ret <4 x i64>%res +} + +; CHECK-LABEL: test_256_7 +; 
CHECK: vmovdqu64 +; CHECK: ret +define void @test_256_7(i8 * %addr, <4 x i64> %data) { + %vaddr = bitcast i8* %addr to <4 x i64>* + store <4 x i64>%data, <4 x i64>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_256_8 +; CHECK: vmovdqu64 +; CHECK: ret +define <4 x i64> @test_256_8(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x i64>* + %res = load <4 x i64>* %vaddr, align 1 + ret <4 x i64>%res +} + +; CHECK-LABEL: test_256_9 +; CHECK: vmovapd {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_256_9(i8 * %addr, <4 x double> %data) { + %vaddr = bitcast i8* %addr to <4 x double>* + store <4 x double>%data, <4 x double>* %vaddr, align 32 + ret void +} + +; CHECK-LABEL: test_256_10 +; CHECK: vmovapd {{.*}} ## encoding: [0x62 +; CHECK: ret +define <4 x double> @test_256_10(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x double>* + %res = load <4 x double>* %vaddr, align 32 + ret <4 x double>%res +} + +; CHECK-LABEL: test_256_11 +; CHECK: vmovaps {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_256_11(i8 * %addr, <8 x float> %data) { + %vaddr = bitcast i8* %addr to <8 x float>* + store <8 x float>%data, <8 x float>* %vaddr, align 32 + ret void +} + +; CHECK-LABEL: test_256_12 +; CHECK: vmovaps {{.*}} ## encoding: [0x62 +; CHECK: ret +define <8 x float> @test_256_12(i8 * %addr) { + %vaddr = bitcast i8* %addr to <8 x float>* + %res = load <8 x float>* %vaddr, align 32 + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_13 +; CHECK: vmovupd {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_256_13(i8 * %addr, <4 x double> %data) { + %vaddr = bitcast i8* %addr to <4 x double>* + store <4 x double>%data, <4 x double>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_256_14 +; CHECK: vmovupd {{.*}} ## encoding: [0x62 +; CHECK: ret +define <4 x double> @test_256_14(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x double>* + %res = load <4 x double>* %vaddr, align 1 + ret <4 x double>%res +} + +; CHECK-LABEL: test_256_15 +; CHECK: 
vmovups {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_256_15(i8 * %addr, <8 x float> %data) { + %vaddr = bitcast i8* %addr to <8 x float>* + store <8 x float>%data, <8 x float>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_256_16 +; CHECK: vmovups {{.*}} ## encoding: [0x62 +; CHECK: ret +define <8 x float> @test_256_16(i8 * %addr) { + %vaddr = bitcast i8* %addr to <8 x float>* + %res = load <8 x float>* %vaddr, align 1 + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_17 +; CHECK: vmovdqa32{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>* %vaddr, align 32 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old + ret <8 x i32>%res +} + +; CHECK-LABEL: test_256_18 +; CHECK: vmovdqu32{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old + ret <8 x i32>%res +} + +; CHECK-LABEL: test_256_19 +; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>* %vaddr, align 32 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer + ret <8 x i32>%res +} + +; CHECK-LABEL: test_256_20 +; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer + ret <8 x i32>%res +} + +; CHECK-LABEL: 
test_256_21 +; CHECK: vmovdqa64{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>* %vaddr, align 32 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old + ret <4 x i64>%res +} + +; CHECK-LABEL: test_256_22 +; CHECK: vmovdqu64{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old + ret <4 x i64>%res +} + +; CHECK-LABEL: test_256_23 +; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>* %vaddr, align 32 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer + ret <4 x i64>%res +} + +; CHECK-LABEL: test_256_24 +; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer + ret <4 x i64>%res +} + +; CHECK-LABEL: test_256_25 +; CHECK: vmovaps{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1) { + %mask = fcmp one <8 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x float>* + %r = load <8 x float>* %vaddr, align 32 + %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> %old + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_26 +; CHECK: vmovups{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x float> @test_256_26(i8 
* %addr, <8 x float> %old, <8 x float> %mask1) { + %mask = fcmp one <8 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x float>* + %r = load <8 x float>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> %old + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_27 +; CHECK: vmovaps{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x float> @test_256_27(i8 * %addr, <8 x float> %mask1) { + %mask = fcmp one <8 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x float>* + %r = load <8 x float>* %vaddr, align 32 + %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> zeroinitializer + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_28 +; CHECK: vmovups{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) { + %mask = fcmp one <8 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x float>* + %r = load <8 x float>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> zeroinitializer + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_29 +; CHECK: vmovapd{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x double>* + %r = load <4 x double>* %vaddr, align 32 + %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> %old + ret <4 x double>%res +} + +; CHECK-LABEL: test_256_30 +; CHECK: vmovupd{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x double>* + %r = load <4 x double>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> %old + ret <4 x double>%res +} + +; CHECK-LABEL: test_256_31 +; CHECK: vmovapd{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x double> @test_256_31(i8 * %addr, <4 x i64> 
%mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x double>* + %r = load <4 x double>* %vaddr, align 32 + %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> zeroinitializer + ret <4 x double>%res +} + +; CHECK-LABEL: test_256_32 +; CHECK: vmovupd{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x double>* + %r = load <4 x double>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> zeroinitializer + ret <4 x double>%res +} + +; CHECK-LABEL: test_128_1 +; CHECK: vmovdqu32 +; CHECK: ret +define <4 x i32> @test_128_1(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x i32>* + %res = load <4 x i32>* %vaddr, align 1 + ret <4 x i32>%res +} + +; CHECK-LABEL: test_128_2 +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test_128_2(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x i32>* + %res = load <4 x i32>* %vaddr, align 16 + ret <4 x i32>%res +} + +; CHECK-LABEL: test_128_3 +; CHECK: vmovdqa64 +; CHECK: ret +define void @test_128_3(i8 * %addr, <2 x i64> %data) { + %vaddr = bitcast i8* %addr to <2 x i64>* + store <2 x i64>%data, <2 x i64>* %vaddr, align 16 + ret void +} + +; CHECK-LABEL: test_128_4 +; CHECK: vmovdqu32 +; CHECK: ret +define void @test_128_4(i8 * %addr, <4 x i32> %data) { + %vaddr = bitcast i8* %addr to <4 x i32>* + store <4 x i32>%data, <4 x i32>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_128_5 +; CHECK: vmovdqa32 +; CHECK: ret +define void @test_128_5(i8 * %addr, <4 x i32> %data) { + %vaddr = bitcast i8* %addr to <4 x i32>* + store <4 x i32>%data, <4 x i32>* %vaddr, align 16 + ret void +} + +; CHECK-LABEL: test_128_6 +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test_128_6(i8 * %addr) { + %vaddr = bitcast i8* %addr to <2 x i64>* + %res = load <2 x i64>* %vaddr, align 16 + ret <2 x i64>%res +} + +; CHECK-LABEL: 
test_128_7 +; CHECK: vmovdqu64 +; CHECK: ret +define void @test_128_7(i8 * %addr, <2 x i64> %data) { + %vaddr = bitcast i8* %addr to <2 x i64>* + store <2 x i64>%data, <2 x i64>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_128_8 +; CHECK: vmovdqu64 +; CHECK: ret +define <2 x i64> @test_128_8(i8 * %addr) { + %vaddr = bitcast i8* %addr to <2 x i64>* + %res = load <2 x i64>* %vaddr, align 1 + ret <2 x i64>%res +} + +; CHECK-LABEL: test_128_9 +; CHECK: vmovapd {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_128_9(i8 * %addr, <2 x double> %data) { + %vaddr = bitcast i8* %addr to <2 x double>* + store <2 x double>%data, <2 x double>* %vaddr, align 16 + ret void +} + +; CHECK-LABEL: test_128_10 +; CHECK: vmovapd {{.*}} ## encoding: [0x62 +; CHECK: ret +define <2 x double> @test_128_10(i8 * %addr) { + %vaddr = bitcast i8* %addr to <2 x double>* + %res = load <2 x double>* %vaddr, align 16 + ret <2 x double>%res +} + +; CHECK-LABEL: test_128_11 +; CHECK: vmovaps {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_128_11(i8 * %addr, <4 x float> %data) { + %vaddr = bitcast i8* %addr to <4 x float>* + store <4 x float>%data, <4 x float>* %vaddr, align 16 + ret void +} + +; CHECK-LABEL: test_128_12 +; CHECK: vmovaps {{.*}} ## encoding: [0x62 +; CHECK: ret +define <4 x float> @test_128_12(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x float>* + %res = load <4 x float>* %vaddr, align 16 + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_13 +; CHECK: vmovupd {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_128_13(i8 * %addr, <2 x double> %data) { + %vaddr = bitcast i8* %addr to <2 x double>* + store <2 x double>%data, <2 x double>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_128_14 +; CHECK: vmovupd {{.*}} ## encoding: [0x62 +; CHECK: ret +define <2 x double> @test_128_14(i8 * %addr) { + %vaddr = bitcast i8* %addr to <2 x double>* + %res = load <2 x double>* %vaddr, align 1 + ret <2 x double>%res +} + +; CHECK-LABEL: test_128_15 
+; CHECK: vmovups {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_128_15(i8 * %addr, <4 x float> %data) { + %vaddr = bitcast i8* %addr to <4 x float>* + store <4 x float>%data, <4 x float>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_128_16 +; CHECK: vmovups {{.*}} ## encoding: [0x62 +; CHECK: ret +define <4 x float> @test_128_16(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x float>* + %res = load <4 x float>* %vaddr, align 1 + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_17 +; CHECK: vmovdqa32{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>* %vaddr, align 16 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old + ret <4 x i32>%res +} + +; CHECK-LABEL: test_128_18 +; CHECK: vmovdqu32{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old + ret <4 x i32>%res +} + +; CHECK-LABEL: test_128_19 +; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>* %vaddr, align 16 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer + ret <4 x i32>%res +} + +; CHECK-LABEL: test_128_20 +; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer + ret <4 x i32>%res +} + +; 
CHECK-LABEL: test_128_21 +; CHECK: vmovdqa64{{.*{%k[1-7]} }} +; CHECK: ret +define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x i64>* %vaddr, align 16 + %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old + ret <2 x i64>%res +} + +; CHECK-LABEL: test_128_22 +; CHECK: vmovdqu64{{.*{%k[1-7]} }} +; CHECK: ret +define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x i64>* %vaddr, align 1 + %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old + ret <2 x i64>%res +} + +; CHECK-LABEL: test_128_23 +; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x i64>* %vaddr, align 16 + %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer + ret <2 x i64>%res +} + +; CHECK-LABEL: test_128_24 +; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x i64>* %vaddr, align 1 + %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer + ret <2 x i64>%res +} + +; CHECK-LABEL: test_128_25 +; CHECK: vmovaps{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x float>* + %r = load <4 x float>* %vaddr, align 16 + %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> %old + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_26 +; CHECK: vmovups{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x float> 
@test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x float>* + %r = load <4 x float>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> %old + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_27 +; CHECK: vmovaps{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x float>* + %r = load <4 x float>* %vaddr, align 16 + %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> zeroinitializer + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_28 +; CHECK: vmovups{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x float>* + %r = load <4 x float>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> zeroinitializer + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_29 +; CHECK: vmovapd{{.*{%k[1-7]} }} +; CHECK: ret +define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x double>* + %r = load <2 x double>* %vaddr, align 16 + %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> %old + ret <2 x double>%res +} + +; CHECK-LABEL: test_128_30 +; CHECK: vmovupd{{.*{%k[1-7]} }} +; CHECK: ret +define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x double>* + %r = load <2 x double>* %vaddr, align 1 + %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> %old + ret <2 x double>%res +} + +; CHECK-LABEL: test_128_31 +; CHECK: vmovapd{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <2 x double> @test_128_31(i8 * %addr, <2 x i64> 
%mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x double>* + %r = load <2 x double>* %vaddr, align 16 + %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> zeroinitializer + ret <2 x double>%res +} + +; CHECK-LABEL: test_128_32 +; CHECK: vmovupd{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <2 x double> @test_128_32(i8 * %addr, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x double>* + %r = load <2 x double>* %vaddr, align 1 + %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> zeroinitializer + ret <2 x double>%res +} + diff --git a/test/CodeGen/X86/avx512vl-nontemporal.ll b/test/CodeGen/X86/avx512vl-nontemporal.ll new file mode 100644 index 0000000..2ad9768 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-nontemporal.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s + +define void @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE) { +; CHECK: vmovntps %ymm{{.*}} ## encoding: [0x62 + %cast = bitcast i8* %B to <8 x float>* + %A2 = fadd <8 x float> %A, %AA + store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0 +; CHECK: vmovntdq %ymm{{.*}} ## encoding: [0x62 + %cast1 = bitcast i8* %B to <4 x i64>* + %E2 = add <4 x i64> %E, %EE + store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0 +; CHECK: vmovntpd %ymm{{.*}} ## encoding: [0x62 + %cast2 = bitcast i8* %B to <4 x double>* + %C2 = fadd <4 x double> %C, %CC + store <4 x double> %C2, <4 x double>* %cast2, align 64, !nontemporal !0 + ret void +} + +define void @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE) { +; CHECK: vmovntps %xmm{{.*}} ## encoding: [0x62 + %cast = bitcast i8* %B to <4 x float>* + %A2 = fadd <4 x float> %A, %AA + store <4 x float> %A2, <4 x float>* 
%cast, align 64, !nontemporal !0 +; CHECK: vmovntdq %xmm{{.*}} ## encoding: [0x62 + %cast1 = bitcast i8* %B to <2 x i64>* + %E2 = add <2 x i64> %E, %EE + store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0 +; CHECK: vmovntpd %xmm{{.*}} ## encoding: [0x62 + %cast2 = bitcast i8* %B to <2 x double>* + %C2 = fadd <2 x double> %C, %CC + store <2 x double> %C2, <2 x double>* %cast2, align 64, !nontemporal !0 + ret void +} +!0 = metadata !{i32 1} diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll new file mode 100644 index 0000000..9c64c03 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll @@ -0,0 +1,381 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s + +; CHECK-LABEL: test256_1 +; CHECK: vpcmpeqq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind { + %mask = icmp eq <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y + ret <4 x i64> %max +} + +; CHECK-LABEL: test256_2 +; CHECK: vpcmpgtq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y) nounwind { + %mask = icmp sgt <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y + ret <4 x i64> %max +} + +; CHECK-LABEL: @test256_3 +; CHECK: vpcmpled {{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind { + %mask = icmp sge <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_4 +; CHECK: vpcmpnleuq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y) nounwind { + %mask = icmp ugt <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y + ret <4 x i64> %max +} + +; CHECK-LABEL: test256_5 +; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 
+; CHECK: ret +define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { + %y = load <8 x i32>* %yp, align 4 + %mask = icmp eq <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_6 +; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { + %y = load <8 x i32>* %y.ptr, align 4 + %mask = icmp sgt <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_7 +; CHECK: vpcmpled (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { + %y = load <8 x i32>* %y.ptr, align 4 + %mask = icmp sle <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_8 +; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { + %y = load <8 x i32>* %y.ptr, align 4 + %mask = icmp ule <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_9 +; CHECK: vpcmpeqd %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind { + %mask1 = icmp eq <8 x i32> %x1, %y1 + %mask0 = icmp eq <8 x i32> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_10 +; CHECK: vpcmpleq %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind { + %mask1 = icmp sge <4 x i64> %x1, %y1 + %mask0 = icmp sle <4 
x i64> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: @test256_11 +; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind { + %mask1 = icmp sgt <4 x i64> %x1, %y1 + %y = load <4 x i64>* %y.ptr, align 4 + %mask0 = icmp sgt <4 x i64> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: @test256_12 +; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind { + %mask1 = icmp sge <8 x i32> %x1, %y1 + %y = load <8 x i32>* %y.ptr, align 4 + %mask0 = icmp ule <8 x i32> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_13 +; CHECK: vpcmpeqq (%rdi){1to4}, %ymm +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind { + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer + %mask = icmp eq <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: test256_14 +; CHECK: vpcmpled (%rdi){1to8}, %ymm +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind { + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer + %mask = icmp sle <8 x 
i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_15 +; CHECK: vpcmpgtd (%rdi){1to8}, %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind { + %mask1 = icmp sge <8 x i32> %x1, %y1 + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer + %mask0 = icmp sgt <8 x i32> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_16 +; CHECK: vpcmpgtq (%rdi){1to4}, %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind { + %mask1 = icmp sge <4 x i64> %x1, %y1 + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer + %mask0 = icmp sgt <4 x i64> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: test128_1 +; CHECK: vpcmpeqq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind { + %mask = icmp eq <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y + ret <2 x i64> %max +} + +; CHECK-LABEL: test128_2 +; CHECK: vpcmpgtq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y) nounwind { + %mask = icmp sgt <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y + ret <2 x i64> %max +} + +; CHECK-LABEL: @test128_3 +; CHECK: vpcmpled {{.*%k[0-7]}} +; CHECK: 
vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind { + %mask = icmp sge <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_4 +; CHECK: vpcmpnleuq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y) nounwind { + %mask = icmp ugt <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y + ret <2 x i64> %max +} + +; CHECK-LABEL: test128_5 +; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind { + %y = load <4 x i32>* %yp, align 4 + %mask = icmp eq <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_6 +; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { + %y = load <4 x i32>* %y.ptr, align 4 + %mask = icmp sgt <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_7 +; CHECK: vpcmpled (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { + %y = load <4 x i32>* %y.ptr, align 4 + %mask = icmp sle <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_8 +; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { + %y = load <4 x i32>* %y.ptr, align 4 + %mask = icmp ule <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_9 +; CHECK: vpcmpeqd %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 
+; CHECK: ret +define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind { + %mask1 = icmp eq <4 x i32> %x1, %y1 + %mask0 = icmp eq <4 x i32> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_10 +; CHECK: vpcmpleq %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind { + %mask1 = icmp sge <2 x i64> %x1, %y1 + %mask0 = icmp sle <2 x i64> %x, %y + %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} + +; CHECK-LABEL: @test128_11 +; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind { + %mask1 = icmp sgt <2 x i64> %x1, %y1 + %y = load <2 x i64>* %y.ptr, align 4 + %mask0 = icmp sgt <2 x i64> %x, %y + %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} + +; CHECK-LABEL: @test128_12 +; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind { + %mask1 = icmp sge <4 x i32> %x1, %y1 + %y = load <4 x i32>* %y.ptr, align 4 + %mask0 = icmp ule <4 x i32> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_13 +; CHECK: vpcmpeqq (%rdi){1to2}, %xmm +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind { + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <2 x 
i64> undef, i64 %yb, i32 0 + %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1 + %mask = icmp eq <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} + +; CHECK-LABEL: test128_14 +; CHECK: vpcmpled (%rdi){1to4}, %xmm +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind { + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer + %mask = icmp sle <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_15 +; CHECK: vpcmpgtd (%rdi){1to4}, %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind { + %mask1 = icmp sge <4 x i32> %x1, %y1 + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer + %mask0 = icmp sgt <4 x i32> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_16 +; CHECK: vpcmpgtq (%rdi){1to2}, %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind { + %mask1 = icmp sge <2 x i64> %x1, %y1 + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0 + %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1 + %mask0 = icmp sgt <2 x i64> %x, %y + %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll deleted file mode 100644 index 
34aaf2c..0000000 --- a/test/CodeGen/X86/blend-msb.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s - - -; Verify that we produce movss instead of blendvps when possible. - -;CHECK-LABEL: vsel_float: -;CHECK-NOT: blend -;CHECK: movss -;CHECK: ret -define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2 - ret <4 x float> %vsel -} - -;CHECK-LABEL: vsel_4xi8: -;CHECK-NOT: blend -;CHECK: movss -;CHECK: ret -define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2 - ret <4 x i8> %vsel -} - -;CHECK-LABEL: vsel_8xi16: -; The select mask is -; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false> -; which translates into the boolean mask (big endian representation): -; 00010001 = 17. -; '1' means takes the first argument, '0' means takes the second argument. -; This is the opposite of the intel syntax, thus we expect -; the inverted mask: 11101110 = 238. -; According to the ABI: -; v1 is in xmm0 => first argument is xmm0. -; v2 is in xmm1 => second argument is xmm1. 
-;CHECK: pblendw $238, %xmm1, %xmm0 -;CHECK: ret -define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) { - %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2 - ret <8 x i16> %vsel -} diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll index 2681c10..cc40bcf 100644 --- a/test/CodeGen/X86/block-placement.ll +++ b/test/CodeGen/X86/block-placement.ll @@ -237,44 +237,6 @@ exit: ret i32 %base } -define void @test_loop_rotate_reversed_blocks() { -; This test case (greatly reduced from an Olden bencmark) ensures that the loop -; rotate implementation doesn't assume that loops are laid out in a particular -; order. The first loop will get split into two basic blocks, with the loop -; header coming after the loop latch. -; -; CHECK: test_loop_rotate_reversed_blocks -; CHECK: %entry -; Look for a jump into the middle of the loop, and no branches mid-way. -; CHECK: jmp -; CHECK: %loop1 -; CHECK-NOT: j{{\w*}} .LBB{{.*}} -; CHECK: %loop1 -; CHECK: je - -entry: - %cond1 = load volatile i1* undef - br i1 %cond1, label %loop2.preheader, label %loop1 - -loop1: - call i32 @f() - %cond2 = load volatile i1* undef - br i1 %cond2, label %loop2.preheader, label %loop1 - -loop2.preheader: - call i32 @f() - %cond3 = load volatile i1* undef - br i1 %cond3, label %exit, label %loop2 - -loop2: - call i32 @f() - %cond4 = load volatile i1* undef - br i1 %cond4, label %exit, label %loop2 - -exit: - ret void -} - define i32 @test_loop_align(i32 %i, i32* %a) { ; Check that we provide basic loop body alignment with the block placement ; pass. 
diff --git a/test/CodeGen/X86/byval-callee-cleanup.ll b/test/CodeGen/X86/byval-callee-cleanup.ll new file mode 100644 index 0000000..8e059d4 --- /dev/null +++ b/test/CodeGen/X86/byval-callee-cleanup.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s + +; Previously we would forget to align to stack slot alignment after placing a +; byval argument. Subsequent arguments would align themselves, but if it was +; the last argument, the argument size would not be a multiple of stack slot +; size. This resulted in retl $6 in callee-cleanup functions, as well as subtle +; varargs bugs. + +%struct.Six = type { [6 x i8] } + +define x86_stdcallcc void @f(%struct.Six* byval %a) { + ret void +} +; CHECK-LABEL: _f@8: +; CHECK: retl $8 + +define x86_thiscallcc void @g(i8* %this, %struct.Six* byval %a) { + ret void +} +; CHECK-LABEL: _g: +; CHECK: retl $8 + +define x86_fastcallcc void @h(i32 inreg %x, i32 inreg %y, %struct.Six* byval %a) { + ret void +} +; CHECK-LABEL: @h@16: +; CHECK: retl $8 diff --git a/test/CodeGen/X86/cfi_enforcing.ll b/test/CodeGen/X86/cfi_enforcing.ll new file mode 100644 index 0000000..bcad8c1 --- /dev/null +++ b/test/CodeGen/X86/cfi_enforcing.ll @@ -0,0 +1,34 @@ +; RUN: llc -mtriple=i386-unknown-linux-gnu -fcfi -cfi-enforcing <%s | FileCheck --check-prefix=X86 %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -fcfi -cfi-enforcing <%s | FileCheck --check-prefix=X86-64 %s + +define void @indirect_fun() unnamed_addr jumptable { + ret void +} + +define i32 @m(void ()* %fun) { + call void ()* %fun() +; CHECK: subl +; X86-64: andq $8, +; X86-64: leaq __llvm_jump_instr_table_0_1({{%[a-z0-9]+}}), [[REG:%[a-z0-9]+]] +; X86-64-NOT: callq __llvm_cfi_pointer_warning +; X86-64: callq *[[REG]] +; X86: andl $8, +; X86: leal __llvm_jump_instr_table_0_1({{%[a-z0-9]+}}), [[REG:%[a-z0-9]+]] +; X86-NOT: calll __llvm_cfi_pointer_warning +; X86: calll *[[REG]] + ret i32 0 +} + +define void ()* @get_fun() { + ret void ()* @indirect_fun +} + +define i32 
@main(i32 %argc, i8** %argv) { + %f = call void ()* ()* @get_fun() + %a = call i32 @m(void ()* %f) + ret i32 %a +} + +; CHECK: .align 8 +; CHECK: __llvm_jump_instr_table_0_1: +; CHECK: jmp indirect_fun@PLT diff --git a/test/CodeGen/X86/cfi_invoke.ll b/test/CodeGen/X86/cfi_invoke.ll new file mode 100644 index 0000000..dd0d42a --- /dev/null +++ b/test/CodeGen/X86/cfi_invoke.ll @@ -0,0 +1,35 @@ +; RUN: llc <%s -fcfi -cfi-type=sub | FileCheck %s +; ModuleID = 'test.cc' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare i32 @__gxx_personality_v0(...) + +@_ZTIPKc = external constant i8* +@_ZTIi = external constant i8* + +define void @f() unnamed_addr jumptable { + ret void +} + +@a = global void ()* @f + +; Make sure invoke gets targeted as well as regular calls +define void @_Z3foov(void ()* %f) uwtable ssp { +; CHECK-LABEL: _Z3foov: + entry: + invoke void %f() + to label %try.cont unwind label %lpad +; CHECK: callq __llvm_cfi_pointer_warning +; CHECK: callq *%rbx + + lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* bitcast (i8** @_ZTIi to i8*) + filter [1 x i8*] [i8* bitcast (i8** @_ZTIPKc to i8*)] + ret void + + try.cont: + ret void +} + diff --git a/test/CodeGen/X86/cfi_non_default_function.ll b/test/CodeGen/X86/cfi_non_default_function.ll new file mode 100644 index 0000000..29774a1 --- /dev/null +++ b/test/CodeGen/X86/cfi_non_default_function.ll @@ -0,0 +1,27 @@ +; RUN: llc -fcfi -cfi-func-name=cfi_new_failure <%s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" +define void @indirect_fun() unnamed_addr jumptable { + ret void +} + +define i32 @m(void ()* %fun) { +; CHECK-LABEL: @m + call void ()* %fun() +; CHECK: callq cfi_new_failure + ret i32 0 +} + +define void ()* @get_fun() { + ret void ()* @indirect_fun +} + +define i32 @main(i32 %argc, i8** %argv) { + %f = call void ()* ()* @get_fun() + %a = call i32 @m(void ()* %f) + 
ret i32 %a +} + +; CHECK: .align 8 +; CHECK: __llvm_jump_instr_table_0_1: +; CHECK: jmp indirect_fun@PLT diff --git a/test/CodeGen/X86/cfi_simple_indirect_call.ll b/test/CodeGen/X86/cfi_simple_indirect_call.ll new file mode 100644 index 0000000..0ee118d --- /dev/null +++ b/test/CodeGen/X86/cfi_simple_indirect_call.ll @@ -0,0 +1,43 @@ +; RUN: llc -fcfi -cfi-type=sub <%s | FileCheck --check-prefix=SUB %s +; RUN: llc -fcfi -cfi-type=add <%s | FileCheck --check-prefix=ADD %s +; RUN: llc -fcfi -cfi-type=ror <%s | FileCheck --check-prefix=ROR %s + +target triple = "x86_64-unknown-linux-gnu" + +define void @indirect_fun() unnamed_addr jumptable { + ret void +} + +define i32 @m(void ()* %fun) { + call void ()* %fun() +; SUB: subl +; SUB: andq $8 +; SUB-LABEL: leaq __llvm_jump_instr_table_0_1 +; SUB-LABEL: callq __llvm_cfi_pointer_warning + +; ROR: subq +; ROR: rolq $61 +; ROR: testq +; ROR-LABEL: callq __llvm_cfi_pointer_warning + +; ADD: andq $8 +; ADD-LABEL: leaq __llvm_jump_instr_table_0_1 +; ADD: cmpq +; ADD-LABEL: callq __llvm_cfi_pointer_warning +ret i32 0 +} + +define void ()* @get_fun() { + ret void ()* @indirect_fun +} + +define i32 @main(i32 %argc, i8** %argv) { + %f = call void ()* ()* @get_fun() + %a = call i32 @m(void ()* %f) + ret i32 %a +} +; SUB: .text +; SUB: .align 8 +; SUB-LABEL: .type __llvm_jump_instr_table_0_1,@function +; SUB-LABEL:__llvm_jump_instr_table_0_1: +; SUB-LABEL: jmp indirect_fun@PLT diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll new file mode 100644 index 0000000..3cb8b97 --- /dev/null +++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll @@ -0,0 +1,86 @@ +; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s +; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s + +declare i32 @bar() + +define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) { +; CHECK-LABEL: test_intervening_call: +; CHECK: cmpxchg +; CHECK: pushfq +; CHECK: popq [[FLAGS:%.*]] + +; 
CHECK: callq bar + +; CHECK: pushq [[FLAGS]] +; CHECK: popfq +; CHECK: jne + %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst + %p = extractvalue { i64, i1 } %cx, 1 + call i32 @bar() + br i1 %p, label %t, label %f + +t: + ret i64 42 + +f: + ret i64 0 +} + +; Interesting in producing a clobber without any function calls. +define i32 @test_control_flow(i32* %p, i32 %i, i32 %j) { +; CHECK-LABEL: test_control_flow: + +; CHECK: cmpxchg +; CHECK-NEXT: jne +entry: + %cmp = icmp sgt i32 %i, %j + br i1 %cmp, label %loop_start, label %cond.end + +loop_start: + br label %while.condthread-pre-split.i + +while.condthread-pre-split.i: + %.pr.i = load i32* %p, align 4 + br label %while.cond.i + +while.cond.i: + %0 = phi i32 [ %.pr.i, %while.condthread-pre-split.i ], [ 0, %while.cond.i ] + %tobool.i = icmp eq i32 %0, 0 + br i1 %tobool.i, label %while.cond.i, label %while.body.i + +while.body.i: + %.lcssa = phi i32 [ %0, %while.cond.i ] + %1 = cmpxchg i32* %p, i32 %.lcssa, i32 %.lcssa seq_cst seq_cst + %2 = extractvalue { i32, i1 } %1, 1 + br i1 %2, label %cond.end.loopexit, label %while.condthread-pre-split.i + +cond.end.loopexit: + br label %cond.end + +cond.end: + %cond = phi i32 [ %i, %entry ], [ 0, %cond.end.loopexit ] + ret i32 %cond +} + +; This one is an interesting case because CMOV doesn't have a chain +; operand. Naive attempts to limit cmpxchg EFLAGS use are likely to fail here. 
+define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) { +; CHECK-LABEL: test_feed_cmov: + +; CHECK: cmpxchg +; CHECK: pushfq +; CHECK: popq [[FLAGS:%.*]] + +; CHECK: callq bar + +; CHECK: pushq [[FLAGS]] +; CHECK: popfq + + %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst + %success = extractvalue { i32, i1 } %res, 1 + + %rhs = call i32 @bar() + + %ret = select i1 %success, i32 %new, i32 %rhs + ret i32 %ret +} diff --git a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll index 78e1dd2..85bfff2 100644 --- a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll +++ b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll @@ -1,7 +1,7 @@ ; RUN: opt -S -codegenprepare %s -o - | FileCheck %s ; RUN: opt -S -codegenprepare -addr-sink-using-gep=1 %s -o - | FileCheck -check-prefix=CHECK-GEP %s ; This file tests the different cases what are involved when codegen prepare -; tries to get sign extension out of the way of addressing mode. +; tries to get sign/zero extension out of the way of addressing mode. ; This tests require an actual target as addressing mode decisions depends ; on the target. @@ -67,6 +67,43 @@ define i8 @oneArgPromotion(i32 %arg1, i8* %base) { ret i8 %res } +; Check that we are able to merge a sign extension with a zero extension. +; CHECK-LABEL: @oneArgPromotionZExt +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1ZEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionZExt(i8 %arg1, i8* %base) { + %zext = zext i8 %arg1 to i32 + %add = add nsw i32 %zext, 1 + %sextadd = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd + %res = load i8* %arrayidx + ret i8 %res +} + +; When promoting a constant zext, the IR builder returns a constant, +; not an instruction. Make sure this is properly handled. This used +; to crash. 
+; Note: The constant zext is promoted, but does not help matching +; more thing in the addressing mode. Therefore the modification is +; rolled back. +; Still, this test case exercises the desired code path. +; CHECK-LABEL: @oneArgPromotionCstZExt +; CHECK: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i16 undef to i32 +; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXT]] to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionCstZExt(i8* %base) { + %cst = zext i16 undef to i32 + %add = add nsw i32 %cst, 1 + %sextadd = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd + %res = load i8* %arrayidx + ret i8 %res +} + ; Check that we do not promote truncate when we cannot determine the ; bits that are dropped. ; CHECK-LABEL: @oneArgPromotionBlockTrunc1 @@ -321,3 +358,177 @@ end: %final = load i32* %addr ret i32 %final } + +%struct.dns_packet = type { i32, i32, %union.anon } +%union.anon = type { i32 } + +@a = common global i32 0, align 4 +@b = common global i16 0, align 2 + +; We used to crash on this function because we did not return the right +; promoted instruction for %conv.i. +; Make sure we generate the right code now. +; CHECK-LABEL: @fn3 +; %conv.i is used twice and only one of its use is being promoted. +; Use it at the starting point for the matching. 
+; CHECK: %conv.i = zext i16 [[PLAIN_OPND:%[.a-zA-Z_0-9-]+]] to i32 +; CHECK-NEXT: [[PROMOTED_CONV:%[.a-zA-Z_0-9-]+]] = zext i16 [[PLAIN_OPND]] to i64 +; CHECK-NEXT: [[BASE:%[a-zA-Z_0-9-]+]] = ptrtoint %struct.dns_packet* %P to i64 +; CHECK-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add i64 [[BASE]], [[PROMOTED_CONV]] +; CHECK-NEXT: [[ADDR:%[a-zA-Z_0-9-]+]] = add i64 [[ADD]], 7 +; CHECK-NEXT: [[CAST:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[ADDR]] to i8* +; CHECK-NEXT: load i8* [[CAST]], align 1 +define signext i16 @fn3(%struct.dns_packet* nocapture readonly %P) { +entry: + %tmp = getelementptr inbounds %struct.dns_packet* %P, i64 0, i32 2 + %data.i.i = bitcast %union.anon* %tmp to [0 x i8]* + br label %while.body.i.i + +while.body.i.i: ; preds = %while.body.i.i, %entry + %src.addr.0.i.i = phi i16 [ 0, %entry ], [ %inc.i.i, %while.body.i.i ] + %inc.i.i = add i16 %src.addr.0.i.i, 1 + %idxprom.i.i = sext i16 %src.addr.0.i.i to i64 + %arrayidx.i.i = getelementptr inbounds [0 x i8]* %data.i.i, i64 0, i64 %idxprom.i.i + %tmp1 = load i8* %arrayidx.i.i, align 1 + %conv2.i.i = zext i8 %tmp1 to i32 + %and.i.i = and i32 %conv2.i.i, 15 + store i32 %and.i.i, i32* @a, align 4 + %tobool.i.i = icmp eq i32 %and.i.i, 0 + br i1 %tobool.i.i, label %while.body.i.i, label %fn1.exit.i + +fn1.exit.i: ; preds = %while.body.i.i + %inc.i.i.lcssa = phi i16 [ %inc.i.i, %while.body.i.i ] + %conv.i = zext i16 %inc.i.i.lcssa to i32 + %sub.i = add nsw i32 %conv.i, -1 + %idxprom.i = sext i32 %sub.i to i64 + %arrayidx.i = getelementptr inbounds [0 x i8]* %data.i.i, i64 0, i64 %idxprom.i + %tmp2 = load i8* %arrayidx.i, align 1 + %conv2.i = sext i8 %tmp2 to i16 + store i16 %conv2.i, i16* @b, align 2 + %sub4.i = sub nsw i32 0, %conv.i + %conv5.i = zext i16 %conv2.i to i32 + %cmp.i = icmp sgt i32 %conv5.i, %sub4.i + br i1 %cmp.i, label %if.then.i, label %fn2.exit + +if.then.i: ; preds = %fn1.exit.i + %end.i = getelementptr inbounds %struct.dns_packet* %P, i64 0, i32 1 + %tmp3 = load i32* %end.i, align 4 + %sub7.i = add 
i32 %tmp3, 65535 + %conv8.i = trunc i32 %sub7.i to i16 + br label %fn2.exit + +fn2.exit: ; preds = %if.then.i, %fn1.exit.i + %retval.0.i = phi i16 [ %conv8.i, %if.then.i ], [ undef, %fn1.exit.i ] + ret i16 %retval.0.i +} + +; Check that we do not promote an extension if the non-wrapping flag does not +; match the kind of the extension. +; CHECK-LABEL: @noPromotionFlag +; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = zext i32 [[ADD]] to i64 +; CHECK: inttoptr i64 [[PROMOTED]] to i8* +; CHECK: ret +define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) { + %add = add nsw i32 %arg1, %arg2 + %zextadd = zext i32 %add to i64 + %base = inttoptr i64 %zextadd to i8* + %res = load i8* %base + ret i8 %res +} + +; Check that we correctly promote both operands of the promotable add with zext. +; CHECK-LABEL: @twoArgsPromotionZExt +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg1 to i64 +; CHECK: [[ARG2ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg2 to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], [[ARG2ZEXT]] +; CHECK: inttoptr i64 [[PROMOTED]] to i8* +; CHECK: ret +define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) { + %add = add nuw i32 %arg1, %arg2 + %zextadd = zext i32 %add to i64 + %base = inttoptr i64 %zextadd to i8* + %res = load i8* %base + ret i8 %res +} + +; Check that we correctly promote constant arguments. +; CHECK-LABEL: @oneArgPromotionNegativeCstZExt +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 255 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, i8* %base) { + %add = add nuw i8 %arg1, -1 + %zextadd = zext i8 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd + %res = load i8* %arrayidx + ret i8 %res +} + +; Check that we are able to merge two zero extensions. 
+; CHECK-LABEL: @oneArgPromotionZExtZExt +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionZExtZExt(i8 %arg1, i8* %base) { + %zext = zext i8 %arg1 to i32 + %add = add nuw i32 %zext, 1 + %zextadd = zext i32 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd + %res = load i8* %arrayidx + ret i8 %res +} + +; Check that we do not promote truncate when the dropped bits +; are of a different kind. +; CHECK-LABEL: @oneArgPromotionBlockTruncZExt +; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32 +; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8 +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1TRUNC]] to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, i8* %base) { + %sextarg1 = sext i1 %arg1 to i32 + %trunc = trunc i32 %sextarg1 to i8 + %add = add nuw i8 %trunc, 1 + %zextadd = zext i8 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd + %res = load i8* %arrayidx + ret i8 %res +} + +; Check that we are able to promote truncate when we know all the bits +; that are dropped. 
+; CHECK-LABEL: @oneArgPromotionPassTruncZExt +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i1 %arg1 to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, i8* %base) { + %sextarg1 = zext i1 %arg1 to i32 + %trunc = trunc i32 %sextarg1 to i8 + %add = add nuw i8 %trunc, 1 + %zextadd = zext i8 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd + %res = load i8* %arrayidx + ret i8 %res +} + +; Check that we do not promote sext with zext. +; CHECK-LABEL: @oneArgPromotionBlockSExtZExt +; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i8 +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1SEXT]] to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionBlockSExtZExt(i1 %arg1, i8* %base) { + %sextarg1 = sext i1 %arg1 to i8 + %add = add nuw i8 %sextarg1, 1 + %zextadd = zext i8 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd + %res = load i8* %arrayidx + ret i8 %res +} diff --git a/test/CodeGen/X86/coff-comdat.ll b/test/CodeGen/X86/coff-comdat.ll index bf27b2f..ac4546d 100644 --- a/test/CodeGen/X86/coff-comdat.ll +++ b/test/CodeGen/X86/coff-comdat.ll @@ -73,19 +73,19 @@ $vftable = comdat largest ; CHECK: .globl @v8@0 ; CHECK: .section .text,"xr",discard,@f8@0 ; CHECK: .globl @f8@0 -; CHECK: .section .bss,"bw",associative,_f1 +; CHECK: .section .bss,"wb",associative,_f1 ; CHECK: .globl _v1 -; CHECK: .section .bss,"bw",associative,_f2 +; CHECK: .section .bss,"wb",associative,_f2 ; CHECK: .globl _v2 -; CHECK: .section .bss,"bw",associative,_f3 +; CHECK: .section .bss,"wb",associative,_f3 ; CHECK: .globl _v3 -; CHECK: .section .bss,"bw",associative,_f4 +; CHECK: .section .bss,"wb",associative,_f4 ; CHECK: .globl _v4 -; CHECK: .section .bss,"bw",associative,_f5 
+; CHECK: .section .bss,"wb",associative,_f5 ; CHECK: .globl _v5 -; CHECK: .section .bss,"bw",associative,_f6 +; CHECK: .section .bss,"wb",associative,_f6 ; CHECK: .globl _v6 -; CHECK: .section .bss,"bw",same_size,_f6 +; CHECK: .section .bss,"wb",same_size,_f6 ; CHECK: .globl _f6 ; CHECK: .section .rdata,"rd",largest,_vftable ; CHECK: .globl _vftable diff --git a/test/CodeGen/X86/coff-comdat2.ll b/test/CodeGen/X86/coff-comdat2.ll index 6744b5b..58bc04e 100644 --- a/test/CodeGen/X86/coff-comdat2.ll +++ b/test/CodeGen/X86/coff-comdat2.ll @@ -6,4 +6,4 @@ target triple = "i686-pc-windows-msvc" $foo = comdat largest @foo = global i32 0 @bar = global i32 0, comdat $foo -; CHECK: Associative COMDAT symbol 'foo' is not a key for it's COMDAT. +; CHECK: Associative COMDAT symbol 'foo' is not a key for its COMDAT. diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll new file mode 100644 index 0000000..59a7a19 --- /dev/null +++ b/test/CodeGen/X86/combine-and.ll @@ -0,0 +1,164 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s +; +; Verify that the DAGCombiner is able to fold a vector AND into a blend +; if one of the operands to the AND is a vector of all constants, and each +; constant element is either zero or all-ones. 
+ + +define <4 x i32> @test1(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test1 +; CHECK: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: retq + + +define <4 x i32> @test2(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test2 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test3(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test3 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test4(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 0, i32 0, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test4 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test5(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test5 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test6(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test6 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test7(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test7 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test8(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test8 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test9(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test9 +; 
CHECK: movq %xmm0, %xmm0 +; CHECK-NEXT: retq + + +define <4 x i32> @test10(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test10 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test11(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test11 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test12(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 -1, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test12 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test13(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test13 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test14(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test14 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1> + %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 0> + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} +; CHECK-LABEL: test15 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0> + %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 -1> + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} +; CHECK-LABEL: test16 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) { + %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1> + 
%2 = and <4 x i32> %B, <i32 -1, i32 0, i32 -1, i32 0> + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} +; CHECK-LABEL: test17 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll index ff807b9..9539eae 100644 --- a/test/CodeGen/X86/combine-or.ll +++ b/test/CodeGen/X86/combine-or.ll @@ -5,265 +5,293 @@ ; instruction which performs a blend operation. define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test1: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1> %or = or <2 x i64> %shuf1, %shuf2 ret <2 x i64> %or } -; CHECK-LABEL: test1 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK-NOT: orps -; CHECK: ret define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test2: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test2 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK: ret define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test3: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2> %or = or <2 x i64> %shuf1, %shuf2 ret <2 x i64> %or } -; CHECK-LABEL: test3 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK-NEXT: ret define <4 x i32> @test4(<4 
x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test4: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test4 -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NOT: orps -; CHECK: ret define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test5: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test5 -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NEXT: ret define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test6: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test6 -; CHECK-NOT: xorps -; CHECK: blendps $12 -; CHECK-NEXT: ret define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test7: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-NEXT: retq %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0> %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1> %or = or <4 x i32> %and1, %and2 ret <4 x i32> %or } -; CHECK-LABEL: test7 -; CHECK-NOT: xorps -; CHECK: blendps $12 -; CHECK-NEXT: ret define <2 x i64> 
@test8(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test8: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-NEXT: retq %and1 = and <2 x i64> %a, <i64 -1, i64 0> %and2 = and <2 x i64> %b, <i64 0, i64 -1> %or = or <2 x i64> %and1, %and2 ret <2 x i64> %or } -; CHECK-LABEL: test8 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK-NOT: orps -; CHECK: ret define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test9: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1> %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0> %or = or <4 x i32> %and1, %and2 ret <4 x i32> %or } -; CHECK-LABEL: test9 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK: ret define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test10: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq %and1 = and <2 x i64> %a, <i64 0, i64 -1> %and2 = and <2 x i64> %b, <i64 -1, i64 0> %or = or <2 x i64> %and1, %and2 ret <2 x i64> %or } -; CHECK-LABEL: test10 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK-NEXT: ret define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test11: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; CHECK-NEXT: retq %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0> %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1> %or = or <4 x i32> %and1, %and2 ret <4 x i32> %or } -; CHECK-LABEL: test11 -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NOT: orps -; CHECK: ret define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test12: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; CHECK-NEXT: retq %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1> %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0> %or = or <4 x i32> %and1, %and2 ret <4 x i32> %or 
} -; CHECK-LABEL: test12 -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NEXT: ret ; Verify that the following test cases are folded into single shuffles. define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test13: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test13 -; CHECK-NOT: xorps -; CHECK: shufps -; CHECK-NEXT: ret define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test14: +; CHECK: # BB#0: +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0> %or = or <2 x i64> %shuf1, %shuf2 ret <2 x i64> %or } -; CHECK-LABEL: test14 -; CHECK-NOT: pslldq -; CHECK-NOT: por -; CHECK: punpcklqdq -; CHECK-NEXT: ret define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test15: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test15 -; CHECK-NOT: xorps -; CHECK: shufps -; CHECK-NOT: shufps -; CHECK-NOT: orps -; CHECK: ret define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test16: +; CHECK: # BB#0: +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, 
<2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2> %or = or <2 x i64> %shuf1, %shuf2 ret <2 x i64> %or } -; CHECK-LABEL: test16 -; CHECK-NOT: pslldq -; CHECK-NOT: por -; CHECK: punpcklqdq -; CHECK: ret ; Verify that the dag-combiner does not fold a OR of two shuffles into a single ; shuffle instruction when the shuffle indexes are not compatible. define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test17: +; CHECK: # BB#0: +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0] +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; CHECK-NEXT: orps %xmm1, %xmm2 +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test17 -; CHECK: por -; CHECK-NEXT: ret define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test18: +; CHECK: # BB#0: +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test18 -; CHECK: orps -; CHECK: ret define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test19: +; CHECK: # BB#0: +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} 
xmm3 = xmm3[0,0],xmm0[0,3] +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,2] +; CHECK-NEXT: orps %xmm3, %xmm2 +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test19 -; CHECK: por -; CHECK-NEXT: ret define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test20: +; CHECK: # BB#0: +; CHECK-NEXT: orps %xmm1, %xmm0 +; CHECK-NEXT: movq %xmm0, %xmm0 +; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2> %or = or <2 x i64> %shuf1, %shuf2 ret <2 x i64> %or } -; CHECK-LABEL: test20 -; CHECK-NOT: xorps -; CHECK: orps -; CHECK-NEXT: movq -; CHECK-NEXT: ret define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test21: +; CHECK: # BB#0: +; CHECK-NEXT: orps %xmm1, %xmm0 +; CHECK-NEXT: movq %xmm0, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0> %or = or <2 x i64> %shuf1, %shuf2 ret <2 x i64> %or } -; CHECK-LABEL: test21 -; CHECK: por -; CHECK-NEXT: pslldq -; CHECK-NEXT: ret +; Verify that the DAGCombiner doesn't crash in the attempt to check if a shuffle +; with illegal type has a legal mask. Method 'isShuffleMaskLegal' only knows how to +; handle legal vector value types. 
+define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) { +; CHECK-LABEL: test_crash: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq + %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3> + %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4> + %or = or <4 x i8> %shuf1, %shuf2 + ret <4 x i8> %or +} diff --git a/test/CodeGen/X86/combine-vec-shuffle-2.ll b/test/CodeGen/X86/combine-vec-shuffle-2.ll deleted file mode 100644 index 7ab7f80..0000000 --- a/test/CodeGen/X86/combine-vec-shuffle-2.ll +++ /dev/null @@ -1,164 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s - -; Check that DAGCombiner correctly folds the following pairs of shuffles -; using the following rules: -; 1. shuffle(shuffle(x, y), undef) -> x -; 2. shuffle(shuffle(x, y), undef) -> y -; 3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef) -; 4. shuffle(shuffle(x, y), undef) -> shuffle(undef, y) -; -; Rules 3. and 4. are used only if the resulting shuffle mask is legal. 
- -define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test1 -; Mask: [3,0,0,1] -; CHECK: pshufd $67 -; CHECK-NEXT: ret - - -define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test2 -; Mask: [2,0,0,3] -; CHECK: pshufd $-62 -; CHECK-NEXT: ret - - -define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test3 -; Mask: [2,0,0,3] -; CHECK: pshufd $-62 -; CHECK-NEXT: ret - - -define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test4 -; Mask: [0,0,0,1] -; CHECK: pshufd $64 -; CHECK-NEXT: ret - - -define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test5 -; Mask: [1,1] -; CHECK: movhlps -; CHECK-NEXT: ret - - -define <4 x i32> @test6(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4> - ret <4 x i32> %2 -} -; CHECK-LABEL: test6 -; Mask: [2,0,0,0] 
-; CHECK: pshufd $2 -; CHECK-NEXT: ret - - -define <4 x i32> @test7(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> - ret <4 x i32> %2 -} -; CHECK-LABEL: test7 -; Mask: [0,2,0,2] -; CHECK: pshufd $-120 -; CHECK-NEXT: ret - - -define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4> - ret <4 x i32> %2 -} -; CHECK-LABEL: test8 -; Mask: [1,0,3,0] -; CHECK: pshufd $49 -; CHECK-NEXT: ret - - -define <4 x i32> @test9(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2> - ret <4 x i32> %2 -} -; CHECK-LABEL: test9 -; Mask: [1,3,0,2] -; CHECK: pshufd $-115 -; CHECK-NEXT: ret - - -define <4 x i32> @test10(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4> - ret <4 x i32> %2 -} -; CHECK-LABEL: test10 -; Mask: [1,0,1,0] -; CHECK: pshufd $17 -; CHECK-NEXT: ret - - -define <4 x i32> @test11(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0> - ret <4 x i32> %2 -} -; CHECK-LABEL: test11 -; Mask: [1,0,2,1] -; CHECK: pshufd $97 -; CHECK-NEXT: ret - - -define <4 x i32> @test12(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4> - ret <4 
x i32> %2 -} -; CHECK-LABEL: test12 -; Mask: [0,0,0,0] -; CHECK: pshufd $0 -; CHECK-NEXT: ret - - -; The following pair of shuffles is folded into vector %A. -define <4 x i32> @test13(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4> - ret <4 x i32> %2 -} -; CHECK-LABEL: test13 -; CHECK-NOT: pshufd -; CHECK: ret - - -; The following pair of shuffles is folded into vector %B. -define <4 x i32> @test14(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4> - ret <4 x i32> %2 -} -; CHECK-LABEL: test14 -; CHECK-NOT: pshufd -; CHECK: ret - diff --git a/test/CodeGen/X86/combine-vec-shuffle.ll b/test/CodeGen/X86/combine-vec-shuffle.ll deleted file mode 100644 index 9e6ab89..0000000 --- a/test/CodeGen/X86/combine-vec-shuffle.ll +++ /dev/null @@ -1,253 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s - -; Verify that the DAGCombiner correctly folds according to the following rules: - -; fold (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C) -; fold (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C) -; fold (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0) - -; fold (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B)) -; fold (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B)) -; fold (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B)) - - - -define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test1 -; CHECK-NOT: pshufd -; CHECK: 
pand -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test2 -; CHECK-NOT: pshufd -; CHECK: por -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -define <4 x i32> @test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %xor = xor <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %xor -} -; CHECK-LABEL: test3 -; CHECK-NOT: pshufd -; CHECK: pxor -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test4 -; CHECK-NOT: pshufd -; CHECK: pand -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test5 -; CHECK-NOT: pshufd -; CHECK: por -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %xor = xor <4 x i32> %shuf1, %shuf2 - 
ret <4 x i32> %xor -} -; CHECK-LABEL: test6 -; CHECK-NOT: pshufd -; CHECK: pxor -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles -; are not performing a swizzle operations. - -define <4 x i32> @test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test1b -; CHECK-NOT: blendps -; CHECK: andps -; CHECK-NEXT: blendps -; CHECK-NEXT: ret - - -define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test2b -; CHECK-NOT: blendps -; CHECK: orps -; CHECK-NEXT: blendps -; CHECK-NEXT: ret - - -define <4 x i32> @test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %xor = xor <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %xor -} -; CHECK-LABEL: test3b -; CHECK-NOT: blendps -; CHECK: xorps -; CHECK-NEXT: xorps -; CHECK-NEXT: blendps -; CHECK-NEXT: ret - - -define <4 x i32> @test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test4b -; CHECK-NOT: blendps -; CHECK: andps -; CHECK-NEXT: blendps -; CHECK: ret - - -define <4 x i32> @test5b(<4 x i32> %a, <4 x i32> 
%b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test5b -; CHECK-NOT: blendps -; CHECK: orps -; CHECK-NEXT: blendps -; CHECK: ret - - -define <4 x i32> @test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %xor = xor <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %xor -} -; CHECK-LABEL: test6b -; CHECK-NOT: blendps -; CHECK: xorps -; CHECK-NEXT: xorps -; CHECK-NEXT: blendps -; CHECK: ret - -define <4 x i32> @test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test1c -; CHECK-NOT: shufps -; CHECK: andps -; CHECK-NEXT: shufps -; CHECK-NEXT: ret - - -define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test2c -; CHECK-NOT: shufps -; CHECK: orps -; CHECK-NEXT: shufps -; CHECK-NEXT: ret - - -define <4 x i32> @test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %xor = xor <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %xor -} -; CHECK-LABEL: test3c -; CHECK-NOT: shufps -; CHECK: 
xorps -; CHECK-NEXT: xorps -; CHECK-NEXT: shufps -; CHECK-NEXT: ret - - -define <4 x i32> @test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test4c -; CHECK-NOT: shufps -; CHECK: andps -; CHECK-NEXT: shufps -; CHECK: ret - - -define <4 x i32> @test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test5c -; CHECK-NOT: shufps -; CHECK: orps -; CHECK-NEXT: shufps -; CHECK: ret - - -define <4 x i32> @test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %xor = xor <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %xor -} -; CHECK-LABEL: test6c -; CHECK-NOT: shufps -; CHECK: xorps -; CHECK-NEXT: xorps -; CHECK-NEXT: shufps -; CHECK: ret - diff --git a/test/CodeGen/X86/commute-blend-avx2.ll b/test/CodeGen/X86/commute-blend-avx2.ll new file mode 100644 index 0000000..d06c6da --- /dev/null +++ b/test/CodeGen/X86/commute-blend-avx2.ll @@ -0,0 +1,89 @@ +; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=avx2 < %s | FileCheck %s + +define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 { + %1 = load <8 x i16>* %b + %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17) + ret <8 x i16> %2 + + ;LABEL: commute_fold_vpblendw_128 + ;CHECK: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] + ;CHECK-NEXT: retq +} +declare <8 x i16> 
@llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone + +define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 { + %1 = load <16 x i16>* %b + %2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %1, <16 x i16> %a, i8 17) + ret <16 x i16> %2 + + ;LABEL: commute_fold_vpblendw_256 + ;CHECK: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15] + ;CHECK-NEXT: retq +} +declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone + +define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 { + %1 = load <4 x i32>* %b + %2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1) + ret <4 x i32> %2 + + ;LABEL: commute_fold_vpblendd_128 + ;CHECK: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] + ;CHECK-NEXT: retq +} +declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone + +define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 { + %1 = load <8 x i32>* %b + %2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129) + ret <8 x i32> %2 + + ;LABEL: commute_fold_vpblendd_256 + ;CHECK: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7] + ;CHECK-NEXT: retq +} +declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone + +define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 { + %1 = load <4 x float>* %b + %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3) + ret <4 x float> %2 + + ;LABEL: commute_fold_vblendps_128 + ;CHECK: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] + ;CHECK-NEXT: retq +} +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone + +define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 { + %1 = load <8 x float>* %b + %2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x 
float> %1, <8 x float> %a, i8 7) + ret <8 x float> %2 + + ;LABEL: commute_fold_vblendps_256 + ;CHECK: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7] + ;CHECK-NEXT: retq +} +declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone + +define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 { + %1 = load <2 x double>* %b + %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1) + ret <2 x double> %2 + + ;LABEL: commute_fold_vblendpd_128 + ;CHECK: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] + ;CHECK-NEXT: retq +} +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone + +define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 { + %1 = load <4 x double>* %b + %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7) + ret <4 x double> %2 + + ;LABEL: commute_fold_vblendpd_256 + ;CHECK: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3] + ;CHECK-NEXT: retq +} +declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone diff --git a/test/CodeGen/X86/commute-blend-sse41.ll b/test/CodeGen/X86/commute-blend-sse41.ll new file mode 100644 index 0000000..59fef8c --- /dev/null +++ b/test/CodeGen/X86/commute-blend-sse41.ll @@ -0,0 +1,34 @@ +; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=corei7 < %s | FileCheck %s + +define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 { + %1 = load <8 x i16>* %b + %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17) + ret <8 x i16> %2 + + ;LABEL: commute_fold_pblendw + ;CHECK: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] + ;CHECK-NEXT: retq +} +declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone + +define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 { + %1 = load <4 x float>* %b + %2 = call <4 x float> 
@llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3) + ret <4 x float> %2 + + ;LABEL: commute_fold_blendps + ;CHECK: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] + ;CHECK-NEXT: retq +} +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone + +define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 { + %1 = load <2 x double>* %b + %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1) + ret <2 x double> %2 + + ;LABEL: commute_fold_vblendpd + ;CHECK: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] + ;CHECK-NEXT: retq +} +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone diff --git a/test/CodeGen/X86/commuted-blend-mask.ll b/test/CodeGen/X86/commuted-blend-mask.ll new file mode 100644 index 0000000..e6322cb --- /dev/null +++ b/test/CodeGen/X86/commuted-blend-mask.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s + +; When commuting the operands of a SSE blend, make sure that the resulting blend +; mask can be encoded as a imm8. 
+; Before, when commuting the operands to the shuffle in function @test, the backend +; produced the following assembly: +; pblendw $4294967103, %xmm1, %xmm0 + +define <4 x i32> @test(<4 x i32> %a, <4 x i32> %b) { + ;CHECK: pblendw $63, %xmm1, %xmm0 + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3> + ret <4 x i32> %shuffle +} diff --git a/test/CodeGen/X86/constant-pool-remat-0.ll b/test/CodeGen/X86/constant-pool-remat-0.ll index 4a01108..e42a87c 100644 --- a/test/CodeGen/X86/constant-pool-remat-0.ll +++ b/test/CodeGen/X86/constant-pool-remat-0.ll @@ -1,7 +1,7 @@ ; REQUIRES: asserts ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-linux -regalloc=greedy | FileCheck %s -; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-linux -mattr=+sse2 | FileCheck %s ; CHECK: LCPI ; CHECK: LCPI ; CHECK: LCPI diff --git a/test/CodeGen/X86/constant-pool-sharing.ll b/test/CodeGen/X86/constant-pool-sharing.ll index 26318dd..3682165 100644 --- a/test/CodeGen/X86/constant-pool-sharing.ll +++ b/test/CodeGen/X86/constant-pool-sharing.ll @@ -1,12 +1,13 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s --check-prefix=COMMON --check-prefix=LINUX +; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s --check-prefix=COMMON --check-prefix=MSVC ; llc should share constant pool entries between this integer vector ; and this floating-point vector since they have the same encoding. 
-; CHECK: LCPI0_0(%rip), %xmm0 -; CHECK: movaps %xmm0, ({{%rdi|%rcx}}) -; CHECK: movaps %xmm0, ({{%rsi|%rdx}}) +; LINUX: LCPI0_0(%rip), %xmm0 +; MSVC: __xmm@40000000400000004000000040000000(%rip), %xmm0 +; COMMON: movaps %xmm0, ({{%rdi|%rcx}}) +; COMMON: movaps %xmm0, ({{%rsi|%rdx}}) define void @foo(<4 x i32>* %p, <4 x float>* %q, i1 %t) nounwind { entry: diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll index b578896..7160dcc 100644 --- a/test/CodeGen/X86/constructor.ll +++ b/test/CodeGen/X86/constructor.ll @@ -1,6 +1,8 @@ -; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=CTOR %s -; RUN: llc -mtriple x86_64-pc-linux -use-init-array < %s | FileCheck --check-prefix=INIT-ARRAY %s -@llvm.global_ctors = appending global [2 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @f }, { i32, void ()* } { i32 15, void ()* @g }] +; RUN: llc -mtriple x86_64-pc-linux -use-ctors < %s | FileCheck --check-prefix=CTOR %s +; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=INIT-ARRAY %s +@llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}, { i32, void ()*, i8* } { i32 15, void ()* @g, i8* @v }] + +@v = weak_odr global i8 0 define void @f() { entry: @@ -12,14 +14,14 @@ entry: ret void } -; CTOR: .section .ctors.65520,"aw",@progbits +; CTOR: .section .ctors.65520,"aGw",@progbits,v,comdat ; CTOR-NEXT: .align 8 ; CTOR-NEXT: .quad g ; CTOR-NEXT: .section .ctors,"aw",@progbits ; CTOR-NEXT: .align 8 ; CTOR-NEXT: .quad f -; INIT-ARRAY: .section .init_array.15,"aw",@init_array +; INIT-ARRAY: .section .init_array.15,"aGw",@init_array,v,comdat ; INIT-ARRAY-NEXT: .align 8 ; INIT-ARRAY-NEXT: .quad g ; INIT-ARRAY-NEXT: .section .init_array,"aw",@init_array diff --git a/test/CodeGen/X86/cvt16.ll b/test/CodeGen/X86/cvt16.ll index 951b5c3..4d920e2 100644 --- a/test/CodeGen/X86/cvt16.ll +++ b/test/CodeGen/X86/cvt16.ll @@ -21,7 +21,7 @@ define void 
@test1(float %src, i16* %dest) { - %1 = tail call i16 @llvm.convert.to.fp16(float %src) + %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src) store i16 %1, i16* %dest, align 2 ret void } @@ -34,7 +34,7 @@ define void @test1(float %src, i16* %dest) { define float @test2(i16* nocapture %src) { %1 = load i16* %src, align 2 - %2 = tail call float @llvm.convert.from.fp16(i16 %1) + %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1) ret float %2 } ; CHECK-LABEL: test2: @@ -45,8 +45,8 @@ define float @test2(i16* nocapture %src) { define float @test3(float %src) nounwind uwtable readnone { - %1 = tail call i16 @llvm.convert.to.fp16(float %src) - %2 = tail call float @llvm.convert.from.fp16(i16 %1) + %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src) + %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1) ret float %2 } @@ -59,6 +59,31 @@ define float @test3(float %src) nounwind uwtable readnone { ; F16C-NEXT: vcvtph2ps ; F16C: ret -declare float @llvm.convert.from.fp16(i16) nounwind readnone -declare i16 @llvm.convert.to.fp16(float) nounwind readnone +define double @test4(i16* nocapture %src) { + %1 = load i16* %src, align 2 + %2 = tail call double @llvm.convert.from.fp16.f64(i16 %1) + ret double %2 +} +; CHECK-LABEL: test4: +; LIBCALL: callq __gnu_h2f_ieee +; LIBCALL: cvtss2sd +; SOFTFLOAT: callq __gnu_h2f_ieee +; SOFTFLOAT: callq __extendsfdf2 +; F16C: vcvtph2ps +; F16C: vcvtss2sd +; F16C: ret + + +define i16 @test5(double %src) { + %val = tail call i16 @llvm.convert.to.fp16.f64(double %src) + ret i16 %val +} +; CHECK-LABEL: test5: +; LIBCALL: jmp __truncdfhf2 +; SOFTFLOAT: callq __truncdfhf2 +; F16C: jmp __truncdfhf2 +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone +declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone +declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll 
b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll index 4912213..d0791dc 100644 --- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll +++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll @@ -52,48 +52,48 @@ define void @_Z3barii(i32 %param1, i32 %param2) #0 { entry: %var1 = alloca %struct.AAA3, align 1 %var2 = alloca %struct.AAA3, align 1 - tail call void @llvm.dbg.value(metadata !{i32 %param1}, i64 0, metadata !30), !dbg !47 - tail call void @llvm.dbg.value(metadata !{i32 %param2}, i64 0, metadata !31), !dbg !47 - tail call void @llvm.dbg.value(metadata !48, i64 0, metadata !32), !dbg !49 + tail call void @llvm.dbg.value(metadata !{i32 %param1}, i64 0, metadata !30, metadata !{metadata !"0x102"}), !dbg !47 + tail call void @llvm.dbg.value(metadata !{i32 %param2}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !47 + tail call void @llvm.dbg.value(metadata !48, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !49 %tobool = icmp eq i32 %param2, 0, !dbg !50 br i1 %tobool, label %if.end, label %if.then, !dbg !50 if.then: ; preds = %entry %call = tail call i8* @_Z5i2stri(i32 %param2), !dbg !52 - tail call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !32), !dbg !49 + tail call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !49 br label %if.end, !dbg !54 if.end: ; preds = %entry, %if.then - tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33), !dbg !55 - tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !56), !dbg !57 - tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59), !dbg !60 + tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33, metadata !{metadata !"0x102"}), !dbg !55 + tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !56, metadata !{metadata !"0x102"}), !dbg !57 + tail call void @llvm.dbg.value(metadata !58, i64 0, 
metadata !59, metadata !{metadata !"0x102"}), !dbg !60 %arraydecay.i = getelementptr inbounds %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !61 call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !61 - call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34), !dbg !63 - call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !64), !dbg !65 - call void @llvm.dbg.value(metadata !58, i64 0, metadata !66), !dbg !67 + call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !63 + call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !64, metadata !{metadata !"0x102"}), !dbg !65 + call void @llvm.dbg.value(metadata !58, i64 0, metadata !66, metadata !{metadata !"0x102"}), !dbg !67 %arraydecay.i5 = getelementptr inbounds %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !68 call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !68 %tobool1 = icmp eq i32 %param1, 0, !dbg !69 - call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34), !dbg !63 + call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !63 br i1 %tobool1, label %if.else, label %if.then2, !dbg !69 if.then2: ; preds = %if.end - call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !71), !dbg !73 - call void @llvm.dbg.value(metadata !74, i64 0, metadata !75), !dbg !76 + call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !71, metadata !{metadata !"0x102"}), !dbg !73 + call void @llvm.dbg.value(metadata !74, i64 0, metadata !75, metadata !{metadata !"0x102"}), !dbg !76 call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)), !dbg !76 br label %if.end3, !dbg !72 if.else: ; preds = %if.end - 
call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !77), !dbg !79 - call void @llvm.dbg.value(metadata !80, i64 0, metadata !81), !dbg !82 + call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !77, metadata !{metadata !"0x102"}), !dbg !79 + call void @llvm.dbg.value(metadata !80, i64 0, metadata !81, metadata !{metadata !"0x102"}), !dbg !82 call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)), !dbg !82 br label %if.end3 if.end3: ; preds = %if.else, %if.then2 - call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33), !dbg !55 - call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !83), !dbg !85 - call void @llvm.dbg.value(metadata !58, i64 0, metadata !86), !dbg !87 + call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33, metadata !{metadata !"0x102"}), !dbg !55 + call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !83, metadata !{metadata !"0x102"}), !dbg !85 + call void @llvm.dbg.value(metadata !58, i64 0, metadata !86, metadata !{metadata !"0x102"}), !dbg !87 call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !87 ret void, !dbg !88 } @@ -103,7 +103,7 @@ declare i8* @_Z5i2stri(i32) #1 declare void @_Z3fooPcjPKc(i8*, i32, i8*) #1 ; Function Attrs: nounwind readnone -declare void @llvm.dbg.value(metadata, i64, metadata) #2 +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2 attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" 
"use-soft-float"="false" } @@ -113,92 +113,92 @@ attributes #2 = { nounwind readnone } !llvm.module.flags = !{!44, !45} !llvm.ident = !{!46} -!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !3, metadata !23, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus] +!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !23, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus] !1 = metadata !{metadata !"dbg-changes-codegen-branch-folding.cpp", metadata !"/tmp/dbginfo"} !2 = metadata !{} !3 = metadata !{metadata !4} -!4 = metadata !{i32 786451, metadata !1, null, metadata !"AAA3", i32 4, i64 32, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ] +!4 = metadata !{metadata !"0x13\00AAA3\004\0032\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ] !5 = metadata !{metadata !6, metadata !11, metadata !17, metadata !18} -!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS4AAA3", metadata !"text", i32 8, i64 32, i64 8, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ] -!7 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 32, i64 8, i32 0, i32 0, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char] -!8 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc 
DW_ATE_signed_char] +!6 = metadata !{metadata !"0xd\00text\008\0032\008\000\000", metadata !1, metadata !"_ZTS4AAA3", metadata !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ] +!7 = metadata !{metadata !"0x1\00\000\0032\008\000\000", null, null, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char] +!8 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] !9 = metadata !{metadata !10} -!10 = metadata !{i32 786465, i64 0, i64 4} ; [ DW_TAG_subrange_type ] [0, 3] -!11 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"AAA3", metadata !"AAA3", metadata !"", i32 5, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 5} ; [ DW_TAG_subprogram ] [line 5] [AAA3] -!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!10 = metadata !{metadata !"0x21\000\004"} ; [ DW_TAG_subrange_type ] [0, 3] +!11 = metadata !{metadata !"0x2e\00AAA3\00AAA3\00\005\000\000\000\006\00256\001\005", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [AAA3] +!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !13 = metadata !{null, metadata !14, metadata !15} -!14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3] -!15 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ 
DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ] -!16 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char] -!17 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator=", metadata !"operator=", metadata !"_ZN4AAA3aSEPKc", i32 6, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 6} ; [ DW_TAG_subprogram ] [line 6] [operator=] -!18 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator const char *", metadata !"operator const char *", metadata !"_ZNK4AAA3cvPKcEv", i32 7, metadata !19, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 7} ; [ DW_TAG_subprogram ] [line 7] [operator const char *] -!19 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3] +!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ] +!16 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char] +!17 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\000\000\006\00256\001\006", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 6] [operator=] +!18 = metadata !{metadata !"0x2e\00operator const char *\00operator const char *\00_ZNK4AAA3cvPKcEv\007\000\000\000\006\00256\001\007", metadata !1, metadata !"_ZTS4AAA3", 
metadata !19, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [operator const char *] +!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !20 = metadata !{metadata !15, metadata !21} -!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ] -!22 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3] +!21 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ] +!22 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3] !23 = metadata !{metadata !24, metadata !35, metadata !40} -!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"bar", metadata !"bar", metadata !"_Z3barii", i32 11, metadata !26, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32, i32)* @_Z3barii, null, null, metadata !29, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [bar] -!25 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] -!26 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!24 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barii\0011\000\001\000\006\00256\001\0011", metadata !1, metadata !25, metadata !26, null, void (i32, i32)* @_Z3barii, null, null, metadata !29} ; [ DW_TAG_subprogram ] [line 11] [def] [bar] 
+!25 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] +!26 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !27 = metadata !{null, metadata !28, metadata !28} -!28 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!28 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] !29 = metadata !{metadata !30, metadata !31, metadata !32, metadata !33, metadata !34} -!30 = metadata !{i32 786689, metadata !24, metadata !"param1", metadata !25, i32 16777227, metadata !28, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [param1] [line 11] -!31 = metadata !{i32 786689, metadata !24, metadata !"param2", metadata !25, i32 33554443, metadata !28, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [param2] [line 11] -!32 = metadata !{i32 786688, metadata !24, metadata !"temp", metadata !25, i32 12, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [temp] [line 12] -!33 = metadata !{i32 786688, metadata !24, metadata !"var1", metadata !25, i32 17, metadata !"_ZTS4AAA3", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var1] [line 17] -!34 = metadata !{i32 786688, metadata !24, metadata !"var2", metadata !25, i32 18, metadata !"_ZTS4AAA3", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var2] [line 18] -!35 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator=", metadata !"operator=", metadata !"_ZN4AAA3aSEPKc", i32 6, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !17, metadata !36, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=] +!30 = metadata !{metadata !"0x101\00param1\0016777227\000", 
metadata !24, metadata !25, metadata !28} ; [ DW_TAG_arg_variable ] [param1] [line 11] +!31 = metadata !{metadata !"0x101\00param2\0033554443\000", metadata !24, metadata !25, metadata !28} ; [ DW_TAG_arg_variable ] [param2] [line 11] +!32 = metadata !{metadata !"0x100\00temp\0012\000", metadata !24, metadata !25, metadata !15} ; [ DW_TAG_auto_variable ] [temp] [line 12] +!33 = metadata !{metadata !"0x100\00var1\0017\000", metadata !24, metadata !25, metadata !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var1] [line 17] +!34 = metadata !{metadata !"0x100\00var2\0018\000", metadata !24, metadata !25, metadata !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var2] [line 18] +!35 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\001\000\006\00256\001\006", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, metadata !17, metadata !36} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=] !36 = metadata !{metadata !37, metadata !39} -!37 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0] -!38 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3] -!39 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [value] [line 6] -!40 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"AAA3", metadata !"AAA3", metadata !"_ZN4AAA3C2EPKc", i32 5, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !11, metadata !41, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3] +!37 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38} ; [ DW_TAG_arg_variable ] [this] [line 0] +!38 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, 
metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3] +!39 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15} ; [ DW_TAG_arg_variable ] [value] [line 6] +!40 = metadata !{metadata !"0x2e\00AAA3\00AAA3\00_ZN4AAA3C2EPKc\005\000\001\000\006\00256\001\005", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, metadata !11, metadata !41} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3] !41 = metadata !{metadata !42, metadata !43} -!42 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0] -!43 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [value] [line 5] +!42 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38} ; [ DW_TAG_arg_variable ] [this] [line 0] +!43 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15} ; [ DW_TAG_arg_variable ] [value] [line 5] !44 = metadata !{i32 2, metadata !"Dwarf Version", i32 4} -!45 = metadata !{i32 2, metadata !"Debug Info Version", i32 1} +!45 = metadata !{i32 2, metadata !"Debug Info Version", i32 2} !46 = metadata !{metadata !"clang version 3.5.0 "} !47 = metadata !{i32 11, i32 0, metadata !24, null} !48 = metadata !{i8* null} !49 = metadata !{i32 12, i32 0, metadata !24, null} !50 = metadata !{i32 14, i32 0, metadata !51, null} -!51 = metadata !{i32 786443, metadata !1, metadata !24, i32 14, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] +!51 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] !52 = metadata !{i32 15, i32 0, metadata !53, null} -!53 = metadata !{i32 786443, metadata !1, metadata !51, i32 14, 
i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] +!53 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !51} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] !54 = metadata !{i32 16, i32 0, metadata !53, null} !55 = metadata !{i32 17, i32 0, metadata !24, null} -!56 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !55} ; [ DW_TAG_arg_variable ] [this] [line 0] +!56 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38, metadata !55} ; [ DW_TAG_arg_variable ] [this] [line 0] !57 = metadata !{i32 0, i32 0, metadata !40, metadata !55} !58 = metadata !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)} -!59 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, metadata !55} ; [ DW_TAG_arg_variable ] [value] [line 5] +!59 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15, metadata !55} ; [ DW_TAG_arg_variable ] [value] [line 5] !60 = metadata !{i32 5, i32 0, metadata !40, metadata !55} !61 = metadata !{i32 5, i32 0, metadata !62, metadata !55} -!62 = metadata !{i32 786443, metadata !1, metadata !40, i32 5, i32 0, i32 0, i32 3} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] +!62 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !40} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] !63 = metadata !{i32 18, i32 0, metadata !24, null} -!64 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !63} ; [ DW_TAG_arg_variable ] [this] [line 0] +!64 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38, metadata !63} ; [ DW_TAG_arg_variable ] [this] [line 0] !65 = metadata !{i32 0, i32 0, metadata !40, metadata 
!63} -!66 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, metadata !63} ; [ DW_TAG_arg_variable ] [value] [line 5] +!66 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15, metadata !63} ; [ DW_TAG_arg_variable ] [value] [line 5] !67 = metadata !{i32 5, i32 0, metadata !40, metadata !63} !68 = metadata !{i32 5, i32 0, metadata !62, metadata !63} !69 = metadata !{i32 20, i32 0, metadata !70, null} -!70 = metadata !{i32 786443, metadata !1, metadata !24, i32 20, i32 0, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] -!71 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !72} ; [ DW_TAG_arg_variable ] [this] [line 0] +!70 = metadata !{metadata !"0xb\0020\000\000", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] +!71 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !72} ; [ DW_TAG_arg_variable ] [this] [line 0] !72 = metadata !{i32 21, i32 0, metadata !70, null} !73 = metadata !{i32 0, i32 0, metadata !35, metadata !72} !74 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)} -!75 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !72} ; [ DW_TAG_arg_variable ] [value] [line 6] +!75 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !72} ; [ DW_TAG_arg_variable ] [value] [line 6] !76 = metadata !{i32 6, i32 0, metadata !35, metadata !72} -!77 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !78} ; [ DW_TAG_arg_variable ] [this] [line 0] +!77 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !78} ; [ 
DW_TAG_arg_variable ] [this] [line 0] !78 = metadata !{i32 23, i32 0, metadata !70, null} !79 = metadata !{i32 0, i32 0, metadata !35, metadata !78} !80 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)} -!81 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !78} ; [ DW_TAG_arg_variable ] [value] [line 6] +!81 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !78} ; [ DW_TAG_arg_variable ] [value] [line 6] !82 = metadata !{i32 6, i32 0, metadata !35, metadata !78} -!83 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !84} ; [ DW_TAG_arg_variable ] [this] [line 0] +!83 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !84} ; [ DW_TAG_arg_variable ] [this] [line 0] !84 = metadata !{i32 24, i32 0, metadata !24, null} !85 = metadata !{i32 0, i32 0, metadata !35, metadata !84} -!86 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !84} ; [ DW_TAG_arg_variable ] [value] [line 6] +!86 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !84} ; [ DW_TAG_arg_variable ] [value] [line 6] !87 = metadata !{i32 6, i32 0, metadata !35, metadata !84} !88 = metadata !{i32 25, i32 0, metadata !24, null} diff --git a/test/CodeGen/X86/dbg-changes-codegen.ll b/test/CodeGen/X86/dbg-changes-codegen.ll index 0b17c45..aae95e8 100644 --- a/test/CodeGen/X86/dbg-changes-codegen.ll +++ b/test/CodeGen/X86/dbg-changes-codegen.ll @@ -44,7 +44,7 @@ define zeroext i1 @_ZN3Foo3batEv(%struct.Foo* %this) #0 align 2 { entry: %0 = load %struct.Foo** @pfoo, align 8 - tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62) + tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62, metadata 
!{metadata !"0x102"}) %cmp.i = icmp eq %struct.Foo* %0, %this ret i1 %cmp.i } @@ -53,7 +53,7 @@ entry: define void @_Z3bazv() #1 { entry: %0 = load %struct.Wibble** @wibble1, align 8 - tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65) + tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65, metadata !{metadata !"0x102"}) %1 = load %struct.Wibble** @wibble2, align 8 %cmp.i = icmp ugt %struct.Wibble* %1, %0 br i1 %cmp.i, label %if.then.i, label %_ZN7Flibble3barEP6Wibble.exit @@ -69,15 +69,15 @@ _ZN7Flibble3barEP6Wibble.exit: ; preds = %entry, %if.then.i } ; Function Attrs: nounwind readnone -declare void @llvm.dbg.value(metadata, i64, metadata) #2 +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2 attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind readnone } -!17 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo] -!45 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble] -!62 = metadata !{i32 786689, null, metadata !"arg", null, i32 33554436, metadata !17, i32 0, null} ; [ DW_TAG_arg_variable ] [arg] [line 4] +!17 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo] +!45 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ 
DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble] +!62 = metadata !{metadata !"0x101\00arg\0033554436\000", null, null, metadata !17} ; [ DW_TAG_arg_variable ] [arg] [line 4] !64 = metadata !{%struct.Flibble* undef} -!65 = metadata !{i32 786689, null, metadata !"this", null, i32 16777229, metadata !45, i32 1088, null} ; [ DW_TAG_arg_variable ] [this] [line 13] +!65 = metadata !{metadata !"0x101\00this\0016777229\001088", null, null, metadata !45} ; [ DW_TAG_arg_variable ] [this] [line 13] diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll index 21225e3..fd07a3f 100644 --- a/test/CodeGen/X86/divide-by-constant.ll +++ b/test/CodeGen/X86/divide-by-constant.ll @@ -31,6 +31,7 @@ entry: ; CHECK-LABEL: test3: ; CHECK: movzbl 8(%esp), %eax ; CHECK-NEXT: imull $171, %eax +; CHECK-NEXT: andl $65024, %eax ; CHECK-NEXT: shrl $9, %eax ; CHECK-NEXT: ret } @@ -56,9 +57,10 @@ entry: %div = sdiv i16 %x, 10 ret i16 %div ; CHECK-LABEL: test6: -; CHECK: imull $26215, %eax, %ecx -; CHECK: sarl $18, %ecx -; CHECK: shrl $15, %eax +; CHECK: imull $26215, %eax +; CHECK: movl %eax, %ecx +; CHECK: shrl $31, %ecx +; CHECK: sarl $18, %eax } define i32 @test7(i32 %x) nounwind { diff --git a/test/CodeGen/X86/divrem8_ext.ll b/test/CodeGen/X86/divrem8_ext.ll new file mode 100644 index 0000000..ec367c8 --- /dev/null +++ b/test/CodeGen/X86/divrem8_ext.ll @@ -0,0 +1,100 @@ +; RUN: llc -march=x86-64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-64 +; RUN: llc -march=x86 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-32 +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +define zeroext i8 @test_udivrem_zext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_udivrem_zext_ah +; CHECK: divb +; CHECK: movzbl %ah, [[REG_REM:%[a-z0-9]+]] +; CHECK: movb %al, ([[REG_ZPTR:%[a-z0-9]+]]) +; CHECK: movl [[REG_REM]], %eax +; CHECK: ret + %div = udiv i8 %x, %y + store i8 
%div, i8* @z + %1 = urem i8 %x, %y + ret i8 %1 +} + +define zeroext i8 @test_urem_zext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_zext_ah +; CHECK: divb +; CHECK: movzbl %ah, %eax +; CHECK: ret + %1 = urem i8 %x, %y + ret i8 %1 +} + +define i8 @test_urem_noext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_noext_ah +; CHECK: divb [[REG_X:%[a-z0-9]+]] +; CHECK: movzbl %ah, %eax +; CHECK: addb [[REG_X]], %al +; CHECK: ret + %1 = urem i8 %x, %y + %2 = add i8 %1, %y + ret i8 %2 +} + +define i64 @test_urem_zext64_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_zext64_ah +; CHECK: divb +; CHECK: movzbl %ah, %eax +; CHECK-32: xorl %edx, %edx +; CHECK: ret + %1 = urem i8 %x, %y + %2 = zext i8 %1 to i64 + ret i64 %2 +} + +define signext i8 @test_sdivrem_sext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_sdivrem_sext_ah +; CHECK: cbtw +; CHECK: idivb +; CHECK: movsbl %ah, [[REG_REM:%[a-z0-9]+]] +; CHECK: movb %al, ([[REG_ZPTR]]) +; CHECK: movl [[REG_REM]], %eax +; CHECK: ret + %div = sdiv i8 %x, %y + store i8 %div, i8* @z + %1 = srem i8 %x, %y + ret i8 %1 +} + +define signext i8 @test_srem_sext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_sext_ah +; CHECK: cbtw +; CHECK: idivb +; CHECK: movsbl %ah, %eax +; CHECK: ret + %1 = srem i8 %x, %y + ret i8 %1 +} + +define i8 @test_srem_noext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_noext_ah +; CHECK: cbtw +; CHECK: idivb [[REG_X:%[a-z0-9]+]] +; CHECK: movsbl %ah, %eax +; CHECK: addb [[REG_X]], %al +; CHECK: ret + %1 = srem i8 %x, %y + %2 = add i8 %1, %y + ret i8 %2 +} + +define i64 @test_srem_sext64_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_sext64_ah +; CHECK: cbtw +; CHECK: idivb +; CHECK: movsbl %ah, %eax +; CHECK-32: movl %eax, %edx +; CHECK-32: sarl $31, %edx +; CHECK-64: movsbq %al, %rax +; CHECK: ret + %1 = srem i8 %x, %y + %2 = sext i8 %1 to i64 + ret i64 %2 +} + +@z = external global i8 diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll index 0d5afa1..c673f5d 100644 --- 
a/test/CodeGen/X86/dllexport-x86_64.ll +++ b/test/CodeGen/X86/dllexport-x86_64.ll @@ -70,7 +70,7 @@ define weak_odr dllexport void @weak1() { ; CHECK: .weak weak_alias ; CHECK: weak_alias = f1 -@weak_alias = dllexport alias weak_odr void()* @f1 +@weak_alias = weak_odr dllexport alias void()* @f1 @blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16 @blob_alias = dllexport alias bitcast ([6 x i8]* @blob to i32 ()*) diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll index e2c3f13..5035aa1 100644 --- a/test/CodeGen/X86/dllexport.ll +++ b/test/CodeGen/X86/dllexport.ll @@ -89,7 +89,7 @@ define weak_odr dllexport void @weak1() { ; CHECK: .weak _weak_alias ; CHECK: _weak_alias = _f1 -@weak_alias = dllexport alias weak_odr void()* @f1 +@weak_alias = weak_odr dllexport alias void()* @f1 ; CHECK: .section .drectve diff --git a/test/CodeGen/X86/dllimport-x86_64.ll b/test/CodeGen/X86/dllimport-x86_64.ll index 666409f..839bca4 100644 --- a/test/CodeGen/X86/dllimport-x86_64.ll +++ b/test/CodeGen/X86/dllimport-x86_64.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple x86_64-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST ; PR6275 ; -; RUN: opt -mtriple x86_64-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT +; RUN: opt -mtriple x86_64-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT @Var1 = external dllimport global i32 @Var2 = available_externally dllimport unnamed_addr constant i32 1 diff --git a/test/CodeGen/X86/dllimport.ll b/test/CodeGen/X86/dllimport.ll index 695bfce..231ad65 100644 --- a/test/CodeGen/X86/dllimport.ll +++ b/test/CodeGen/X86/dllimport.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple i386-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST ; PR6275 ; -; RUN: opt -mtriple i386-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT +; RUN: opt -mtriple i386-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT @Var1 = external dllimport global i32 @Var2 = available_externally dllimport unnamed_addr 
constant i32 1 diff --git a/test/CodeGen/X86/dont-trunc-store-double-to-float.ll b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll new file mode 100644 index 0000000..24d9533 --- /dev/null +++ b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll @@ -0,0 +1,20 @@ +; RUN: llc -march=x86 < %s | FileCheck %s + +; CHECK-LABEL: @bar +; CHECK: movl $1074339512, +; CHECK: movl $1374389535, +; CHECK: movl $1078523331, +define void @bar() unnamed_addr { +entry-block: + %a = alloca double + %b = alloca float + + store double 3.140000e+00, double* %a + %0 = load double* %a + + %1 = fptrunc double %0 to float + + store float %1, float* %b + + ret void +} diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll index c8d7527..872f7fa 100644 --- a/test/CodeGen/X86/dwarf-comp-dir.ll +++ b/test/CodeGen/X86/dwarf-comp-dir.ll @@ -7,15 +7,15 @@ target triple = "x86_64-unknown-linux-gnu" !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!5} -!0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] +!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 143523)\001\00\000\00\000", metadata !4, metadata !2, metadata !7, metadata !2, metadata !2, null} ; [ DW_TAG_compile_unit ] !2 = metadata !{} -!3 = metadata !{i32 786473, metadata !4} ; [ DW_TAG_file_type ] +!3 = metadata !{metadata !"0x29", metadata !4} ; [ DW_TAG_file_type ] !4 = metadata !{metadata !"empty.c", metadata !"/home/nlewycky"} -!6 = metadata !{i32 786451, metadata !4, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ] +!6 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !4, null, null, metadata !2, null, null, metadata !"_ZTS3foo"} ; [ 
DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ] !7 = metadata !{metadata !6} ; The important part of the following check is that dir = #0. ; Dir Mod Time File Len File Name ; ---- ---------- ---------- --------------------------- ; CHECK: file_names[ 1] 0 0x00000000 0x00000000 empty.c -!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/dynamic-alloca-lifetime.ll b/test/CodeGen/X86/dynamic-alloca-lifetime.ll new file mode 100644 index 0000000..f019bed --- /dev/null +++ b/test/CodeGen/X86/dynamic-alloca-lifetime.ll @@ -0,0 +1,44 @@ +; RUN: llc -no-stack-coloring=false < %s | FileCheck %s + +; This test crashed in PEI because the stack protector was dead. +; This was due to it being colored, which was in turn due to incorrect +; lifetimes being applied to the stack protector frame index. + +; CHECK: stack_chk_guard + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.10.0" + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: ssp +define void @foo(i1 %cond1, i1 %cond2) #1 { +entry: + %bitmapBuffer = alloca [8192 x i8], align 1 + br i1 %cond1, label %end1, label %bb1 + +bb1: + %bitmapBuffer229 = alloca [8192 x i8], align 1 + br i1 %cond2, label %end1, label %if.else130 + +end1: + ret void + +if.else130: ; preds = %bb1 + %tmp = getelementptr inbounds [8192 x i8]* %bitmapBuffer, i32 0, i32 0 + call void @llvm.lifetime.start(i64 8192, i8* %tmp) #0 + call void @llvm.lifetime.end(i64 8192, i8* %tmp) #0 + %tmp25 = getelementptr inbounds [8192 x i8]* %bitmapBuffer229, i32 0, i32 0 + call void @llvm.lifetime.start(i64 8192, i8* %tmp25) #0 + call void @llvm.lifetime.end(i64 8192, i8* %tmp25) #0 + br label %end1 +} + +declare void @bar() + +attributes 
#0 = { nounwind } +attributes #1 = { ssp }
\ No newline at end of file diff --git a/test/CodeGen/X86/empty-functions.ll b/test/CodeGen/X86/empty-functions.ll index ac5174d..4234968 100644 --- a/test/CodeGen/X86/empty-functions.ll +++ b/test/CodeGen/X86/empty-functions.ll @@ -1,10 +1,14 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=CHECK-NO-FP %s ; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=LINUX-NO-FP %s +; RUN: llc < %s -mtriple=x86_64-linux-gnu -disable-fp-elim | FileCheck -check-prefix=LINUX-FP %s define void @func() { entry: unreachable } + +; MachO cannot handle an empty function. ; CHECK-NO-FP: _func: ; CHECK-NO-FP-NEXT: .cfi_startproc ; CHECK-NO-FP: nop @@ -21,5 +25,30 @@ entry: ; CHECK-FP-NEXT: movq %rsp, %rbp ; CHECK-FP-NEXT: : ; CHECK-FP-NEXT: .cfi_def_cfa_register %rbp -; CHECK-FP-NEXT: nop ; CHECK-FP-NEXT: .cfi_endproc + +; An empty function is perfectly fine on ELF. +; LINUX-NO-FP: func: +; LINUX-NO-FP-NEXT: .cfi_startproc +; LINUX-NO-FP-NEXT: {{^}}# +; LINUX-NO-FP-NEXT: {{^}}.L{{.*}}:{{$}} +; LINUX-NO-FP-NEXT: .size func, .L{{.*}}-func +; LINUX-NO-FP-NEXT: .cfi_endproc + +; A cfi directive can point to the end of a function. It (and in fact the +; entire body) could be optimized out because of the unreachable, but we +; don't do it right now. 
+; LINUX-FP: func: +; LINUX-FP-NEXT: .cfi_startproc +; LINUX-FP-NEXT: {{^}}# +; LINUX-FP-NEXT: pushq %rbp +; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} +; LINUX-FP-NEXT: .cfi_def_cfa_offset 16 +; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} +; LINUX-FP-NEXT: .cfi_offset %rbp, -16 +; LINUX-FP-NEXT: movq %rsp, %rbp +; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}} +; LINUX-FP-NEXT: .cfi_def_cfa_register %rbp +; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}} +; LINUX-FP-NEXT: .size func, .Ltmp3-func +; LINUX-FP-NEXT: .cfi_endproc diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll index a18f751..ab92fe0 100644 --- a/test/CodeGen/X86/exedepsfix-broadcast.ll +++ b/test/CodeGen/X86/exedepsfix-broadcast.ll @@ -93,10 +93,11 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> % ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg -; ExeDepsFix works top down, thus it coalesces vmovlhps domain with -; vandps and there is nothing more you can do to match vmaxpd. -; CHECK: vmovlhps -; CHECK: vandps +; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with +; vpand and there is nothing more you can do to match vmaxpd. 
+; CHECK: vmovq +; CHECK: vpbroadcastq +; CHECK: vpand ; CHECK: vmaxpd ; CHECK: ret define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) { diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll index cadc0fb..8647599 100644 --- a/test/CodeGen/X86/extractelement-load.ll +++ b/test/CodeGen/X86/extractelement-load.ll @@ -1,6 +1,8 @@ ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=yonah | FileCheck %s ; RUN: llc < %s -march=x86-64 -mattr=+sse2 -mcpu=core2 | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + define i32 @t(<2 x i64>* %val) nounwind { ; CHECK-LABEL: t: ; CHECK-NOT: movd @@ -23,3 +25,40 @@ undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3> %y = extractelement <8 x i32> %Shuff68, i32 0 ret i32 %y } + +; This case could easily end up inf-looping in the DAG combiner due to an +; low alignment load of the vector which prevents us from reliably forming a +; narrow load. +; FIXME: It would be nice to detect whether the target has fast and legal +; unaligned loads and use them here. +define void @t3() { +; CHECK-LABEL: t3: +; +; This movs the entire vector, shuffling the high double down. If we fixed the +; FIXME above it would just move the high double directly. +; CHECK: movupd +; CHECK: shufpd +; CHECK: movlpd + +bb: + %tmp13 = load <2 x double>* undef, align 1 + %.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1 + store double %.sroa.3.24.vec.extract, double* undef, align 8 + unreachable +} + +; Case where a load is unary shuffled, then bitcast (to a type with the same +; number of elements) before extractelement. +; This is testing for an assertion - the extraction was assuming that the undef +; second shuffle operand was a post-bitcast type instead of a pre-bitcast type. 
+define i64 @t4(<2 x double>* %a) { +; CHECK-LABEL: t4: +; CHECK: mov +; CHECK: ret + %b = load <2 x double>* %a, align 16 + %c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0> + %d = bitcast <2 x double> %c to <2 x i64> + %e = extractelement <2 x i64> %d, i32 1 + ret i64 %e +} + diff --git a/test/CodeGen/X86/fast-isel-args-fail.ll b/test/CodeGen/X86/fast-isel-args-fail.ll index 7467edd..7e783d2 100644 --- a/test/CodeGen/X86/fast-isel-args-fail.ll +++ b/test/CodeGen/X86/fast-isel-args-fail.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-apple-darwin10 ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN32 ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win64 | FileCheck %s -check-prefix=WIN64 -; REQUIRES: asserts ; Previously, this would cause an assert. define i31 @t1(i31 %a, i31 %b, i31 %c) { diff --git a/test/CodeGen/X86/fast-isel-cmp-branch3.ll b/test/CodeGen/X86/fast-isel-cmp-branch3.ll index a3f6851..0df782d 100644 --- a/test/CodeGen/X86/fast-isel-cmp-branch3.ll +++ b/test/CodeGen/X86/fast-isel-cmp-branch3.ll @@ -351,7 +351,7 @@ bb1: define i32 @icmp_eq(i32 %x) { ; CHECK-LABEL: icmp_eq ; CHECK-NOT: cmpl -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax %1 = icmp eq i32 %x, %x br i1 %1, label %bb1, label %bb2 bb2: @@ -387,7 +387,7 @@ bb1: define i32 @icmp_uge(i32 %x) { ; CHECK-LABEL: icmp_uge ; CHECK-NOT: cmpl -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax %1 = icmp uge i32 %x, %x br i1 %1, label %bb1, label %bb2 bb2: @@ -411,7 +411,7 @@ bb1: define i32 @icmp_ule(i32 %x) { ; CHECK-LABEL: icmp_ule ; CHECK-NOT: cmpl -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax %1 = icmp ule i32 %x, %x br i1 %1, label %bb1, label %bb2 bb2: @@ -435,7 +435,7 @@ bb1: define i32 @icmp_sge(i32 %x) { ; CHECK-LABEL: icmp_sge ; CHECK-NOT: cmpl -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax %1 = icmp sge i32 %x, %x br i1 %1, label %bb1, label 
%bb2 bb2: @@ -459,7 +459,7 @@ bb1: define i32 @icmp_sle(i32 %x) { ; CHECK-LABEL: icmp_sle ; CHECK-NOT: cmpl -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax %1 = icmp sle i32 %x, %x br i1 %1, label %bb1, label %bb2 bb2: diff --git a/test/CodeGen/X86/fast-isel-constpool.ll b/test/CodeGen/X86/fast-isel-constpool.ll index bbbaeb2..4e6f7c0 100644 --- a/test/CodeGen/X86/fast-isel-constpool.ll +++ b/test/CodeGen/X86/fast-isel-constpool.ll @@ -1,19 +1,23 @@ -; RUN: llc < %s -fast-isel | FileCheck %s -; CHECK: LCPI0_0(%rip) +; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=small < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=large < %s | FileCheck %s --check-prefix=LARGE -; Make sure fast isel uses rip-relative addressing when required. -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" -target triple = "x86_64-apple-darwin9.0" +; Make sure fast isel uses rip-relative addressing for the small code model. 
+define float @constpool_float(float %x) { +; CHECK-LABEL: constpool_float +; CHECK: LCPI0_0(%rip) -define i32 @f0(double %x) nounwind { -entry: - %retval = alloca i32 ; <i32*> [#uses=2] - %x.addr = alloca double ; <double*> [#uses=2] - store double %x, double* %x.addr - %tmp = load double* %x.addr ; <double> [#uses=1] - %cmp = fcmp olt double %tmp, 8.500000e-01 ; <i1> [#uses=1] - %conv = zext i1 %cmp to i32 ; <i32> [#uses=1] - store i32 %conv, i32* %retval - %0 = load i32* %retval ; <i32> [#uses=1] - ret i32 %0 +; LARGE-LABEL: constpool_float +; LARGE: movabsq $LCPI0_0, %rax + %1 = fadd float %x, 16.50e+01 + ret float %1 +} + +define double @constpool_double(double %x) nounwind { +; CHECK-LABEL: constpool_double +; CHECK: LCPI1_0(%rip) + +; LARGE-LABEL: constpool_double +; LARGE: movabsq $LCPI1_0, %rax + %1 = fadd double %x, 8.500000e-01 + ret double %1 } diff --git a/test/CodeGen/X86/fast-isel-mem.ll b/test/CodeGen/X86/fast-isel-mem.ll index cd2dc1d..eca1ae9 100644 --- a/test/CodeGen/X86/fast-isel-mem.ll +++ b/test/CodeGen/X86/fast-isel-mem.ll @@ -36,11 +36,11 @@ entry: store i32 (...)** getelementptr ([4 x i32 (...)*]* @LotsStuff, i32 0, i32 2), i32 (...)*** null, align 4 ret void ; CHECK: _t: -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax ; CHECK: movl L_LotsStuff$non_lazy_ptr, %ecx ; ATOM: _t: ; ATOM: movl L_LotsStuff$non_lazy_ptr, %e{{..}} -; ATOM: movl $0, %e{{..}} +; ATOM: xorl %e{{..}}, %e{{..}} } diff --git a/test/CodeGen/X86/fast-isel-tls.ll b/test/CodeGen/X86/fast-isel-tls.ll index f71abd2..686df43 100644 --- a/test/CodeGen/X86/fast-isel-tls.ll +++ b/test/CodeGen/X86/fast-isel-tls.ll @@ -13,7 +13,7 @@ entry: ; CHECK: leal v@TLSGD ; CHECK: __tls_get_addr -@alias = alias internal i32* @v +@alias = internal alias i32* @v define i32 @f_alias() nounwind { entry: %t = load i32* @v diff --git a/test/CodeGen/X86/fast-isel-x32.ll b/test/CodeGen/X86/fast-isel-x32.ll new file mode 100644 index 0000000..d49a108 --- /dev/null +++ 
b/test/CodeGen/X86/fast-isel-x32.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel -fast-isel-abort | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-nacl -fast-isel -fast-isel-abort | FileCheck %s + +; Test that alloca addresses are materialized with the right size instruction. + +declare void @bar(i32* %arg) + +; CHECK-LABEL: @foo +define void @foo() { + %a = alloca i32 +; CHECK: leal {{.*}}, %edi + call void @bar(i32* %a) + ret void +} diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll index f7d2750..3747d04 100644 --- a/test/CodeGen/X86/fast-isel-x86-64.ll +++ b/test/CodeGen/X86/fast-isel-x86-64.ll @@ -144,7 +144,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-LABEL: test12: ; CHECK: testb $1, ; CHECK-NEXT: je L -; CHECK-NEXT: movl $0, %edi +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: callq } @@ -154,7 +154,7 @@ define void @test13() nounwind { call void @test13f(i1 0) ret void ; CHECK-LABEL: test13: -; CHECK: movl $0, %edi +; CHECK: xorl %edi, %edi ; CHECK-NEXT: callq } @@ -194,12 +194,10 @@ define void @test16() nounwind { br label %block2 block2: -; CHECK: movabsq $1 -; CHECK: cvtsi2sdq {{.*}} %xmm0 +; CHECK: movsd LCP{{.*}}_{{.*}}(%rip), %xmm0 ; CHECK: movb $1, %al ; CHECK: callq _test16callee -; AVX: movabsq $1 ; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0 ; AVX: movb $1, %al ; AVX: callq _test16callee @@ -280,7 +278,7 @@ entry: call void @foo22(i32 3) ret void ; CHECK-LABEL: test22: -; CHECK: movl $0, %edi +; CHECK: xorl %edi, %edi ; CHECK: callq _foo22 ; CHECK: movl $1, %edi ; CHECK: callq _foo22 @@ -304,3 +302,13 @@ define void @test23(i8* noalias sret %result) { } declare i8* @foo23() + +declare void @takesi32ptr(i32* %arg) + +; CHECK-LABEL: allocamaterialize +define void @allocamaterialize() { + %a = alloca i32 +; CHECK: leaq {{.*}}, %rdi + call void @takesi32ptr(i32* %a) + ret void +} diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll index a212a7c..61e9b98 
100644 --- a/test/CodeGen/X86/fast-isel-x86.ll +++ b/test/CodeGen/X86/fast-isel-x86.ll @@ -60,3 +60,21 @@ entry: ; CHECK: addl $28 } declare fastcc void @test4fastccsret(%struct.a* sret) + + +; Check that fast-isel cleans up when it fails to lower a call instruction. +define void @test5() { +entry: + %call = call i32 @test5dllimport(i32 42) + ret void +; CHECK-LABEL: test5: +; Local value area is still there: +; CHECK: movl $42, {{%[a-z]+}} +; Fast-ISel's arg push is not here: +; CHECK-NOT: movl $42, (%esp) +; SDag-ISel's arg push: +; CHECK: movl %esp, [[REGISTER:%[a-z]+]] +; CHECK: movl $42, ([[REGISTER]]) +; CHECK: movl __imp__test5dllimport +} +declare dllimport i32 @test5dllimport(i32) diff --git a/test/CodeGen/X86/fastmath-optnone.ll b/test/CodeGen/X86/fastmath-optnone.ll new file mode 100644 index 0000000..0caadff --- /dev/null +++ b/test/CodeGen/X86/fastmath-optnone.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -mcpu=corei7 -march=x86-64 -mattr=+sse2 | FileCheck %s +; Verify that floating-point operations inside 'optnone' functions +; are not optimized even if unsafe-fp-math is set. 
+ +define float @foo(float %x) #0 { +entry: + %add = fadd fast float %x, %x + %add1 = fadd fast float %add, %x + ret float %add1 +} + +; CHECK-LABEL: @foo +; CHECK-NOT: add +; CHECK: mul +; CHECK-NOT: add +; CHECK: ret + +define float @fooWithOptnone(float %x) #1 { +entry: + %add = fadd fast float %x, %x + %add1 = fadd fast float %add, %x + ret float %add1 +} + +; CHECK-LABEL: @fooWithOptnone +; CHECK-NOT: mul +; CHECK: add +; CHECK-NOT: mul +; CHECK: add +; CHECK-NOT: mul +; CHECK: ret + + +attributes #0 = { "unsafe-fp-math"="true" } +attributes #1 = { noinline optnone "unsafe-fp-math"="true" } diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma-intrinsics-x86_64.ll index 494cb28..aadd731 100644 --- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll +++ b/test/CodeGen/X86/fma-intrinsics-x86_64.ll @@ -1,316 +1,278 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK ; VFMADD define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmaddss - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] - ret < 4 x float > %res -} -define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 
4 x float > %a0, < 4 x float > %a1, float* %a2) { - ; CHECK: vfmaddss (%{{.*}}) - %x = load float *%a2 - %y = insertelement <4 x float> undef, float %x, i32 0 - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) ; <i64> [#uses=1] - ret < 4 x float > %res -} -define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) { - ; CHECK: vfmaddss %{{.*}}, (%{{.*}}) - %x = load float *%a1 - %y = insertelement <4 x float> undef, float %x, i32 0 - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmaddss + ; CHECK-FMA: vfmadd213ss + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfmaddsd - %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] - ret < 2 x double > %res -} -define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) { - ; CHECK: vfmaddsd (%{{.*}}) - %x = load double *%a2 - %y = insertelement <2 x double> undef, double %x, i32 0 - %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) ; <i64> [#uses=1] - ret < 2 x double > %res -} -define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) { - ; CHECK: vfmaddsd %{{.*}}, (%{{.*}}) - %x = load double *%a1 - %y = insertelement <2 x double> undef, double %x, i32 0 - %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: 
vfmaddsd + ; CHECK-FMA: vfmadd213sd + %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 4 x float > @test_x86_fma_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmaddps - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] - ret < 4 x float > %res -} -define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) { - ; CHECK: vfmaddps (%{{.*}}) - %x = load <4 x float>* %a2 - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) ; <i64> [#uses=1] - ret < 4 x float > %res -} -define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) { - ; CHECK: vfmaddps %{{.*}}, (%{{.*}}) - %x = load <4 x float>* %a1 - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmaddps + ; CHECK-FMA: vfmadd213ps + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone -; To test execution dependency -define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) { - ; CHECK: vmovaps - ; CHECK: vfmaddps %{{.*}}, (%{{.*}}) - %x = load <4 x float>* %a0 - %y = load <4 x float>* %a1 - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1] - ret < 4 x float > %res -} - define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x 
double > %a2) { - ; CHECK: vfmaddpd - %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] - ret < 2 x double > %res -} -define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) { - ; CHECK: vfmaddpd (%{{.*}}) - %x = load <2 x double>* %a2 - %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) ; <i64> [#uses=1] - ret < 2 x double > %res -} -define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) { - ; CHECK: vfmaddpd %{{.*}}, (%{{.*}}) - %x = load <2 x double>* %a1 - %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmaddpd + ; CHECK-FMA: vfmadd213pd + %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone -; To test execution dependency -define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) { - ; CHECK: vmovapd - ; CHECK: vfmaddpd %{{.*}}, (%{{.*}}) - %x = load <2 x double>* %a0 - %y = load <2 x double>* %a1 - %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1] - ret < 2 x double > %res -} - define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfmaddps + ; CHECK-FMA4: vfmaddps + ; CHECK-FMA: vfmadd213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x 
float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfmaddpd + ; CHECK-FMA4: vfmaddpd + ; CHECK-FMA: vfmadd213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret < 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone ; VFMSUB define < 4 x float > @test_x86_fma_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmsubss - %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubss + ; CHECK-FMA: vfmsub213ss + %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfmsubsd - %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubsd + ; CHECK-FMA: vfmsub213sd + %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 4 x float > @test_x86_fma_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmsubps - %res = call 
< 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubps + ; CHECK-FMA: vfmsub213ps + %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfmsubpd - %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubpd + ; CHECK-FMA: vfmsub213pd + %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 8 x float > @test_x86_fma_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfmsubps + ; CHECK-FMA4: vfmsubps + ; CHECK-FMA: vfmsub213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfmsubpd + ; CHECK-FMA4: vfmsubpd + ; CHECK-FMA: vfmsub213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret 
< 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone ; VFNMADD define < 4 x float > @test_x86_fma_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfnmaddss - %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmaddss + ; CHECK-FMA: vfnmadd213ss + %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfnmaddsd - %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmaddsd + ; CHECK-FMA: vfnmadd213sd + %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 4 x float > @test_x86_fma_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfnmaddps - %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmaddps + ; CHECK-FMA: vfnmadd213ps + %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfnmaddpd - %res = call < 2 x double > 
@llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmaddpd + ; CHECK-FMA: vfnmadd213pd + %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 8 x float > @test_x86_fma_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfnmaddps + ; CHECK-FMA4: vfnmaddps + ; CHECK-FMA: vfnmadd213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfnmaddpd + ; CHECK-FMA4: vfnmaddpd + ; CHECK-FMA: vfnmadd213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret < 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone ; VFNMSUB define < 4 x float > @test_x86_fma_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfnmsubss - %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmsubss + ; CHECK-FMA: vfnmsub213ss + %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > 
%a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfnmsubsd - %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmsubsd + ; CHECK-FMA: vfnmsub213sd + %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 4 x float > @test_x86_fma_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfnmsubps - %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmsubps + ; CHECK-FMA: vfnmsub213ps + %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfnmsubpd - %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmsubpd + ; CHECK-FMA: vfnmsub213pd + %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 8 x float > @test_x86_fma_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfnmsubps + ; CHECK-FMA4: 
vfnmsubps + ; CHECK-FMA: vfnmsub213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfnmsubpd + ; CHECK-FMA4: vfnmsubpd + ; CHECK-FMA: vfnmsub213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret < 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone ; VFMADDSUB define < 4 x float > @test_x86_fma_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmaddsubps - %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmaddsubps + ; CHECK-FMA: vfmaddsub213ps + %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfmaddsubpd - %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmaddsubpd + ; CHECK-FMA: vfmaddsub213pd + %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 
x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 8 x float > @test_x86_fma_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfmaddsubps + ; CHECK-FMA4: vfmaddsubps + ; CHECK-FMA: vfmaddsub213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfmaddsubpd + ; CHECK-FMA4: vfmaddsubpd + ; CHECK-FMA: vfmaddsub213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret < 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone ; VFMSUBADD define < 4 x float > @test_x86_fma_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmsubaddps - %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubaddps + ; CHECK-FMA: vfmsubadd213ps + %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x 
double > @test_x86_fma_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfmsubaddpd - %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubaddpd + ; CHECK-FMA: vfmsubadd213pd + %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 8 x float > @test_x86_fma_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfmsubaddps + ; CHECK-FMA4: vfmsubaddps + ; CHECK-FMA: vfmsubadd213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfmsubaddpd + ; CHECK-FMA4: vfmsubaddpd + ; CHECK-FMA: vfmsubadd213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret < 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone diff --git a/test/CodeGen/X86/fma-phi-213-to-231.ll b/test/CodeGen/X86/fma-phi-213-to-231.ll new file mode 100644 index 0000000..9715bc7 --- /dev/null +++ b/test/CodeGen/X86/fma-phi-213-to-231.ll @@ -0,0 +1,246 @@ +; 
RUN: llc < %s -mcpu=core-avx2 | FileCheck %s +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +; CHECK-LABEL: fmaddsubpd_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmaddsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x double> %c.addr.0 +} + +; CHECK-LABEL: fmsubaddpd_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmsubadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x double> %c.addr.0 +} + +; CHECK-LABEL: fmaddpd_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; 
CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x double> %c.addr.0 +} + +; CHECK-LABEL: fmsubpd_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x double> %c.addr.0 +} + +declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>) +declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>) +declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) +declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) + + +; CHECK-LABEL: fmaddsubps_loop +; CHECK: [[BODYLBL:LBB.+]]: +; 
CHECK: vfmaddsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <8 x float> %c.addr.0 +} + +; CHECK-LABEL: fmsubaddps_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmsubadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <8 x float> %c.addr.0 +} + +; CHECK-LABEL: fmaddps_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +entry: + br label 
%for.cond + +for.cond: + %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <8 x float> %c.addr.0 +} + +; CHECK-LABEL: fmsubps_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <8 x float> %c.addr.0 +} + +declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>) +declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>) +declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) +declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll index 47252ec..2eb152b 100644 --- a/test/CodeGen/X86/fma.ll +++ b/test/CodeGen/X86/fma.ll @@ -43,8 +43,8 @@ entry: } ; Test FMA3 variant selection -; CHECK: fma3_select231ssX: -; CHECK: vfmadd231ss xmm +; CHECK-FMA-INST: fma3_select231ssX: +; CHECK-FMA-INST: vfmadd231ss %xmm 
define float @fma3_select231ssX(float %x, float %y) #0 { entry: br label %while.body @@ -58,8 +58,8 @@ while.end: ; preds = %while.body, %entry } ; Test FMA3 variant selection -; CHECK: fma3_select231pdY: -; CHECK: vfmadd231pd ymm +; CHECK-FMA-INST: fma3_select231pdY: +; CHECK-FMA-INST: vfmadd231pd %ymm define <4 x double> @fma3_select231pdY(<4 x double> %x, <4 x double> %y) #0 { entry: br label %while.body diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll new file mode 100644 index 0000000..64a2068 --- /dev/null +++ b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll @@ -0,0 +1,84 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s + +; VFMADD +define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) { + ; CHECK: vfmaddss (%{{.*}}) + %x = load float *%a2 + %y = insertelement <4 x float> undef, float %x, i32 0 + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) + ret < 4 x float > %res +} +define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) { + ; CHECK: vfmaddss %{{.*}}, (%{{.*}}) + %x = load float *%a1 + %y = insertelement <4 x float> undef, float %x, i32 0 + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) + ret < 4 x float > %res +} + +declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone + +define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) { + ; CHECK: vfmaddsd (%{{.*}}) + %x = load double *%a2 + %y = insertelement <2 x double> undef, double %x, i32 0 + %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 
2 x double > %a1, < 2 x double > %y) + ret < 2 x double > %res +} +define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) { + ; CHECK: vfmaddsd %{{.*}}, (%{{.*}}) + %x = load double *%a1 + %y = insertelement <2 x double> undef, double %x, i32 0 + %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) + ret < 2 x double > %res +} +declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone +define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) { + ; CHECK: vfmaddps (%{{.*}}) + %x = load <4 x float>* %a2 + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) + ret < 4 x float > %res +} +define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) { + ; CHECK: vfmaddps %{{.*}}, (%{{.*}}) + %x = load <4 x float>* %a1 + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) + ret < 4 x float > %res +} +declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone + +; To test execution dependency +define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) { + ; CHECK: vmovaps + ; CHECK: vfmaddps %{{.*}}, (%{{.*}}) + %x = load <4 x float>* %a0 + %y = load <4 x float>* %a1 + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2) + ret < 4 x float > %res +} + +define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) { + ; CHECK: vfmaddpd (%{{.*}}) + %x = load <2 x double>* %a2 + %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) + ret < 2 x double > %res +} 
+define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) { + ; CHECK: vfmaddpd %{{.*}}, (%{{.*}}) + %x = load <2 x double>* %a1 + %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) + ret < 2 x double > %res +} +declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone + +; To test execution dependency +define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) { + ; CHECK: vmovapd + ; CHECK: vfmaddpd %{{.*}}, (%{{.*}}) + %x = load <2 x double>* %a0 + %y = load <2 x double>* %a1 + %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2) + ret < 2 x double > %res +} + diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll index cfb598d..9b52db9 100644 --- a/test/CodeGen/X86/fma_patterns.ll +++ b/test/CodeGen/X86/fma_patterns.ll @@ -184,7 +184,7 @@ define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) { ; CHECK: test_x86_fmadd_ps_load ; CHECK: vmovaps (%rdi), %xmm2 -; CHECK: vfmadd213ps %xmm1, %xmm0, %xmm2 +; CHECK: vfmadd213ps %xmm1, %xmm2, %xmm0 ; CHECK: ret ; CHECK_FMA4: test_x86_fmadd_ps_load ; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0 @@ -198,7 +198,7 @@ define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 ; CHECK: test_x86_fmsub_ps_load ; CHECK: vmovaps (%rdi), %xmm2 -; CHECK: fmsub213ps %xmm1, %xmm0, %xmm2 +; CHECK: fmsub213ps %xmm1, %xmm2, %xmm0 ; CHECK: ret ; CHECK_FMA4: test_x86_fmsub_ps_load ; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0 diff --git a/test/CodeGen/X86/fmaxnum.ll b/test/CodeGen/X86/fmaxnum.ll new file mode 100644 index 0000000..23678c4 --- /dev/null +++ b/test/CodeGen/X86/fmaxnum.ll @@ -0,0 +1,50 @@ +; RUN: llc -march=x86 -mtriple=i386-linux-gnu < %s | FileCheck %s + +declare float 
@fmaxf(float, float) +declare double @fmax(double, double) +declare x86_fp80 @fmaxl(x86_fp80, x86_fp80) +declare float @llvm.maxnum.f32(float, float) +declare double @llvm.maxnum.f64(double, double) +declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) + +; CHECK-LABEL: @test_fmaxf +; CHECK: calll fmaxf +define float @test_fmaxf(float %x, float %y) { + %z = call float @fmaxf(float %x, float %y) readnone + ret float %z +} + +; CHECK-LABEL: @test_fmax +; CHECK: calll fmax +define double @test_fmax(double %x, double %y) { + %z = call double @fmax(double %x, double %y) readnone + ret double %z +} + +; CHECK-LABEL: @test_fmaxl +; CHECK: calll fmaxl +define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) { + %z = call x86_fp80 @fmaxl(x86_fp80 %x, x86_fp80 %y) readnone + ret x86_fp80 %z +} + +; CHECK-LABEL: @test_intrinsic_fmaxf +; CHECK: calll fmaxf +define float @test_intrinsic_fmaxf(float %x, float %y) { + %z = call float @llvm.maxnum.f32(float %x, float %y) readnone + ret float %z +} + +; CHECK-LABEL: @test_intrinsic_fmax +; CHECK: calll fmax +define double @test_intrinsic_fmax(double %x, double %y) { + %z = call double @llvm.maxnum.f64(double %x, double %y) readnone + ret double %z +} + +; CHECK-LABEL: @test_intrinsic_fmaxl +; CHECK: calll fmaxl +define x86_fp80 @test_intrinsic_fmaxl(x86_fp80 %x, x86_fp80 %y) { + %z = call x86_fp80 @llvm.maxnum.f80(x86_fp80 %x, x86_fp80 %y) readnone + ret x86_fp80 %z +} diff --git a/test/CodeGen/X86/fminnum.ll b/test/CodeGen/X86/fminnum.ll new file mode 100644 index 0000000..1e33cf4 --- /dev/null +++ b/test/CodeGen/X86/fminnum.ll @@ -0,0 +1,95 @@ +; RUN: llc -march=x86 -mtriple=i386-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s + +declare float @fminf(float, float) +declare double @fmin(double, double) +declare x86_fp80 @fminl(x86_fp80, x86_fp80) +declare float @llvm.minnum.f32(float, float) +declare double @llvm.minnum.f64(double, double) +declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) + +declare <2 x float> 
@llvm.minnum.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) + +; CHECK-LABEL: @test_fminf +; CHECK: jmp fminf +define float @test_fminf(float %x, float %y) { + %z = call float @fminf(float %x, float %y) readnone + ret float %z +} + +; CHECK-LABEL: @test_fmin +; CHECK: jmp fmin +define double @test_fmin(double %x, double %y) { + %z = call double @fmin(double %x, double %y) readnone + ret double %z +} + +; CHECK-LABEL: @test_fminl +; CHECK: calll fminl +define x86_fp80 @test_fminl(x86_fp80 %x, x86_fp80 %y) { + %z = call x86_fp80 @fminl(x86_fp80 %x, x86_fp80 %y) readnone + ret x86_fp80 %z +} + +; CHECK-LABEL: @test_intrinsic_fminf +; CHECK: jmp fminf +define float @test_intrinsic_fminf(float %x, float %y) { + %z = call float @llvm.minnum.f32(float %x, float %y) readnone + ret float %z +} + +; CHECK-LABEL: @test_intrinsic_fmin +; CHECK: jmp fmin +define double @test_intrinsic_fmin(double %x, double %y) { + %z = call double @llvm.minnum.f64(double %x, double %y) readnone + ret double %z +} + +; CHECK-LABEL: @test_intrinsic_fminl +; CHECK: calll fminl +define x86_fp80 @test_intrinsic_fminl(x86_fp80 %x, x86_fp80 %y) { + %z = call x86_fp80 @llvm.minnum.f80(x86_fp80 %x, x86_fp80 %y) readnone + ret x86_fp80 %z +} + +; CHECK-LABEL: @test_intrinsic_fmin_v2f32 +; CHECK: calll fminf +; CHECK: calll fminf +define <2 x float> @test_intrinsic_fmin_v2f32(<2 x float> %x, <2 x float> %y) { + %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %y) readnone + ret <2 x float> %z +} + +; CHECK-LABEL: @test_intrinsic_fmin_v4f32 +; CHECK: calll fminf +; CHECK: calll fminf +; CHECK: calll fminf +; CHECK: calll fminf +define <4 x float> @test_intrinsic_fmin_v4f32(<4 x float> %x, <4 x float> %y) { + %z = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) readnone + ret <4 x 
float> %z +} + +; CHECK-LABEL: @test_intrinsic_fmin_v2f64 +; CHECK: calll fmin +; CHECK: calll fmin +define <2 x double> @test_intrinsic_fmin_v2f64(<2 x double> %x, <2 x double> %y) { + %z = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y) readnone + ret <2 x double> %z +} + +; CHECK-LABEL: @test_intrinsic_fmin_v8f64 +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +define <8 x double> @test_intrinsic_fmin_v8f64(<8 x double> %x, <8 x double> %y) { + %z = call <8 x double> @llvm.minnum.v8f64(<8 x double> %x, <8 x double> %y) readnone + ret <8 x double> %z +} diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll new file mode 100644 index 0000000..7036511 --- /dev/null +++ b/test/CodeGen/X86/fmul-combines.ll @@ -0,0 +1,147 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -march=x86-64 < %s | FileCheck %s + +; CHECK-LABEL: fmul2_f32: +; CHECK: addss %xmm0, %xmm0 +define float @fmul2_f32(float %x) { + %y = fmul float %x, 2.0 + ret float %y +} + +; fmul 2.0, x -> fadd x, x for vectors. 
+ +; CHECK-LABEL: fmul2_v4f32: +; CHECK: addps %xmm0, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fmul2_v4f32(<4 x float> %x) { + %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0> + ret <4 x float> %y +} + +; CHECK-LABEL: constant_fold_fmul_v4f32: +; CHECK: movaps +; CHECK-NEXT: ret +define <4 x float> @constant_fold_fmul_v4f32(<4 x float> %x) { + %y = fmul <4 x float> <float 4.0, float 4.0, float 4.0, float 4.0>, <float 2.0, float 2.0, float 2.0, float 2.0> + ret <4 x float> %y +} + +; CHECK-LABEL: fmul0_v4f32: +; CHECK: xorps %xmm0, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fmul0_v4f32(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 0.0, float 0.0, float 0.0, float 0.0> + ret <4 x float> %y +} + +; CHECK-LABEL: fmul_c2_c4_v4f32: +; CHECK-NOT: addps +; CHECK: mulps +; CHECK-NOT: mulps +; CHECK-NEXT: ret +define <4 x float> @fmul_c2_c4_v4f32(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0> + %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0> + ret <4 x float> %z +} + +; CHECK-LABEL: fmul_c3_c4_v4f32: +; CHECK-NOT: addps +; CHECK: mulps +; CHECK-NOT: mulps +; CHECK-NEXT: ret +define <4 x float> @fmul_c3_c4_v4f32(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 3.0, float 3.0, float 3.0, float 3.0> + %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0> + ret <4 x float> %z +} + +; We should be able to pre-multiply the two constant vectors. 
+; CHECK: float 5.000000e+00 +; CHECK: float 1.200000e+01 +; CHECK: float 2.100000e+01 +; CHECK: float 3.200000e+01 +; CHECK-LABEL: fmul_v4f32_two_consts_no_splat: +; CHECK: mulps +; CHECK-NOT: mulps +; CHECK-NEXT: ret +define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0> + %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0> + ret <4 x float> %z +} + +; Same as above, but reverse operands to make sure non-canonical form is also handled. +; CHECK: float 5.000000e+00 +; CHECK: float 1.200000e+01 +; CHECK: float 2.100000e+01 +; CHECK: float 3.200000e+01 +; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_non_canonical: +; CHECK: mulps +; CHECK-NOT: mulps +; CHECK-NEXT: ret +define <4 x float> @fmul_v4f32_two_consts_no_splat_non_canonical(<4 x float> %x) #0 { + %y = fmul <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x + %z = fmul <4 x float> <float 5.0, float 6.0, float 7.0, float 8.0>, %y + ret <4 x float> %z +} + +; More than one use of a constant multiply should not inhibit the optimization. +; Instead of a chain of 2 dependent mults, this test will have 2 independent mults. 
+; CHECK: float 5.000000e+00 +; CHECK: float 1.200000e+01 +; CHECK: float 2.100000e+01 +; CHECK: float 3.200000e+01 +; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use: +; CHECK: mulps +; CHECK: mulps +; CHECK: addps +; CHECK: ret +define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0> + %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0> + %a = fadd <4 x float> %y, %z + ret <4 x float> %a +} + +; CHECK-LABEL: fmul_c2_c4_f32: +; CHECK-NOT: addss +; CHECK: mulss +; CHECK-NOT: mulss +; CHECK-NEXT: ret +define float @fmul_c2_c4_f32(float %x) #0 { + %y = fmul float %x, 2.0 + %z = fmul float %y, 4.0 + ret float %z +} + +; CHECK-LABEL: fmul_c3_c4_f32: +; CHECK-NOT: addss +; CHECK: mulss +; CHECK-NOT: mulss +; CHECK-NEXT: ret +define float @fmul_c3_c4_f32(float %x) #0 { + %y = fmul float %x, 3.0 + %z = fmul float %y, 4.0 + ret float %z +} + +; CHECK-LABEL: fmul_fneg_fneg_f32: +; CHECK: mulss %xmm1, %xmm0 +; CHECK-NEXT: retq +define float @fmul_fneg_fneg_f32(float %x, float %y) { + %x.neg = fsub float -0.0, %x + %y.neg = fsub float -0.0, %y + %mul = fmul float %x.neg, %y.neg + ret float %mul +} +; CHECK-LABEL: fmul_fneg_fneg_v4f32: +; CHECK: mulps {{%xmm1|\(%rdx\)}}, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fmul_fneg_fneg_v4f32(<4 x float> %x, <4 x float> %y) { + %x.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %x + %y.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %y + %mul = fmul <4 x float> %x.neg, %y.neg + ret <4 x float> %mul +} + +attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } diff --git a/test/CodeGen/X86/fnabs.ll b/test/CodeGen/X86/fnabs.ll new file mode 100644 index 0000000..19718d3 --- /dev/null +++ b/test/CodeGen/X86/fnabs.ll @@ -0,0 +1,77 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mcpu=corei7-avx| FileCheck %s + +; Verify that we generate a single OR instruction for a scalar, vec128, and vec256 +; FNABS(x) operation -> FNEG (FABS(x)). +; If the FABS() result isn't used, the AND instruction should be eliminated. +; PR20578: http://llvm.org/bugs/show_bug.cgi?id=20578 + +define float @scalar_no_abs(float %a) { +; CHECK-LABEL: scalar_no_abs: +; CHECK: vorps +; CHECK-NEXT: retq + %fabs = tail call float @fabsf(float %a) #1 + %fsub = fsub float -0.0, %fabs + ret float %fsub +} + +define float @scalar_uses_abs(float %a) { +; CHECK-LABEL: scalar_uses_abs: +; CHECK-DAG: vandps +; CHECK-DAG: vorps +; CHECK: vmulss +; CHECK-NEXT: retq + %fabs = tail call float @fabsf(float %a) #1 + %fsub = fsub float -0.0, %fabs + %fmul = fmul float %fsub, %fabs + ret float %fmul +} + +define <4 x float> @vector128_no_abs(<4 x float> %a) { +; CHECK-LABEL: vector128_no_abs: +; CHECK: vorps +; CHECK-NEXT: retq + %fabs = tail call <4 x float> @llvm.fabs.v4f32(< 4 x float> %a) #1 + %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs + ret <4 x float> %fsub +} + +define <4 x float> @vector128_uses_abs(<4 x float> %a) { +; CHECK-LABEL: vector128_uses_abs: +; CHECK-DAG: vandps +; CHECK-DAG: vorps +; CHECK: vmulps +; CHECK-NEXT: retq + %fabs = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #1 + %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs + %fmul = fmul <4 x float> %fsub, %fabs + ret <4 x float> %fmul +} + +define <8 x float> @vector256_no_abs(<8 x float> %a) { +; CHECK-LABEL: vector256_no_abs: +; CHECK: vorps +; CHECK-NEXT: retq + %fabs = tail call <8 x float> @llvm.fabs.v8f32(< 8 x float> %a) #1 + %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs + ret <8 x float> %fsub +} + +define <8 x float> @vector256_uses_abs(<8 x float> %a) { +; CHECK-LABEL: vector256_uses_abs: +; CHECK-DAG: vandps +; CHECK-DAG: vorps +; CHECK: vmulps 
+; CHECK-NEXT: retq + %fabs = tail call <8 x float> @llvm.fabs.v8f32(<8 x float> %a) #1 + %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs + %fmul = fmul <8 x float> %fsub, %fabs + ret <8 x float> %fmul +} + +declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p) +declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p) + +declare float @fabsf(float) + +attributes #1 = { readnone } + diff --git a/test/CodeGen/X86/fold-pcmpeqd-0.ll b/test/CodeGen/X86/fold-pcmpeqd-0.ll deleted file mode 100644 index 1d315ff..0000000 --- a/test/CodeGen/X86/fold-pcmpeqd-0.ll +++ /dev/null @@ -1,117 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=X86-64 %s -; DISABLED: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan | FileCheck --check-prefix=I386 %s - -; i386 test has been disabled when scheduler 2-addr hack is disabled. - -; This testcase shouldn't need to spill the -1 value, -; so it should just use pcmpeqd to materialize an all-ones vector. -; For i386, cp load of -1 are folded. - -; With -regalloc=greedy, the live range is split before spilling, so the first -; pcmpeq doesn't get folded as a constant pool load. 
- -; I386-NOT: pcmpeqd -; I386: orps LCPI0_2, %xmm -; I386-NOT: pcmpeqd -; I386: orps LCPI0_2, %xmm - -; X86-64: pcmpeqd -; X86-64-NOT: pcmpeqd - - %struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8*, i8*, i8*, i32, i32, i32, i32, i32 }> - %struct._cl_image_format_t = type <{ i32, i32, i32 }> - %struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }> - -define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind { -entry: - %tmp3.i = load i32* null ; <i32> [#uses=1] - %cmp = icmp sgt i32 %tmp3.i, 200 ; <i1> [#uses=1] - br i1 %cmp, label %forcond, label %ifthen - -ifthen: ; preds = %entry - ret void - -forcond: ; preds = %entry - %tmp3.i536 = load i32* null ; <i32> [#uses=1] - %cmp12 = icmp slt i32 0, %tmp3.i536 ; <i1> [#uses=1] - br i1 %cmp12, label %forbody, label %afterfor - -forbody: ; preds = %forcond - %bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1] - %mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer ; <<4 x float>> [#uses=1] - %mul257 = fmul <4 x float> %mul233, zeroinitializer ; <<4 x float>> [#uses=1] - %mul275 = fmul <4 x float> %mul257, zeroinitializer ; <<4 x float>> [#uses=1] - %tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] - %bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0] - %bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1] - %tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> zeroinitializer) nounwind ; <<4 x i32>> [#uses=1] - %tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind ; <<4 x float>> [#uses=1] - %sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70 ; <<4 x float>> [#uses=2] - %mul166.i86 = fmul <4 x float> 
zeroinitializer, %sub140.i78 ; <<4 x float>> [#uses=1] - %add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 > ; <<4 x float>> [#uses=1] - %mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78 ; <<4 x float>> [#uses=1] - %add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1] - %bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float> ; <<4 x float>> [#uses=1] - %mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer ; <<4 x float>> [#uses=1] - %bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer ; <<4 x i32>> [#uses=1] - %xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1] - %orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102 ; <<4 x i32>> [#uses=1] - %bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float> ; <<4 x float>> [#uses=1] - %cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind ; <<4 x float>> [#uses=1] - %tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] - %sub140.i = fsub <4 x float> zeroinitializer, %tmp80 ; <<4 x float>> [#uses=1] - %bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 > ; <<4 x i32>> [#uses=0] - %mul171.i = fmul <4 x float> zeroinitializer, %sub140.i ; <<4 x float>> [#uses=1] - %add172.i = fadd <4 
x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1] - %bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float> ; <<4 x float>> [#uses=1] - %mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer ; <<4 x float>> [#uses=1] - %bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0] - %bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32> ; <<4 x i32>> [#uses=1] - %xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1] - %orps203.i = or <4 x i32> %andnps192.i, %xorps.i ; <<4 x i32>> [#uses=1] - %bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float> ; <<4 x float>> [#uses=1] - %mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer ; <<4 x float>> [#uses=1] - %mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer ; <<4 x float>> [#uses=2] - %mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer ; <<4 x float>> [#uses=1] - %tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] - %bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer ; <<4 x i32>> [#uses=1] - %orps.i18 = or <4 x i32> %andnps.i17, zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float> ; <<4 x float>> [#uses=1] - %tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] - %bitcast.i3 = 
bitcast <4 x float> %mul310 to <4 x i32> ; <<4 x i32>> [#uses=1] - %bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=2] - %andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4 ; <<4 x i32>> [#uses=1] - %bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32> ; <<4 x i32>> [#uses=1] - %not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1] - %andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7 ; <<4 x i32>> [#uses=1] - %orps.i9 = or <4 x i32> %andnps.i8, %andps.i5 ; <<4 x i32>> [#uses=1] - %bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float> ; <<4 x float>> [#uses=1] - %bitcast.i = bitcast <4 x float> %mul313 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andps.i = and <4 x i32> %bitcast.i, zeroinitializer ; <<4 x i32>> [#uses=1] - %orps.i = or <4 x i32> zeroinitializer, %andps.i ; <<4 x i32>> [#uses=1] - %bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float> ; <<4 x float>> [#uses=1] - call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind - unreachable - -afterfor: ; preds = %forcond - ret void -} - -declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone - -declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone - -declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone - -declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone - -declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll new file mode 100644 index 0000000..a643d86 --- /dev/null +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -0,0 +1,84 @@ +; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s
+; Regression test for http://reviews.llvm.org/D5701
+
+; ModuleID = 'xxhash.i'
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386--netbsd"
+
+; CHECK-LABEL: fn1
+; CHECK: shldl {{.*#+}} 4-byte Folded Spill
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: shldl {{.*#+}} 4-byte Folded Spill
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: addl {{.*#+}} 4-byte Folded Reload
+; CHECK: imull {{.*#+}} 4-byte Folded Reload
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: retl
+
+%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } ; two i32 fields followed by three i64 fields; fn1 reads fields 0, 2, 3 and 4
+
+@a = common global i32 0, align 4 ; fn1 loads this i32 and converts it to a %struct.XXH_state64_t* with inttoptr
+@b = common global i64 0, align 8 ; 64-bit global that fn1 loads in if.else and stores to in if.then and if.end
+
+; fn1: i64 shift/or/multiply/xor mixing of XXH_state64_t fields; the result is stored to @b. Function Attrs: nounwind uwtable
+define i64 @fn1() #0 {
+entry: ; load the state pointer from @a and branch on its field 0
+ %0 = load i32* @a, align 4, !tbaa !1 ; read the 32-bit value held in @a
+ %1 = inttoptr i32 %0 to %struct.XXH_state64_t* ; reinterpret that integer as a pointer to the state struct
+ %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0 ; address of field 0
+ %2 = load i32* %total_len, align 4, !tbaa !5
+ %tobool = icmp eq i32 %2, 0
+ br i1 %tobool, label %if.else, label %if.then ; field 0 == 0 goes to if.else, anything else to if.then
+
+if.then: ; preds = %entry; reached when field 0 is non-zero
+ %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3 ; address of field 3
+ %3 = load i64* %v3, align 4, !tbaa !8
+ %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4 ; address of field 4
+ %4 = load i64* %v4, align 4, !tbaa !9
+ %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2 ; address of field 2
+ %5 = load i64* %v2, align 4, !tbaa !10
+ %shl = shl i64 %5, 1
+ %or = or i64 %shl, %5 ; (field2 << 1) | field2
+ %shl2 = shl i64 %3, 2
+ %shr = lshr i64 %3, 1 ; logical (zero-filling) right shift
+ %or3 = or i64 %shl2, %shr ; (field3 << 2) | (field3 >> 1)
+ %add = add i64 %or, %or3
+ %mul = mul i64 %4, -4417276706812531889
+ %shl4 = mul i64 %4, -8834553413625063778 ; this constant is exactly twice the %mul constant
+ %shr5 = ashr i64 %mul, 3 ; arithmetic (sign-extending) right shift
+ %or6 = or i64 %shr5, %shl4
+ %mul7 = mul nsw i64 %or6, 1400714785074694791
+ %xor = xor i64 %add, %mul7
+ store i64 %xor, i64* @b, align 8, !tbaa !11 ; first store to @b on this path; if.end stores again
+ %mul8 = mul nsw i64 %xor, 1400714785074694791 ; value passed to the phi in if.end
+ br label %if.end
+
+if.else: ; preds = %entry; reached when field 0 is zero
+ %6 = load i64* @b, align 8, !tbaa !11
+ %xor10 = xor i64 %6, -4417276706812531889 ; same constant as the %mul multiplier in if.then
+ %mul11 = mul nsw i64 %xor10, 400714785074694791 ; NOTE(review): if.then multiplies by 1400714785074694791 (extra leading 1); left as-is because this is test input
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ] ; select the product computed on the path taken
+ %storemerge = add i64 %storemerge.in, -8796714831421723037
+ store i64 %storemerge, i64* @b, align 8, !tbaa !11 ; final value written to @b
+ ret i64 undef ; the return value is undef, so only the stores to @b are observable
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; attribute group referenced by fn1
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6 (trunk 219587)"} ; producer string named by !llvm.ident
+!1 = metadata !{metadata !2, metadata !2, i64 0} ; TBAA tag on the load of @a in fn1: base "int", access "int", offset 0
+!2 = metadata !{metadata !"int", metadata !3, i64 0} ; "int" type node whose parent is !3
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0} ; "omnipotent char" type node whose parent is the root !4
+!4 = metadata !{metadata !"Simple C/C++ TBAA"} ; root node of this TBAA tree
+!5 = metadata !{metadata !6, metadata !2, i64 0} ; TBAA tag on the load of field 0 in fn1: base !6, access "int", offset 0
+!6 = metadata !{metadata !"XXH_state64_t", metadata !2, i64 0, metadata !2, i64 4, metadata !7, i64 8, metadata !7, i64 16, metadata !7, i64 24} ; struct node: "int" at offsets 0 and 4, "long long" at offsets 8, 16 and 24
+!7 = metadata !{metadata !"long long", metadata !3, i64 0} ; "long long" type node whose parent is !3
+!8 = metadata !{metadata !6, metadata !7, i64 16} ; TBAA tag on the load through %v3 (field 3) in fn1: offset 16
+!9 = metadata !{metadata !6, metadata !7, i64 24} ; TBAA tag on the load through %v4 (field 4) in fn1: offset 24
+!10 = metadata !{metadata !6, metadata !7, i64 8} ; TBAA tag on the load through %v2 (field 2) in fn1: offset 8
+!11 = metadata !{metadata !7, metadata !7, i64 0} ; TBAA tag on every load and store of @b in fn1: scalar "long long"
diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll index a973bef..e6c1e1a 100644 --- a/test/CodeGen/X86/fp-load-trunc.ll +++ b/test/CodeGen/X86/fp-load-trunc.ll @@ -2,57 +2,87 @@ ; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX define <1 x float> @test1(<1 x double>* %p) nounwind { -; CHECK: test1 -; CHECK: cvtsd2ss -; CHECK: ret -; AVX: test1 -; AVX: vcvtsd2ss -; AVX: ret +; CHECK-LABEL: test1: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movsd (%eax), %xmm0 +; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0 +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl +; +; AVX-LABEL: test1: +; AVX: # BB#0: +; AVX-NEXT: pushl %eax +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-NEXT: vmovsd (%eax), %xmm0 +; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovss %xmm0, (%esp) +; AVX-NEXT: flds (%esp) +; AVX-NEXT: popl %eax +; AVX-NEXT: retl %x = load <1 x double>* %p %y = fptrunc <1 x double> %x to <1 x float> ret <1 x float> %y } define <2 x float> @test2(<2 x double>* %p) nounwind { -; CHECK: test2 -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: ret -; AVX: test2 -; AVX: vcvtpd2psx {{[0-9]*}}(%{{.*}}) -; AVX: ret +; CHECK-LABEL: test2: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: cvtpd2ps (%eax), %xmm0 +; CHECK-NEXT: retl +; +; AVX-LABEL: test2: +; AVX: # BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-NEXT: vcvtpd2psx (%eax), %xmm0 +; AVX-NEXT: retl %x = load <2 x double>* %p %y = fptrunc <2 x double> %x to <2 x float> ret <2 x float> %y } define <4 x float> @test3(<4 x double>* %p) nounwind { -; CHECK: test3 -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: movlhps -; CHECK: ret -; AVX: test3 -; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}}) -; AVX: ret +; CHECK-LABEL: test3: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
CHECK-NEXT: cvtpd2ps 16(%eax), %xmm1 +; CHECK-NEXT: cvtpd2ps (%eax), %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retl +; +; AVX-LABEL: test3: +; AVX: # BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-NEXT: vcvtpd2psy (%eax), %xmm0 +; AVX-NEXT: retl %x = load <4 x double>* %p %y = fptrunc <4 x double> %x to <4 x float> ret <4 x float> %y } define <8 x float> @test4(<8 x double>* %p) nounwind { -; CHECK: test4 -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: movlhps -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: movlhps -; CHECK: ret -; AVX: test4 -; AVX: vcvtpd2psy -; AVX: vcvtpd2psy -; AVX: vinsertf128 -; AVX: ret +; CHECK-LABEL: test4: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: cvtpd2ps 16(%eax), %xmm1 +; CHECK-NEXT: cvtpd2ps (%eax), %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: cvtpd2ps 48(%eax), %xmm2 +; CHECK-NEXT: cvtpd2ps 32(%eax), %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: retl +; +; AVX-LABEL: test4: +; AVX: # BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-NEXT: vcvtpd2psy (%eax), %xmm0 +; AVX-NEXT: vcvtpd2psy 32(%eax), %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retl %x = load <8 x double>* %p %y = fptrunc <8 x double> %x to <8 x float> ret <8 x float> %y diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll index 25442fc..6424bfc 100644 --- a/test/CodeGen/X86/fp-trunc.ll +++ b/test/CodeGen/X86/fp-trunc.ll @@ -2,55 +2,77 @@ ; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX define <1 x float> @test1(<1 x double> %x) nounwind { -; CHECK: test1 -; CHECK: cvtsd2ss -; CHECK: ret -; AVX: test1 -; AVX: vcvtsd2ss -; AVX: ret +; CHECK-LABEL: test1: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movsd {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0 +; 
CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl +; +; AVX-LABEL: test1: +; AVX: # BB#0: +; AVX-NEXT: pushl %eax +; AVX-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 +; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovss %xmm0, (%esp) +; AVX-NEXT: flds (%esp) +; AVX-NEXT: popl %eax +; AVX-NEXT: retl %y = fptrunc <1 x double> %x to <1 x float> ret <1 x float> %y } define <2 x float> @test2(<2 x double> %x) nounwind { -; CHECK: test2 -; CHECK: cvtpd2ps -; CHECK: ret -; AVX: test2 -; AVX-NOT: vcvtpd2psy -; AVX: vcvtpd2ps -; AVX: ret +; CHECK-LABEL: test2: +; CHECK: # BB#0: +; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 +; CHECK-NEXT: retl +; +; AVX-LABEL: test2: +; AVX: # BB#0: +; AVX-NEXT: vcvtpd2ps %xmm0, %xmm0 +; AVX-NEXT: retl %y = fptrunc <2 x double> %x to <2 x float> ret <2 x float> %y } define <4 x float> @test3(<4 x double> %x) nounwind { -; CHECK: test3 -; CHECK: cvtpd2ps -; CHECK: cvtpd2ps -; CHECK: movlhps -; CHECK: ret -; AVX: test3 -; AVX: vcvtpd2psy -; AVX: ret +; CHECK-LABEL: test3: +; CHECK: # BB#0: +; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 +; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retl +; +; AVX-LABEL: test3: +; AVX: # BB#0: +; AVX-NEXT: vcvtpd2psy %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retl %y = fptrunc <4 x double> %x to <4 x float> ret <4 x float> %y } define <8 x float> @test4(<8 x double> %x) nounwind { -; CHECK: test4 -; CHECK: cvtpd2ps -; CHECK: cvtpd2ps -; CHECK: movlhps -; CHECK: cvtpd2ps -; CHECK: cvtpd2ps -; CHECK: movlhps -; CHECK: ret -; AVX: test4 -; AVX: vcvtpd2psy -; AVX: vcvtpd2psy -; AVX: vinsertf128 -; AVX: ret +; CHECK-LABEL: test4: +; CHECK: # BB#0: +; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 +; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: cvtpd2ps %xmm3, %xmm3 +; CHECK-NEXT: cvtpd2ps %xmm2, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; 
CHECK-NEXT: retl +; +; AVX-LABEL: test4: +; AVX: # BB#0: +; AVX-NEXT: vcvtpd2psy %ymm0, %xmm0 +; AVX-NEXT: vcvtpd2psy %ymm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retl %y = fptrunc <8 x double> %x to <8 x float> ret <8 x float> %y } diff --git a/test/CodeGen/X86/fpstack-debuginstr-kill.ll b/test/CodeGen/X86/fpstack-debuginstr-kill.ll new file mode 100644 index 0000000..dfc59a3 --- /dev/null +++ b/test/CodeGen/X86/fpstack-debuginstr-kill.ll @@ -0,0 +1,71 @@ +; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin -no-integrated-as + +@g1 = global double 0.000000e+00, align 8 +@g2 = global i32 0, align 4 + +define void @_Z16fpuop_arithmeticjj(i32, i32) { +entry: + switch i32 undef, label %sw.bb.i1921 [ + ] + +sw.bb261: ; preds = %entry, %entry + unreachable + +sw.bb.i1921: ; preds = %if.end504 + switch i32 undef, label %if.end511 [ + i32 1, label %sw.bb27.i + ] + +sw.bb27.i: ; preds = %sw.bb.i1921 + %conv.i.i1923 = fpext float undef to x86_fp80 + br label %if.end511 + +if.end511: ; preds = %sw.bb27.i, %sw.bb13.i + %src.sroa.0.0.src.sroa.0.0.2280 = phi x86_fp80 [ %conv.i.i1923, %sw.bb27.i ], [ undef, %sw.bb.i1921 ] + switch i32 undef, label %sw.bb992 [ + i32 3, label %sw.bb735 + i32 18, label %if.end41.i2210 + ] + +sw.bb735: ; preds = %if.end511 + %2 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280) + unreachable + +if.end41.i2210: ; preds = %if.end511 + call void @llvm.dbg.value(metadata !{x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280}, i64 0, metadata !20, metadata !{metadata !"0x102"}) + unreachable + +sw.bb992: ; preds = %if.end511 + ret void +} + +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!24, !25} +!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 
886f0101a7d176543b831f5efb74c03427244a55)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !21, metadata !2} ; [ DW_TAG_compile_unit ] [x87stackifier/fpu_ieee.cpp] [DW_LANG_C_plus_plus] +!1 = metadata !{metadata !"fpu_ieee.cpp", metadata !"x87stackifier"} +!2 = metadata !{} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !"0x2e\00fpuop_arithmetic\00fpuop_arithmetic\00_Z16fpuop_arithmeticjj\0011\000\001\000\006\00256\001\0013", metadata !5, metadata !6, metadata !7, null, void (i32, i32)* @_Z16fpuop_arithmeticjj, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 11] [def] [scope 13] [fpuop_arithmetic] +!5 = metadata !{metadata !"f1.cpp", metadata !"x87stackifier"} +!6 = metadata !{metadata !"0x29", metadata !5} ; [ DW_TAG_file_type ] [x87stackifier/f1.cpp] +!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{null, metadata !9, metadata !9} +!9 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned] +!10 = metadata !{metadata !11, metadata !12, metadata !13, metadata !18, metadata !20} +!11 = metadata !{metadata !"0x101\00\0016777227\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 11] +!12 = metadata !{metadata !"0x101\00\0033554443\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 11] +!13 = metadata !{metadata !"0x100\00x\0014\000", metadata !4, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [x] [line 14] +!14 = metadata !{metadata !"0x16\00fpu_extended\003\000\000\000\000", metadata !5, null, metadata !15} ; [ DW_TAG_typedef ] [fpu_extended] [line 3, size 0, align 0, offset 0] [from fpu_register] +!15 = metadata !{metadata !"0x16\00fpu_register\002\000\000\000\000", metadata !5, null, 
metadata !16} ; [ DW_TAG_typedef ] [fpu_register] [line 2, size 0, align 0, offset 0] [from uae_f64] +!16 = metadata !{metadata !"0x16\00uae_f64\001\000\000\000\000", metadata !5, null, metadata !17} ; [ DW_TAG_typedef ] [uae_f64] [line 1, size 0, align 0, offset 0] [from double] +!17 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float] +!18 = metadata !{metadata !"0x100\00a\0015\000", metadata !4, metadata !6, metadata !19} ; [ DW_TAG_auto_variable ] [a] [line 15] +!19 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!20 = metadata !{metadata !"0x100\00value\0016\000", metadata !4, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [value] [line 16] +!21 = metadata !{metadata !22, metadata !23} +!22 = metadata !{metadata !"0x34\00g1\00g1\00\005\000\001", null, metadata !6, metadata !14, double* @g1, null} ; [ DW_TAG_variable ] [g1] [line 5] [def] +!23 = metadata !{metadata !"0x34\00g2\00g2\00\006\000\001", null, metadata !6, metadata !19, i32* @g2, null} ; [ DW_TAG_variable ] [g2] [line 6] [def] +!24 = metadata !{i32 2, metadata !"Dwarf Version", i32 2} +!25 = metadata !{i32 2, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/frameaddr.ll b/test/CodeGen/X86/frameaddr.ll index 6c1ca25..452c8e5 100644 --- a/test/CodeGen/X86/frameaddr.ll +++ b/test/CodeGen/X86/frameaddr.ll @@ -2,6 +2,8 @@ ; RUN: llc < %s -march=x86 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-32 ; RUN: llc < %s -march=x86-64 | FileCheck %s --check-prefix=CHECK-64 ; RUN: llc < %s -march=x86-64 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-64 +; RUN: llc < %s -mtriple=x86_64-gnux32 | FileCheck %s --check-prefix=CHECK-X32ABI +; RUN: llc < %s -mtriple=x86_64-gnux32 -fast-isel -fast-isel-abort | FileCheck %s 
--check-prefix=CHECK-X32ABI define i8* @test1() nounwind { entry: @@ -17,6 +19,12 @@ entry: ; CHECK-64-NEXT: movq %rbp, %rax ; CHECK-64-NEXT: pop ; CHECK-64-NEXT: ret +; CHECK-X32ABI-LABEL: test1 +; CHECK-X32ABI: pushq %rbp +; CHECK-X32ABI-NEXT: movl %esp, %ebp +; CHECK-X32ABI-NEXT: movl %ebp, %eax +; CHECK-X32ABI-NEXT: popq %rbp +; CHECK-X32ABI-NEXT: ret %0 = tail call i8* @llvm.frameaddress(i32 0) ret i8* %0 } @@ -37,6 +45,13 @@ entry: ; CHECK-64-NEXT: movq (%rax), %rax ; CHECK-64-NEXT: pop ; CHECK-64-NEXT: ret +; CHECK-X32ABI-LABEL: test2 +; CHECK-X32ABI: pushq %rbp +; CHECK-X32ABI-NEXT: movl %esp, %ebp +; CHECK-X32ABI-NEXT: movl (%ebp), %eax +; CHECK-X32ABI-NEXT: movl (%eax), %eax +; CHECK-X32ABI-NEXT: popq %rbp +; CHECK-X32ABI-NEXT: ret %0 = tail call i8* @llvm.frameaddress(i32 2) ret i8* %0 } diff --git a/test/CodeGen/X86/gcc_except_table_functions.ll b/test/CodeGen/X86/gcc_except_table_functions.ll new file mode 100644 index 0000000..4a81680 --- /dev/null +++ b/test/CodeGen/X86/gcc_except_table_functions.ll @@ -0,0 +1,53 @@ +; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s + +; This test demonstrates that it is possible to use functions for typeinfo +; instead of global variables. While __gxx_personality_v0 would never know what +; to do with them, other EH schemes such as SEH might use them. + +declare i32 @__gxx_personality_v0(...) 
+declare void @filt0() +declare void @filt1() +declare void @_Z1fv() +declare i32 @llvm.eh.typeid.for(i8*) + +define i32 @main() uwtable { +entry: + invoke void @_Z1fv() + to label %try.cont unwind label %lpad + +try.cont: + ret i32 0 + +lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* bitcast (void ()* @filt0 to i8*) + catch i8* bitcast (void ()* @filt1 to i8*) + %sel = extractvalue { i8*, i32 } %0, 1 + %id0 = call i32 @llvm.eh.typeid.for(i8* bitcast (void ()* @filt0 to i8*)) + %is_f0 = icmp eq i32 %sel, %id0 + br i1 %is_f0, label %try.cont, label %check_f1 + +check_f1: + %id1 = call i32 @llvm.eh.typeid.for(i8* bitcast (void ()* @filt1 to i8*)) + %is_f1 = icmp eq i32 %sel, %id1 + br i1 %is_f1, label %try.cont, label %eh.resume + +eh.resume: + resume { i8*, i32 } %0 +} + +; CHECK-LABEL: main: +; CHECK: .cfi_startproc +; CHECK: .cfi_personality 3, __gxx_personality_v0 +; CHECK: .cfi_lsda 3, .Lexception0 +; CHECK: .cfi_def_cfa_offset 16 +; CHECK: callq _Z1fv +; CHECK: retq +; CHECK: cmpl $2, %edx +; CHECK: je +; CHECK: cmpl $1, %edx +; CHECK: je +; CHECK: callq _Unwind_Resume +; CHECK: .cfi_endproc +; CHECK: GCC_except_table0: +; CHECK: Lexception0: diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll index c763f39..fa1169d 100644 --- a/test/CodeGen/X86/global-sections.ll +++ b/test/CodeGen/X86/global-sections.ll @@ -53,21 +53,20 @@ define void @F1() { ; _Complex long long const G4 = 34; -@G4 = unnamed_addr constant {i64,i64} { i64 34, i64 0 } +@G4 = private unnamed_addr constant {i64,i64} { i64 34, i64 0 } ; DARWIN: .section __TEXT,__literal16,16byte_literals -; DARWIN: _G4: +; DARWIN: L_G4: ; DARWIN: .long 34 ; DARWIN-STATIC: .section __TEXT,__literal16,16byte_literals -; DARWIN-STATIC: _G4: +; DARWIN-STATIC: L_G4: ; DARWIN-STATIC: .long 34 ; DARWIN64: .section __TEXT,__literal16,16byte_literals -; DARWIN64: _G4: +; DARWIN64: L_G4: ; DARWIN64: .quad 34 - ; int G5 = 
47; @G5 = global i32 47 @@ -194,3 +193,23 @@ define void @F1() { ; WIN32-SECTIONS: L_G14: ; WIN32-SECTIONS: .asciz "foo" +; cannot be merged on MachO, but can on other formats. +@G15 = unnamed_addr constant i64 0 + +; LINUX: .section .rodata.cst8,"aM",@progbits,8 +; LINUX: G15: + +; DARWIN: .section __TEXT,__const +; DARWIN: _G15: + +; DARWIN-STATIC: .section __TEXT,__const +; DARWIN-STATIC: _G15: + +; DARWIN64: .section __TEXT,__const +; DARWIN64: _G15: + +; LINUX-SECTIONS: .section .rodata.G15,"aM",@progbits,8 +; LINUX-SECTIONS: G15: + +; WIN32-SECTIONS: .section .rdata,"rd",one_only,_G15 +; WIN32-SECTIONS: _G15: diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll new file mode 100644 index 0000000..1dcf939 --- /dev/null +++ b/test/CodeGen/X86/half.ll @@ -0,0 +1,69 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C + +define void @test_load_store(half* %in, half* %out) { +; CHECK-LABEL: test_load_store: +; CHECK: movw (%rdi), [[TMP:%[a-z0-9]+]] +; CHECK: movw [[TMP]], (%rsi) + %val = load half* %in + store half %val, half* %out + ret void +} + +define i16 @test_bitcast_from_half(half* %addr) { +; CHECK-LABEL: test_bitcast_from_half: +; CHECK: movzwl (%rdi), %eax + %val = load half* %addr + %val_int = bitcast half %val to i16 + ret i16 %val_int +} + +define void @test_bitcast_to_half(half* %addr, i16 %in) { +; CHECK-LABEL: test_bitcast_to_half: +; CHECK: movw %si, (%rdi) + %val_fp = bitcast i16 %in to half + store half %val_fp, half* %addr + ret void +} + +define float @test_extend32(half* %addr) { +; CHECK-LABEL: test_extend32: + +; CHECK-LIBCALL: jmp __gnu_h2f_ieee +; CHECK-FP16: vcvtph2ps + %val16 = load half* %addr + %val32 = fpext half %val16 to float + ret float %val32 +} + +define double 
@test_extend64(half* %addr) { +; CHECK-LABEL: test_extend64: + +; CHECK-LIBCALL: callq __gnu_h2f_ieee +; CHECK-LIBCALL: cvtss2sd +; CHECK-FP16: vcvtph2ps +; CHECK-FP16: vcvtss2sd + %val16 = load half* %addr + %val32 = fpext half %val16 to double + ret double %val32 +} + +define void @test_trunc32(float %in, half* %addr) { +; CHECK-LABEL: test_trunc32: + +; CHECK-LIBCALL: callq __gnu_f2h_ieee +; CHECK-FP16: vcvtps2ph + %val16 = fptrunc float %in to half + store half %val16, half* %addr + ret void +} + +define void @test_trunc64(double %in, half* %addr) { +; CHECK-LABEL: test_trunc64: + +; CHECK-LIBCALL: callq __truncdfhf2 +; CHECK-FP16: callq __truncdfhf2 + %val16 = fptrunc double %in to half + store half %val16, half* %addr + ret void +} diff --git a/test/CodeGen/X86/i8-umulo.ll b/test/CodeGen/X86/i8-umulo.ll deleted file mode 100644 index ba846f3..0000000 --- a/test/CodeGen/X86/i8-umulo.ll +++ /dev/null @@ -1,24 +0,0 @@ -; RUN: llc -mcpu=generic -march=x86 < %s | FileCheck %s -; PR19858 - -declare {i8, i1} @llvm.umul.with.overflow.i8(i8 %a, i8 %b) -define i8 @testumulo(i32 %argc) { -; CHECK: imulw -; CHECK: testb %{{.+}}, %{{.+}} -; CHECK: je [[NOOVERFLOWLABEL:.+]] -; CHECK: {{.*}}[[NOOVERFLOWLABEL]]: -; CHECK-NEXT: movb -; CHECK-NEXT: retl -top: - %RHS = trunc i32 %argc to i8 - %umul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 25, i8 %RHS) - %ex = extractvalue { i8, i1 } %umul, 1 - br i1 %ex, label %overflow, label %nooverlow - -overflow: - ret i8 %RHS - -nooverlow: - %umul.value = extractvalue { i8, i1 } %umul, 0 - ret i8 %umul.value -} diff --git a/test/CodeGen/X86/inalloca-regparm.ll b/test/CodeGen/X86/inalloca-regparm.ll new file mode 100644 index 0000000..9dd916b --- /dev/null +++ b/test/CodeGen/X86/inalloca-regparm.ll @@ -0,0 +1,15 @@ +; RUN: llc -mtriple=i686-windows-msvc < %s -o /dev/null +; RUN: not llc -mtriple=x86_64-windows-msvc %s -o /dev/null 2>&1 | FileCheck %s + +; This will compile successfully on x86 but not x86_64, because %b will become a 
+; register parameter. + +declare x86_thiscallcc i32 @f(i32 %a, i32* inalloca %b) +define void @g() { + %b = alloca inalloca i32 + store i32 2, i32* %b + call x86_thiscallcc i32 @f(i32 0, i32* inalloca %b) + ret void +} + +; CHECK: cannot use inalloca attribute on a register parameter diff --git a/test/CodeGen/X86/inline-asm-fpstack.ll b/test/CodeGen/X86/inline-asm-fpstack.ll index 91c477b..bb3778a 100644 --- a/test/CodeGen/X86/inline-asm-fpstack.ll +++ b/test/CodeGen/X86/inline-asm-fpstack.ll @@ -340,3 +340,65 @@ entry: %0 = tail call i32 asm "fcomi $2, $1; pushf; pop $0", "=r,{st},{st(1)},~{dirflag},~{fpsr},~{flags}"(double 2.000000e+00, double 2.000000e+00) nounwind ret i32 %0 } + +; <rdar://problem/16952634> +; X87 stackifier asserted when there was an ST register defined by an +; inline-asm instruction and the ST register was live across another +; inline-asm instruction. +; +; INLINEASM <es:frndint> [sideeffect] [attdialect], $0:[regdef], %ST0<imp-def,tied5>, $1:[reguse tiedto:$0], %ST0<tied3>, $2:[clobber], %EFLAGS<earlyclobber,imp-def,dead> +; INLINEASM <es:fldcw $0> [sideeffect] [mayload] [attdialect], $0:[mem], %EAX<undef>, 1, %noreg, 0, %noreg, $1:[clobber], %EFLAGS<earlyclobber,imp-def,dead> +; %FP0<def> = COPY %ST0 + +; CHECK-LABEL: _test_live_st +; CHECK: ## InlineAsm Start +; CHECK: frndint +; CHECK: ## InlineAsm End +; CHECK: ## InlineAsm Start +; CHECK: fldcw +; CHECK: ## InlineAsm End + +%struct.fpu_t = type { [8 x x86_fp80], x86_fp80, %struct.anon1, %struct.anon2, i32, i8, [15 x i8] } +%struct.anon1 = type { i32, i32, i32 } +%struct.anon2 = type { i32, i32, i32, i32 } + +@fpu = external global %struct.fpu_t, align 16 + +; Function Attrs: ssp +define void @test_live_st(i32 %a1) { +entry: + %0 = load x86_fp80* undef, align 16 + %cond = icmp eq i32 %a1, 1 + br i1 %cond, label %sw.bb4.i, label %_Z5tointRKe.exit + +sw.bb4.i: + %1 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %0) + call void asm sideeffect 
"fldcw $0", "*m,~{dirflag},~{fpsr},~{flags}"(i32* undef) + br label %_Z5tointRKe.exit + +_Z5tointRKe.exit: + %result.0.i = phi x86_fp80 [ %1, %sw.bb4.i ], [ %0, %entry ] + %conv.i1814 = fptosi x86_fp80 %result.0.i to i32 + %conv626 = sitofp i32 %conv.i1814 to x86_fp80 + store x86_fp80 %conv626, x86_fp80* getelementptr inbounds (%struct.fpu_t* @fpu, i32 0, i32 1) + br label %return + +return: + ret void +} + +; Check that x87 stackifier is correctly rewriting FP registers to ST registers. +; +; CHECK-LABEL: _test_operand_rewrite +; CHECK: ## InlineAsm Start +; CHECK: foo %st(0), %st(1) +; CHECK: ## InlineAsm End + +define double @test_operand_rewrite() { +entry: + %0 = tail call { double, double } asm sideeffect "foo $0, $1", "={st},={st(1)},~{dirflag},~{fpsr},~{flags}"() + %asmresult = extractvalue { double, double } %0, 0 + %asmresult1 = extractvalue { double, double } %0, 1 + %sub = fsub double %asmresult, %asmresult1 + ret double %sub +} diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll index d417453..dfa8aed 100644 --- a/test/CodeGen/X86/jump_sign.ll +++ b/test/CodeGen/X86/jump_sign.ll @@ -284,7 +284,7 @@ entry: define i32 @func_test1(i32 %p1) nounwind uwtable { entry: ; CHECK-LABEL: func_test1: -; CHECK: testb +; CHECK: andb ; CHECK: j ; CHECK: ret %0 = load i32* @b, align 4 diff --git a/test/CodeGen/X86/jump_table_alias.ll b/test/CodeGen/X86/jump_table_alias.ll index f3691fd..2062200 100644 --- a/test/CodeGen/X86/jump_table_alias.ll +++ b/test/CodeGen/X86/jump_table_alias.ll @@ -5,7 +5,7 @@ entry: ret i32 0 } -@i = alias internal i32 ()* @f +@i = internal alias i32 ()* @f @j = alias i32 ()* @f define i32 @main(i32 %argc, i8** %argv) { @@ -25,7 +25,6 @@ define i32 @main(i32 %argc, i8** %argv) { ; There should only be one table, even though there are two GlobalAliases, ; because they both alias the same value. 
-; CHECK: .globl __llvm_jump_instr_table_0_1 ; CHECK: .align 8, 0x90 ; CHECK: .type __llvm_jump_instr_table_0_1,@function ; CHECK: __llvm_jump_instr_table_0_1: diff --git a/test/CodeGen/X86/jump_table_align.ll b/test/CodeGen/X86/jump_table_align.ll new file mode 100644 index 0000000..6ad48d1 --- /dev/null +++ b/test/CodeGen/X86/jump_table_align.ll @@ -0,0 +1,29 @@ +; RUN: llc -filetype=obj <%s -jump-table-type=single -o %t1 +; RUN: llvm-objdump -triple=x86_64-unknown-linux-gnu -d %t1 | FileCheck %s +target triple = "x86_64-unknown-linux-gnu" +define i32 @f() unnamed_addr jumptable { + ret i32 0 +} + +define i32 @g(i8* %a) unnamed_addr jumptable { + ret i32 0 +} + +define void @h(void ()* %func) unnamed_addr jumptable { + ret void +} + +define i32 @main() { + %g = alloca i32 (...)*, align 8 + store i32 (...)* bitcast (i32 ()* @f to i32 (...)*), i32 (...)** %g, align 8 + %1 = load i32 (...)** %g, align 8 + %call = call i32 (...)* %1() + call void (void ()*)* @h(void ()* bitcast (void (void ()*)* @h to void ()*)) + %a = call i32 (i32*)* bitcast (i32 (i8*)* @g to i32(i32*)*)(i32* null) + ret i32 %a +} + +; Make sure that the padding from getJumpInstrTableEntryBound is right. 
+; CHECK: __llvm_jump_instr_table_0_1: +; CHECK-NEXT: e9 00 00 00 00 jmp 0 +; CHECK-NEXT: 0f 1f 00 nopl (%rax) diff --git a/test/CodeGen/X86/jump_table_bitcast.ll b/test/CodeGen/X86/jump_table_bitcast.ll index 33a798f..749b77a 100644 --- a/test/CodeGen/X86/jump_table_bitcast.ll +++ b/test/CodeGen/X86/jump_table_bitcast.ll @@ -15,12 +15,12 @@ define void @h(void ()* %func) unnamed_addr jumptable { define i32 @main() { %g = alloca i32 (...)*, align 8 store i32 (...)* bitcast (i32 ()* @f to i32 (...)*), i32 (...)** %g, align 8 -; CHECK: movq $__llvm_jump_instr_table_0_[[ENTRY:1|2|3]], (%rsp) -; CHECK: movl $__llvm_jump_instr_table_0_[[ENTRY]], %ecx +; CHECK: movq $__llvm_jump_instr_table_0_[[ENTRY:1|2|3]], +; CHECK: movl $__llvm_jump_instr_table_0_[[ENTRY]], %1 = load i32 (...)** %g, align 8 %call = call i32 (...)* %1() call void (void ()*)* @h(void ()* bitcast (void (void ()*)* @h to void ()*)) -; CHECK: movl $__llvm_jump_instr_table_0_{{1|2|3}}, %edi +; CHECK: movl $__llvm_jump_instr_table_0_{{1|2|3}}, ; CHECK: callq h %a = call i32 (i32*)* bitcast (i32 (i8*)* @g to i32(i32*)*)(i32* null) @@ -28,17 +28,14 @@ define i32 @main() { ret i32 %a } -; CHECK: .globl __llvm_jump_instr_table_0_1 ; CHECK: .align 8, 0x90 ; CHECK: .type __llvm_jump_instr_table_0_1,@function ; CHECK: __llvm_jump_instr_table_0_1: ; CHECK: jmp {{f|g|h}}@PLT -; CHECK: .globl __llvm_jump_instr_table_0_2 ; CHECK: .align 8, 0x90 ; CHECK: .type __llvm_jump_instr_table_0_2,@function ; CHECK: __llvm_jump_instr_table_0_2: ; CHECK: jmp {{f|g|h}}@PLT -; CHECK: .globl __llvm_jump_instr_table_0_3 ; CHECK: .align 8, 0x90 ; CHECK: .type __llvm_jump_instr_table_0_3,@function ; CHECK: __llvm_jump_instr_table_0_3: diff --git a/test/CodeGen/X86/jump_tables.ll b/test/CodeGen/X86/jump_tables.ll index 5a0aed0..485154e 100644 --- a/test/CodeGen/X86/jump_tables.ll +++ b/test/CodeGen/X86/jump_tables.ll @@ -7,6 +7,20 @@ target triple = "x86_64-unknown-linux-gnu" %struct.fun_struct = type { i32 (...)* } +@a = global [12 x 
i32 () *] [ i32 ()* bitcast (void ()* @indirect_fun to i32 ()*), + i32 ()* bitcast (void ()* @indirect_fun_match to i32 ()*), + i32 ()* bitcast (i32 ()* @indirect_fun_i32 to i32 ()*), + i32 ()* bitcast (i32 (i32)* @indirect_fun_i32_1 to i32 ()*), + i32 ()* bitcast (i32 (i32, i32)* @indirect_fun_i32_2 to i32 ()*), + i32 ()* bitcast (i32* (i32*, i32)* @indirect_fun_i32S_2 to i32 ()*), + i32 ()* bitcast (void (%struct.fun_struct)* @indirect_fun_struct to i32 ()*), + i32 ()* bitcast (void (i32 (...)*, i32)* @indirect_fun_fun to i32 ()*), + i32 ()* bitcast (i32 (i32 (...)*, i32)* @indirect_fun_fun_ret to i32 ()*), + i32 ()* bitcast (void ([19 x i8])* @indirect_fun_array to i32 ()*), + i32 ()* bitcast (void (<3 x i32>)* @indirect_fun_vec to i32 ()*), + i32 ()* bitcast (void (<4 x float>)* @indirect_fun_vec_2 to i32 ()*) + ] + define void @indirect_fun() unnamed_addr jumptable { ret void } @@ -74,62 +88,50 @@ define i32 @main(i32 %argc, i8** %argv) { ret i32 %a } -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_1 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_1,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_1: ; SINGLE-DAG: jmp indirect_fun_array@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_2 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_2,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_2: ; SINGLE-DAG: jmp indirect_fun_i32_2@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_3 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_3,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_3: ; SINGLE-DAG: jmp indirect_fun_vec_2@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_4 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_4,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_4: ; SINGLE-DAG: jmp indirect_fun_i32S_2@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_5 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_5,@function ; 
SINGLE-DAG: __llvm_jump_instr_table_0_5: ; SINGLE-DAG: jmp indirect_fun_struct@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_6 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_6,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_6: ; SINGLE-DAG: jmp indirect_fun_i32_1@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_7 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_7,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_7: ; SINGLE-DAG: jmp indirect_fun_i32@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_8 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_8,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_8: ; SINGLE-DAG: jmp indirect_fun_fun@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_9 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_9,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_9: ; SINGLE-DAG: jmp indirect_fun_fun_ret@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_10 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_10,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_10: ; SINGLE-DAG: jmp indirect_fun@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_11 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_11,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_11: ; SINGLE-DAG: jmp indirect_fun_match@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_12 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_12,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_12: @@ -144,82 +146,69 @@ define i32 @main(i32 %argc, i8** %argv) { ; SINGLE-DAG: ud2 -; ARITY-DAG: .globl __llvm_jump_instr_table_2_1 ; ARITY-DAG: .align 8, 0x90 ; ARITY-DAG: .type __llvm_jump_instr_table_2_1,@function ; ARITY-DAG: __llvm_jump_instr_table_2_1: ; ARITY-DAG: jmp indirect_fun{{.*}}@PLT ; ARITY-DAG: .align 8, 0x90 ; ARITY-DAG: ud2 -; ARITY-DAG: .globl __llvm_jump_instr_table_0_1 ; ARITY-DAG: 
.align 8, 0x90 ; ARITY-DAG: .type __llvm_jump_instr_table_0_1,@function ; ARITY-DAG: __llvm_jump_instr_table_0_1: ; ARITY-DAG: jmp indirect_fun{{.*}}@PLT -; ARITY-DAG: .globl __llvm_jump_instr_table_1_1 ; ARITY-DAG: .align 8, 0x90 ; ARITY-DAG: .type __llvm_jump_instr_table_1_1,@function ; ARITY-DAG: __llvm_jump_instr_table_1_1: ; ARITY-DAG: jmp indirect_fun{{.*}}@PLT -; SIMPL-DAG: .globl __llvm_jump_instr_table_2_1 ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: .type __llvm_jump_instr_table_2_1,@function ; SIMPL-DAG: __llvm_jump_instr_table_2_1: ; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: ud2 -; SIMPL-DAG: .globl __llvm_jump_instr_table_0_1 ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: .type __llvm_jump_instr_table_0_1,@function ; SIMPL-DAG: __llvm_jump_instr_table_0_1: ; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT -; SIMPL-DAG: .globl __llvm_jump_instr_table_1_1 ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: .type __llvm_jump_instr_table_1_1,@function ; SIMPL-DAG: __llvm_jump_instr_table_1_1: ; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT -; SIMPL-DAG: .globl __llvm_jump_instr_table_3_1 ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: .type __llvm_jump_instr_table_3_1,@function ; SIMPL-DAG: __llvm_jump_instr_table_3_1: ; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT -; SIMPL-DAG: .globl __llvm_jump_instr_table_4_1 ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: .type __llvm_jump_instr_table_4_1,@function ; SIMPL-DAG: __llvm_jump_instr_table_4_1: ; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT -; FULL-DAG: .globl __llvm_jump_instr_table_10_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_10_1,@function ; FULL-DAG:__llvm_jump_instr_table_10_1: ; FULL-DAG: jmp indirect_fun_i32_1@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_9_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_9_1,@function ; FULL-DAG:__llvm_jump_instr_table_9_1: ; FULL-DAG: jmp indirect_fun_i32_2@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: 
ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_7_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_7_1,@function ; FULL-DAG:__llvm_jump_instr_table_7_1: ; FULL-DAG: jmp indirect_fun_i32S_2@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_3_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_3_1,@function ; FULL-DAG:__llvm_jump_instr_table_3_1: ; FULL-DAG: jmp indirect_fun_vec_2@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_2_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_2_1,@function ; FULL-DAG:__llvm_jump_instr_table_2_1: @@ -228,42 +217,36 @@ define i32 @main(i32 %argc, i8** %argv) { ; FULL-DAG: ud2 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_8_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_8_1,@function ; FULL-DAG:__llvm_jump_instr_table_8_1: ; FULL-DAG: jmp indirect_fun_i32@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_1_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_1_1,@function ; FULL-DAG:__llvm_jump_instr_table_1_1: ; FULL-DAG: jmp indirect_fun_array@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_0_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_0_1,@function ; FULL-DAG:__llvm_jump_instr_table_0_1: ; FULL-DAG: jmp indirect_fun_vec@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_6_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_6_1,@function ; FULL-DAG:__llvm_jump_instr_table_6_1: ; FULL-DAG: jmp indirect_fun_struct@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_5_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_5_1,@function ; FULL-DAG:__llvm_jump_instr_table_5_1: ; FULL-DAG: 
jmp indirect_fun_fun@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_4_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_4_1,@function ; FULL-DAG:__llvm_jump_instr_table_4_1: diff --git a/test/CodeGen/X86/lea-2.ll b/test/CodeGen/X86/lea-2.ll index 82cefb7..6fb3879 100644 --- a/test/CodeGen/X86/lea-2.ll +++ b/test/CodeGen/X86/lea-2.ll @@ -1,4 +1,7 @@ -; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s +; RUN: llc < %s -mtriple=i686-linux -x86-asm-syntax=intel | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux -x86-asm-syntax=intel | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -x86-asm-syntax=intel | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-nacl -x86-asm-syntax=intel | FileCheck %s define i32 @test1(i32 %A, i32 %B) { %tmp1 = shl i32 %A, 2 diff --git a/test/CodeGen/X86/lea-3.ll b/test/CodeGen/X86/lea-3.ll index c439ee1..a56403a 100644 --- a/test/CodeGen/X86/lea-3.ll +++ b/test/CodeGen/X86/lea-3.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s ; CHECK: leaq (,[[A0:%rdi|%rcx]],4), %rax diff --git a/test/CodeGen/X86/lea-4.ll b/test/CodeGen/X86/lea-4.ll index cef4726..00c2278 100644 --- a/test/CodeGen/X86/lea-4.ll +++ b/test/CodeGen/X86/lea-4.ll @@ -1,4 +1,7 @@ -; RUN: llc < %s -march=x86-64 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s + define zeroext i16 @t1(i32 %on_off) nounwind { entry: diff --git a/test/CodeGen/X86/lea-5.ll b/test/CodeGen/X86/lea-5.ll new file mode 100644 index 0000000..50d3aaf --- /dev/null +++ b/test/CodeGen/X86/lea-5.ll @@ -0,0 +1,59 @@ +; test for more complicated forms of lea operands which can be generated +; in loop 
optimized cases. +; See also http://llvm.org/bugs/show_bug.cgi?id=20016 + +; RUN: llc < %s -mtriple=x86_64-linux -O2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -O2 | FileCheck %s -check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-nacl -O2 | FileCheck %s -check-prefix=X32 + +; Function Attrs: nounwind readnone uwtable +define void @foo(i32 %x, i32 %d) #0 { +entry: + %a = alloca [8 x i32], align 16 + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ] + %arrayidx = getelementptr inbounds [8 x i32]* %a, i32 0, i32 %d.addr.0 + +; CHECK: leaq -40(%rsp,%r{{[^,]*}},4), %rax +; X32: leal -40(%rsp,%r{{[^,]*}},4), %eax + %0 = load i32* %arrayidx, align 4 + %cmp1 = icmp eq i32 %0, 0 + %inc = add nsw i32 %d.addr.0, 1 + +; CHECK: leaq 4(%r{{[^,]*}}), %r{{[^,]*}} +; X32: leal 4(%r{{[^,]*}}), %e{{[^,]*}} + br i1 %cmp1, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret void +} + +; The same test as above but with enforced stack realignment (%a aligned by 64) +; to check one more case of correct lea generation. 
+ +; Function Attrs: nounwind readnone uwtable +define void @bar(i32 %x, i32 %d) #0 { +entry: + %a = alloca [8 x i32], align 64 + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ] + %arrayidx = getelementptr inbounds [8 x i32]* %a, i32 0, i32 %d.addr.0 + +; CHECK: leaq (%rsp,%r{{[^,]*}},4), %rax +; X32: leal (%rsp,%r{{[^,]*}},4), %eax + %0 = load i32* %arrayidx, align 4 + %cmp1 = icmp eq i32 %0, 0 + %inc = add nsw i32 %d.addr.0, 1 + +; CHECK: leaq 4(%r{{[^,]*}}), %r{{[^,]*}} +; X32: leal 4(%r{{[^,]*}}), %e{{[^,]*}} + br i1 %cmp1, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret void +} + diff --git a/test/CodeGen/X86/lea.ll b/test/CodeGen/X86/lea.ll index 93cfe46..9b6632c 100644 --- a/test/CodeGen/X86/lea.ll +++ b/test/CodeGen/X86/lea.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s define i32 @test1(i32 %x) nounwind { %tmp1 = shl i32 %x, 3 diff --git a/test/CodeGen/X86/long-extend.ll b/test/CodeGen/X86/long-extend.ll deleted file mode 100644 index 5bbd41d..0000000 --- a/test/CodeGen/X86/long-extend.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc < %s -mcpu=core-avx-i -mtriple=x86_64-linux -asm-verbose=0| FileCheck %s -define void @test_long_extend(<16 x i8> %a, <16 x i32>* %p) nounwind { -; CHECK-LABEL: test_long_extend -; CHECK: vpunpcklbw %xmm1, %xmm0, [[REG1:%xmm[0-9]+]] -; CHECK: vpunpckhwd %xmm1, [[REG1]], [[REG2:%xmm[0-9]+]] -; CHECK: vpunpcklwd %xmm1, [[REG1]], %x[[REG3:mm[0-9]+]] -; CHECK: vinsertf128 $1, [[REG2]], %y[[REG3]], [[REG_result0:%ymm[0-9]+]] -; CHECK: vpunpckhbw %xmm1, %xmm0, [[REG4:%xmm[0-9]+]] -; CHECK: vpunpckhwd %xmm1, [[REG4]], [[REG5:%xmm[0-9]+]] -; CHECK: vpunpcklwd %xmm1, [[REG4]], %x[[REG6:mm[0-9]+]] -; CHECK: vinsertf128 $1, [[REG5]], %y[[REG6]], 
[[REG_result1:%ymm[0-9]+]] -; CHECK: vmovaps [[REG_result1]], 32(%rdi) -; CHECK: vmovaps [[REG_result0]], (%rdi) - - %tmp = zext <16 x i8> %a to <16 x i32> - store <16 x i32> %tmp, <16 x i32>*%p - ret void -} diff --git a/test/CodeGen/X86/loop-strength-reduce8.ll b/test/CodeGen/X86/loop-strength-reduce8.ll index 1d04276..c36047c 100644 --- a/test/CodeGen/X86/loop-strength-reduce8.ll +++ b/test/CodeGen/X86/loop-strength-reduce8.ll @@ -1,6 +1,9 @@ ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s -; CHECK: leal 16(%eax), %edx +; FIXME: The first two instructions, movl and addl, should have been combined to +; "leal 16(%eax), %edx" by the backend (PR20776). +; CHECK: movl %eax, %edx +; CHECK: addl $16, %edx ; CHECK: align ; CHECK: addl $4, %edx ; CHECK: decl %ecx diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll index f47161e..edb8433e 100644 --- a/test/CodeGen/X86/lower-bitcast.ll +++ b/test/CodeGen/X86/lower-bitcast.ll @@ -68,13 +68,13 @@ define i64 @test4(i64 %A) { %2 = bitcast <2 x i32> %add to i64 ret i64 %2 } -; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd. +; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd. ; Ideally, we should fold that sequence into a single paddd. This is fixed with ; the widening legalization. ; ; CHECK-LABEL: test4 ; CHECK: pshufd -; CHECK-NEXT: paddq +; CHECK-NEXT: paddd ; CHECK-NEXT: pshufd ; CHECK: ret ; diff --git a/test/CodeGen/X86/mem-intrin-base-reg.ll b/test/CodeGen/X86/mem-intrin-base-reg.ll new file mode 100644 index 0000000..dd7f396 --- /dev/null +++ b/test/CodeGen/X86/mem-intrin-base-reg.ll @@ -0,0 +1,100 @@ +; RUN: llc -mtriple=i686-windows -mattr=+sse2 < %s | FileCheck %s + +target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" +target triple = "i686-pc-windows-msvc" + +; There is a conflict between lowering the X86 memory intrinsics and the "base" +; register used to address stack locals. 
See X86RegisterInfo::hasBaseRegister +; for when this is necessary. Typically, we chose ESI for the base register, +; which all of the X86 string instructions use. + +; The pattern of vector icmp and extractelement is used in these tests because +; it forces creation of an aligned stack temporary. Perhaps such temporaries +; shouldn't be aligned. + +declare void @escape_vla_and_icmp(i8*, i1 zeroext) +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) + +define i32 @memcpy_novla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) { + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false) + br i1 %cond, label %spill_vectors, label %no_vectors + +no_vectors: + ret i32 0 + +spill_vectors: + %vp1 = getelementptr <4 x i32>* %vp0, i32 1 + %v0 = load <4 x i32>* %vp0 + %v1 = load <4 x i32>* %vp1 + %vicmp = icmp slt <4 x i32> %v0, %v1 + %icmp = extractelement <4 x i1> %vicmp, i32 0 + call void @escape_vla_and_icmp(i8* null, i1 zeroext %icmp) + %r = extractelement <4 x i32> %v0, i32 0 + ret i32 %r +} + +; CHECK-LABEL: _memcpy_novla_vector: +; CHECK: andl $-16, %esp +; CHECK-DAG: movl $32, %ecx +; CHECK-DAG: movl {{.*}}, %esi +; CHECK-DAG: movl {{.*}}, %edi +; CHECK: rep;movsl + +define i32 @memcpy_vla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) { + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false) + br i1 %cond, label %spill_vectors, label %no_vectors + +no_vectors: + ret i32 0 + +spill_vectors: + %vp1 = getelementptr <4 x i32>* %vp0, i32 1 + %v0 = load <4 x i32>* %vp0 + %v1 = load <4 x i32>* %vp1 + %vicmp = icmp slt <4 x i32> %v0, %v1 + %icmp = extractelement <4 x i1> %vicmp, i32 0 + %vla = alloca i8, i32 %n + call void @escape_vla_and_icmp(i8* %vla, i1 zeroext %icmp) + %r = extractelement <4 x i32> %v0, i32 0 + ret i32 %r +} + +; CHECK-LABEL: _memcpy_vla_vector: +; CHECK: andl 
$-16, %esp +; CHECK: movl %esp, %esi +; CHECK: movl $128, {{.*}}(%esp) +; CHECK: calll _memcpy +; CHECK: calll __chkstk + +; stosd doesn't clobber esi, so we can use it. + +define i32 @memset_vla_vector(<4 x i32>* %vp0, i8* %a, i32 %n, i1 zeroext %cond) { + call void @llvm.memset.p0i8.i32(i8* %a, i8 42, i32 128, i32 4, i1 false) + br i1 %cond, label %spill_vectors, label %no_vectors + +no_vectors: + ret i32 0 + +spill_vectors: + %vp1 = getelementptr <4 x i32>* %vp0, i32 1 + %v0 = load <4 x i32>* %vp0 + %v1 = load <4 x i32>* %vp1 + %vicmp = icmp slt <4 x i32> %v0, %v1 + %icmp = extractelement <4 x i1> %vicmp, i32 0 + %vla = alloca i8, i32 %n + call void @escape_vla_and_icmp(i8* %vla, i1 zeroext %icmp) + %r = extractelement <4 x i32> %v0, i32 0 + ret i32 %r +} + +; CHECK-LABEL: _memset_vla_vector: +; CHECK: andl $-16, %esp +; CHECK: movl %esp, %esi +; CHECK-DAG: movl $707406378, %eax # imm = 0x2A2A2A2A +; CHECK-DAG: movl $32, %ecx +; CHECK-DAG: movl {{.*}}, %edi +; CHECK-NOT: movl {{.*}}, %esi +; CHECK: rep;stosl + +; Add a test for memcmp if we ever add a special lowering for it. diff --git a/test/CodeGen/X86/mem-promote-integers.ll b/test/CodeGen/X86/mem-promote-integers.ll index 0015df0..ea38b95 100644 --- a/test/CodeGen/X86/mem-promote-integers.ll +++ b/test/CodeGen/X86/mem-promote-integers.ll @@ -1,8 +1,8 @@ ; Test the basic functionality of integer element promotions of different types. ; This tests checks passing of arguments, loading and storing to memory and ; basic arithmetic. -; RUN: llc -march=x86 < %s -; RUN: llc -march=x86-64 < %s +; RUN: llc -march=x86 < %s > /dev/null +; RUN: llc -march=x86-64 < %s > /dev/null define <1 x i8> @test_1xi8(<1 x i8> %x, <1 x i8>* %b) { %bb = load <1 x i8>* %b diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll index 3ea6512..5454b7c 100644 --- a/test/CodeGen/X86/misched-matmul.ll +++ b/test/CodeGen/X86/misched-matmul.ll @@ -10,7 +10,7 @@ ; more complex cases. 
; ; CHECK: @wrap_mul4 -; CHECK: 22 regalloc - Number of spills inserted +; CHECK: 23 regalloc - Number of spills inserted define void @wrap_mul4(double* nocapture %Out, [4 x double]* nocapture %A, [4 x double]* nocapture %B) #0 { entry: diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll index 71b0723..96c5dbb 100644 --- a/test/CodeGen/X86/movgs.ll +++ b/test/CodeGen/X86/movgs.ll @@ -3,40 +3,58 @@ ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X64 define i32 @test1() nounwind readonly { +; X32-LABEL: test1: +; X32: # BB#0: # %entry +; X32-NEXT: movl %gs:196, %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: retl +; +; X64-LABEL: test1: +; X64: # BB#0: # %entry +; X64-NEXT: movq %gs:320, %rax +; X64-NEXT: movl (%rax), %eax +; X64-NEXT: retq entry: %tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31) ; <i32*> [#uses=1] %tmp1 = load i32* %tmp ; <i32> [#uses=1] ret i32 %tmp1 } -; X32-LABEL: test1: -; X32: movl %gs:196, %eax -; X32: movl (%eax), %eax -; X32: ret - -; X64-LABEL: test1: -; X64: movq %gs:320, %rax -; X64: movl (%rax), %eax -; X64: ret define i64 @test2(void (i8*)* addrspace(256)* %tmp8) nounwind { +; X32-LABEL: test2: +; X32: # BB#0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: calll *%gs:(%eax) +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: addl $12, %esp +; X32-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: # %entry +; X64-NEXT: {{(subq.*%rsp|pushq)}} +; X64-NEXT: callq *%gs:(%{{(rcx|rdi)}}) +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: {{(addq.*%rsp|popq)}} +; X64-NEXT: retq entry: %tmp9 = load void (i8*)* addrspace(256)* %tmp8, align 8 tail call void %tmp9(i8* undef) nounwind optsize ret i64 0 } -; rdar://8453210 -; X32-LABEL: test2: -; X32: movl {{.*}}(%esp), %eax -; X32: calll *%gs:(%eax) - -; X64-LABEL: test2: -; X64: callq *%gs:([[A0:%rdi|%rcx]]) - - - 
- define <2 x i64> @pmovsxwd_1(i64 addrspace(256)* %p) nounwind readonly { +; X32-LABEL: pmovsxwd_1: +; X32: # BB#0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pmovsxwd %gs:(%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pmovsxwd_1: +; X64: # BB#0: # %entry +; X64-NEXT: pmovsxwd %gs:(%{{(rcx|rdi)}}), %xmm0 +; X64-NEXT: retq entry: %0 = load i64 addrspace(256)* %p %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 @@ -44,20 +62,26 @@ entry: %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone %3 = bitcast <4 x i32> %2 to <2 x i64> ret <2 x i64> %3 - -; X32-LABEL: pmovsxwd_1: -; X32: movl 4(%esp), %eax -; X32: pmovsxwd %gs:(%eax), %xmm0 -; X32: ret - -; X64-LABEL: pmovsxwd_1: -; X64: pmovsxwd %gs:([[A0]]), %xmm0 -; X64: ret } ; The two loads here both look identical to selection DAG, except for their ; address spaces. Make sure they aren't CSE'd. define i32 @test_no_cse() nounwind readonly { +; X32-LABEL: test_no_cse: +; X32: # BB#0: # %entry +; X32-NEXT: movl %gs:196, %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: movl %fs:196, %ecx +; X32-NEXT: addl (%ecx), %eax +; X32-NEXT: retl +; +; X64-LABEL: test_no_cse: +; X64: # BB#0: # %entry +; X64-NEXT: movq %gs:320, %rax +; X64-NEXT: movl (%rax), %eax +; X64-NEXT: movq %fs:320, %rcx +; X64-NEXT: addl (%rcx), %eax +; X64-NEXT: retq entry: %tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31) ; <i32*> [#uses=1] %tmp1 = load i32* %tmp ; <i32> [#uses=1] @@ -66,9 +90,5 @@ entry: %tmp4 = add i32 %tmp1, %tmp3 ret i32 %tmp4 } -; X32-LABEL: test_no_cse: -; X32: movl %gs:196 -; X32: movl %fs:196 -; X32: ret declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll index 6910515..f0bdbba 100644 --- a/test/CodeGen/X86/ms-inline-asm.ll +++ b/test/CodeGen/X86/ms-inline-asm.ll @@ -110,7 +110,7 @@ define i32 
@t31() { entry: %val = alloca i32, align 64 store i32 -1, i32* %val, align 64 - call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val) #1 + call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val) %sp = load i32* %val, align 64 ret i32 %sp ; CHECK-LABEL: t31: @@ -125,3 +125,12 @@ entry: ; CHECK: movl (%esp), %eax ; CHECK: ret } + +declare hidden void @other_func() + +define void @naked() #0 { + call void asm sideeffect inteldialect "call dword ptr $0", "*m,~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{esp},~{ebp},~{dirflag},~{fpsr},~{flags}"(void()* @other_func) + unreachable +} + +attributes #0 = { naked } diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll new file mode 100644 index 0000000..1e99c14 --- /dev/null +++ b/test/CodeGen/X86/musttail-varargs.ll @@ -0,0 +1,119 @@ +; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX +; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS + +; Test that we actually spill and reload all arguments in the variadic argument +; pack. Doing a normal call will clobber all argument registers, and we will +; spill around it. A simple adjustment should not require any XMM spills. + +declare void(i8*, ...)* @get_f(i8* %this) + +define void @f_thunk(i8* %this, ...) { + %fptr = call void(i8*, ...)*(i8*)* @get_f(i8* %this) + musttail call void (i8*, ...)* %fptr(i8* %this, ...) + ret void +} + +; Save and restore 6 GPRs, 8 XMMs, and AL around the call. 
+ +; LINUX-LABEL: f_thunk: +; LINUX-DAG: movq %rdi, {{.*}} +; LINUX-DAG: movq %rsi, {{.*}} +; LINUX-DAG: movq %rdx, {{.*}} +; LINUX-DAG: movq %rcx, {{.*}} +; LINUX-DAG: movq %r8, {{.*}} +; LINUX-DAG: movq %r9, {{.*}} +; LINUX-DAG: movb %al, {{.*}} +; LINUX-DAG: movaps %xmm0, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm1, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm2, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm3, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm4, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm5, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm6, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm7, {{[0-9]*}}(%rsp) +; LINUX: callq get_f +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm0 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm1 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm2 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm3 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm4 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm5 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm6 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm7 +; LINUX-DAG: movq {{.*}}, %rdi +; LINUX-DAG: movq {{.*}}, %rsi +; LINUX-DAG: movq {{.*}}, %rdx +; LINUX-DAG: movq {{.*}}, %rcx +; LINUX-DAG: movq {{.*}}, %r8 +; LINUX-DAG: movq {{.*}}, %r9 +; LINUX-DAG: movb {{.*}}, %al +; LINUX: jmpq *{{.*}} # TAILCALL + +; WINDOWS-LABEL: f_thunk: +; WINDOWS-NOT: mov{{.}}ps +; WINDOWS-DAG: movq %rdx, {{.*}} +; WINDOWS-DAG: movq %rcx, {{.*}} +; WINDOWS-DAG: movq %r8, {{.*}} +; WINDOWS-DAG: movq %r9, {{.*}} +; WINDOWS-NOT: mov{{.}}ps +; WINDOWS: callq get_f +; WINDOWS-NOT: mov{{.}}ps +; WINDOWS-DAG: movq {{.*}}, %rdx +; WINDOWS-DAG: movq {{.*}}, %rcx +; WINDOWS-DAG: movq {{.*}}, %r8 +; WINDOWS-DAG: movq {{.*}}, %r9 +; WINDOWS-NOT: mov{{.}}ps +; WINDOWS: jmpq *{{.*}} # TAILCALL + +; This thunk shouldn't require any spills and reloads, assuming the register +; allocator knows what it's doing. + +define void @g_thunk(i8* %fptr_i8, ...) { + %fptr = bitcast i8* %fptr_i8 to void (i8*, ...)* + musttail call void (i8*, ...)* %fptr(i8* %fptr_i8, ...) 
+ ret void +} + +; LINUX-LABEL: g_thunk: +; LINUX-NOT: movq +; LINUX: jmpq *%rdi # TAILCALL + +; WINDOWS-LABEL: g_thunk: +; WINDOWS-NOT: movq +; WINDOWS: jmpq *%rcx # TAILCALL + +; Do a simple multi-exit multi-bb test. + +%struct.Foo = type { i1, i8*, i8* } + +@g = external global i32 + +define void @h_thunk(%struct.Foo* %this, ...) { + %cond_p = getelementptr %struct.Foo* %this, i32 0, i32 0 + %cond = load i1* %cond_p + br i1 %cond, label %then, label %else + +then: + %a_p = getelementptr %struct.Foo* %this, i32 0, i32 1 + %a_i8 = load i8** %a_p + %a = bitcast i8* %a_i8 to void (%struct.Foo*, ...)* + musttail call void (%struct.Foo*, ...)* %a(%struct.Foo* %this, ...) + ret void + +else: + %b_p = getelementptr %struct.Foo* %this, i32 0, i32 2 + %b_i8 = load i8** %b_p + %b = bitcast i8* %b_i8 to void (%struct.Foo*, ...)* + store i32 42, i32* @g + musttail call void (%struct.Foo*, ...)* %b(%struct.Foo* %this, ...) + ret void +} + +; LINUX-LABEL: h_thunk: +; LINUX: jne +; LINUX: jmpq *{{.*}} # TAILCALL +; LINUX: jmpq *{{.*}} # TAILCALL +; WINDOWS-LABEL: h_thunk: +; WINDOWS: jne +; WINDOWS: jmpq *{{.*}} # TAILCALL +; WINDOWS: jmpq *{{.*}} # TAILCALL diff --git a/test/CodeGen/X86/nancvt.ll b/test/CodeGen/X86/nancvt.ll index 8036710..8a665fa 100644 --- a/test/CodeGen/X86/nancvt.ll +++ b/test/CodeGen/X86/nancvt.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -std-compile-opts | llc > %t +; RUN: opt < %s -O3 | llc > %t ; RUN: grep 2147027116 %t | count 3 ; RUN: grep 2147228864 %t | count 3 ; RUN: grep 2146502828 %t | count 3 diff --git a/test/CodeGen/X86/narrow-shl-load.ll b/test/CodeGen/X86/narrow-shl-load.ll index 30387925..5175bfc 100644 --- a/test/CodeGen/X86/narrow-shl-load.ll +++ b/test/CodeGen/X86/narrow-shl-load.ll @@ -30,40 +30,6 @@ while.end: ; preds = %while.cond ret void } - -; DAGCombiner shouldn't fold the sdiv (ashr) away. 
-; rdar://8636812 -; CHECK-LABEL: test2: -; CHECK: sarl - -define i32 @test2() nounwind { -entry: - %i = alloca i32, align 4 - %j = alloca i8, align 1 - store i32 127, i32* %i, align 4 - store i8 0, i8* %j, align 1 - %tmp3 = load i32* %i, align 4 - %mul = mul nsw i32 %tmp3, 2 - %conv4 = trunc i32 %mul to i8 - %conv5 = sext i8 %conv4 to i32 - %div6 = sdiv i32 %conv5, 2 - %conv7 = trunc i32 %div6 to i8 - %conv9 = sext i8 %conv7 to i32 - %cmp = icmp eq i32 %conv9, -1 - br i1 %cmp, label %if.then, label %if.end - -if.then: ; preds = %entry - ret i32 0 - -if.end: ; preds = %entry - call void @abort() noreturn - unreachable -} - -declare void @abort() noreturn - -declare void @exit(i32) noreturn - ; DAG Combiner can't fold this into a load of the 1'th byte. ; PR8757 define i32 @test3(i32 *%P) nounwind ssp { diff --git a/test/CodeGen/X86/nonconst-static-ev.ll b/test/CodeGen/X86/nonconst-static-ev.ll index f852cae..5449791 100644 --- a/test/CodeGen/X86/nonconst-static-ev.ll +++ b/test/CodeGen/X86/nonconst-static-ev.ll @@ -1,6 +1,5 @@ ; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t ; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s -; REQUIRES: shell @0 = global i8 extractvalue ([1 x i8] select (i1 ptrtoint (i32* @1 to i1), [1 x i8] [ i8 1 ], [1 x i8] [ i8 2 ]), 0) @1 = external global i32 diff --git a/test/CodeGen/X86/nonconst-static-iv.ll b/test/CodeGen/X86/nonconst-static-iv.ll index 8fad39b..30613ef 100644 --- a/test/CodeGen/X86/nonconst-static-iv.ll +++ b/test/CodeGen/X86/nonconst-static-iv.ll @@ -1,6 +1,5 @@ ; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t ; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s -; REQUIRES: shell @0 = global i8 insertvalue( { i8 } select (i1 ptrtoint (i32* @1 to i1), { i8 } { i8 1 }, { i8 } { i8 2 }), i8 0, 0) @1 = external global i32 diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll new file mode 100644 index 0000000..9d0cb9a --- /dev/null +++ 
b/test/CodeGen/X86/nontemporal-2.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX + + +; Make sure that we generate non-temporal stores for the test cases below. + +define void @test1(<4 x float>* %dst) { +; CHECK-LABEL: test1: +; SSE: movntps +; AVX: vmovntps + store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1 + ret void +} + +define void @test2(<4 x i32>* %dst) { +; CHECK-LABEL: test2: +; SSE: movntps +; AVX: vmovntps + store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1 + ret void +} + +define void @test3(<2 x double>* %dst) { +; CHECK-LABEL: test3: +; SSE: movntps +; AVX: vmovntps + store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1 + ret void +} + +!1 = metadata !{i32 1} diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll index fa77fcb..b559729 100644 --- a/test/CodeGen/X86/null-streamer.ll +++ b/test/CodeGen/X86/null-streamer.ll @@ -14,16 +14,16 @@ define void @f1() { !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!11, !13} -!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !" 
", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !""} +!0 = metadata !{metadata !"0x11\004\00 \001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ] !1 = metadata !{metadata !"", metadata !""} !2 = metadata !{} !3 = metadata !{metadata !4} -!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"", metadata !"", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* null, null, null, metadata !2, i32 2} -!5 = metadata !{i32 786473, metadata !1} -!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} +!4 = metadata !{metadata !"0x2e\00\00\00\002\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, i32 ()* null, null, null, metadata !2} ; [ DW_TAG_subprogram ] +!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ] +!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] !7 = metadata !{metadata !8} -!8 = metadata !{i32 786468, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} +!8 = metadata !{metadata !"0x24\00\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] !9 = metadata !{metadata !10} -!10 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"_ZL1i", metadata !5, i32 1, metadata !8, i32 1, i32 1, null, null} +!10 = metadata !{metadata !"0x34\00i\00i\00_ZL1i\001\001\001", null, metadata !5, metadata !8, null, null} ; [ DW_TAG_variable ] !11 = metadata !{i32 2, metadata !"Dwarf Version", i32 3} -!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/object-size.ll b/test/CodeGen/X86/object-size.ll index ec35d29..0610f0b 100644 --- 
a/test/CodeGen/X86/object-size.ll +++ b/test/CodeGen/X86/object-size.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 < %s -march=x86-64 | FileCheck %s -check-prefix=X64 +; RUN: llc -O0 < %s -march=x86-64 | FileCheck %s ; ModuleID = 'ts.c' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @@ -12,8 +12,8 @@ entry: %tmp = load i8** @p ; <i8*> [#uses=1] %0 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp, i1 0) ; <i64> [#uses=1] %cmp = icmp ne i64 %0, -1 ; <i1> [#uses=1] -; X64: movabsq $-1, [[RAX:%r..]] -; X64: cmpq $-1, [[RAX]] +; CHECK: movq $-1, [[RAX:%r..]] +; CHECK: cmpq $-1, [[RAX]] br i1 %cmp, label %cond.true, label %cond.false cond.true: ; preds = %entry diff --git a/test/CodeGen/X86/osx-private-labels.ll b/test/CodeGen/X86/osx-private-labels.ll index 349ce7d..e30cb48 100644 --- a/test/CodeGen/X86/osx-private-labels.ll +++ b/test/CodeGen/X86/osx-private-labels.ll @@ -69,3 +69,20 @@ ; CHECK: .section __DATA,__foobar,interposing ; CHECK-NEXT: .align 3 ; CHECK-NEXT: L_private12: + +@private13 = private global i32 42, section "__DATA, __objc_classlist, regular, no_dead_strip" +; CHECK: .section __DATA,__objc_classlist,regular,no_dead_strip +; CHECK-NEXT: .align 2 +; CHECK-NEXT: L_private13: + +@private14 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_classname,cstring_literals" +; CHECK: .section __TEXT,__objc_classname,cstring_literals +; CHECK-NEXT: L_private14: + +@private15 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_methname,cstring_literals" +; CHECK: .section __TEXT,__objc_methname,cstring_literals +; CHECK-NEXT: L_private15: + +@private16 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_methtype,cstring_literals" +; CHECK: .section __TEXT,__objc_methtype,cstring_literals +; CHECK-NEXT: L_private16: diff --git a/test/CodeGen/X86/palignr.ll b/test/CodeGen/X86/palignr.ll index ec6564d..3efcc2e 100644 --- 
a/test/CodeGen/X86/palignr.ll +++ b/test/CodeGen/X86/palignr.ll @@ -3,58 +3,127 @@ define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: test1: -; CHECK: pshufd -; CHECK-YONAH: pshufd +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,3,0] +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test1: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,3,0] +; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> < i32 1, i32 2, i32 3, i32 0 > ret <4 x i32> %C } define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: test2: -; CHECK: palignr -; CHECK-YONAH: shufps +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test2: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] +; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 3, i32 4 > ret <4 x i32> %C } define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: test3: -; CHECK: palignr +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test3: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] +; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 undef, i32 4 > ret <4 x i32> %C } define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: test4: -; CHECK: palignr +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test4: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: shufpd {{.*#+}} 
xmm1 = xmm1[1],xmm0[0] +; CHECK-YONAH-NEXT: movapd %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 > ret <4 x i32> %C } define <4 x float> @test5(<4 x float> %A, <4 x float> %B) nounwind { ; CHECK-LABEL: test5: -; CHECK: palignr +; CHECK: # BB#0: +; CHECK-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test5: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; CHECK-YONAH-NEXT: movapd %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 > ret <4 x float> %C } define <8 x i16> @test6(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: test6: -; CHECK: palignr +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test6: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; CHECK-YONAH-NEXT: por %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 3, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10 > ret <8 x i16> %C } define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: test7: -; CHECK: palignr +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test7: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = 
zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; CHECK-YONAH-NEXT: por %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 undef, i32 6, i32 undef, i32 8, i32 9, i32 10, i32 11, i32 12 > ret <8 x i16> %C } define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind { ; CHECK-LABEL: test8: -; CHECK: palignr +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test8: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; CHECK-YONAH-NEXT: por %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <16 x i8> %A, <16 x i8> %B, <16 x i32> < i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20 > ret <16 x i8> %C } @@ -65,8 +134,19 @@ define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind { ; was an UNDEF.) 
define <8 x i16> @test9(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: test9: -; CHECK-NOT: palignr -; CHECK: pshufb +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test9: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: movdqa %xmm1, %xmm0 +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; CHECK-YONAH-NEXT: por %xmm0, %xmm1 +; CHECK-YONAH-NEXT: movdqa %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %B, <8 x i16> %A, <8 x i32> < i32 undef, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0 > ret <8 x i16> %C } diff --git a/test/CodeGen/X86/patchpoint-invoke.ll b/test/CodeGen/X86/patchpoint-invoke.ll new file mode 100644 index 0000000..192cacc --- /dev/null +++ b/test/CodeGen/X86/patchpoint-invoke.ll @@ -0,0 +1,63 @@ +; RUN: llc -mtriple=x86_64-unknown-linux -mcpu=corei7 < %s | FileCheck %s + +; Test invoking of patchpoints +; +define i64 @patchpoint_invoke(i64 %p1, i64 %p2) { +entry: +; CHECK-LABEL: patchpoint_invoke: +; CHECK-NEXT: .cfi_startproc +; CHECK: [[FUNC_BEGIN:.L.*]]: +; CHECK: .cfi_lsda 3, [[EXCEPTION_LABEL:.L[^ ]*]] +; CHECK: pushq %rbp + +; Unfortunately, hardcode the name of the label that begins the patchpoint: +; CHECK: .Ltmp0: +; CHECK: movabsq $-559038736, %r11 +; CHECK-NEXT: callq *%r11 +; CHECK-NEXT: xchgw %ax, %ax +; CHECK-NEXT: [[PP_END:.L.*]]: +; CHECK: ret + %resolveCall = inttoptr i64 -559038736 to i8* + %result = invoke i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall, i32 1, i64 %p1, i64 %p2) + to label %success unwind label %threw + +success: + ret i64 %result + +threw: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch 
i8* null + ret i64 0 +} + +; Verify that the exception table was emitted: +; CHECK: [[EXCEPTION_LABEL]]: +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 21 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 13 +; Verify that the unwind data covers the entire patchpoint region: +; CHECK-NEXT: .long .Ltmp0-[[FUNC_BEGIN]] +; CHECK-NEXT: .long [[PP_END]]-.Ltmp0 + + +; Verify that the stackmap section got emitted: +; CHECK-LABEL: __LLVM_StackMaps: +; Header +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .short 0 +; Num Functions +; CHECK-NEXT: .long 1 +; Num LargeConstants +; CHECK-NEXT: .long 0 +; Num Callsites +; CHECK-NEXT: .long 1 +; CHECK-NEXT: .quad patchpoint_invoke + + +declare void @llvm.experimental.stackmap(i64, i32, ...) +declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) +declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) +declare i32 @__gxx_personality_v0(...) diff --git a/test/CodeGen/X86/patchpoint-webkit_jscc.ll b/test/CodeGen/X86/patchpoint-webkit_jscc.ll new file mode 100644 index 0000000..5e76bf8 --- /dev/null +++ b/test/CodeGen/X86/patchpoint-webkit_jscc.ll @@ -0,0 +1,88 @@ +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST + +; Test the webkit_jscc calling convention. +; One argument will be passed in register, the other will be pushed on the stack. +; Return value in $rax. 
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { +entry: +; CHECK-LABEL: jscall_patchpoint_codegen: +; CHECK: Ltmp +; CHECK: movq %r{{.+}}, (%rsp) +; CHECK: movq %r{{.+}}, %rax +; CHECK: Ltmp +; CHECK-NEXT: movabsq $-559038736, %r11 +; CHECK-NEXT: callq *%r11 +; CHECK: movq %rax, (%rsp) +; CHECK: callq +; FAST-LABEL: jscall_patchpoint_codegen: +; FAST: Ltmp +; FAST: movq %r{{.+}}, (%rsp) +; FAST: movq %r{{.+}}, %rax +; FAST: Ltmp +; FAST-NEXT: movabsq $-559038736, %r11 +; FAST-NEXT: callq *%r11 +; FAST: movq %rax, (%rsp) +; FAST: callq + %resolveCall2 = inttoptr i64 -559038736 to i8* + %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2) + %resolveCall3 = inttoptr i64 -559038737 to i8* + tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result) + ret void +} + +; Test if the arguments are properly aligned and that we don't store undef arguments. +define i64 @jscall_patchpoint_codegen2(i64 %callee) { +entry: +; CHECK-LABEL: jscall_patchpoint_codegen2: +; CHECK: Ltmp +; CHECK: movq $6, 24(%rsp) +; CHECK-NEXT: movl $4, 16(%rsp) +; CHECK-NEXT: movq $2, (%rsp) +; CHECK: Ltmp +; CHECK-NEXT: movabsq $-559038736, %r11 +; CHECK-NEXT: callq *%r11 +; FAST-LABEL: jscall_patchpoint_codegen2: +; FAST: Ltmp +; FAST: movq $2, (%rsp) +; FAST-NEXT: movl $4, 16(%rsp) +; FAST-NEXT: movq $6, 24(%rsp) +; FAST: Ltmp +; FAST-NEXT: movabsq $-559038736, %r11 +; FAST-NEXT: callq *%r11 + %call = inttoptr i64 -559038736 to i8* + %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6) + ret i64 %result +} + +; Test if the arguments are properly aligned and that we don't store undef arguments. 
+define i64 @jscall_patchpoint_codegen3(i64 %callee) { +entry: +; CHECK-LABEL: jscall_patchpoint_codegen3: +; CHECK: Ltmp +; CHECK: movq $10, 48(%rsp) +; CHECK-NEXT: movl $8, 36(%rsp) +; CHECK-NEXT: movq $6, 24(%rsp) +; CHECK-NEXT: movl $4, 16(%rsp) +; CHECK-NEXT: movq $2, (%rsp) +; CHECK: Ltmp +; CHECK-NEXT: movabsq $-559038736, %r11 +; CHECK-NEXT: callq *%r11 +; FAST-LABEL: jscall_patchpoint_codegen3: +; FAST: Ltmp +; FAST: movq $2, (%rsp) +; FAST-NEXT: movl $4, 16(%rsp) +; FAST-NEXT: movq $6, 24(%rsp) +; FAST-NEXT: movl $8, 36(%rsp) +; FAST-NEXT: movq $10, 48(%rsp) +; FAST: Ltmp +; FAST-NEXT: movabsq $-559038736, %r11 +; FAST-NEXT: callq *%r11 + %call = inttoptr i64 -559038736 to i8* + %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10) + ret i64 %result +} + +declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) +declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) + diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll index 62b1273..07148f0 100644 --- a/test/CodeGen/X86/patchpoint.ll +++ b/test/CodeGen/X86/patchpoint.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort < %s | FileCheck %s ; Trivial patchpoint codegen ; @@ -38,61 +39,6 @@ entry: ret void } -; Test the webkit_jscc calling convention. -; One argument will be passed in register, the other will be pushed on the stack. -; Return value in $rax. 
-define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { -entry: -; CHECK-LABEL: jscall_patchpoint_codegen: -; CHECK: Ltmp -; CHECK: movq %r{{.+}}, (%rsp) -; CHECK: movq %r{{.+}}, %rax -; CHECK: Ltmp -; CHECK-NEXT: movabsq $-559038736, %r11 -; CHECK-NEXT: callq *%r11 -; CHECK: movq %rax, (%rsp) -; CHECK: callq - %resolveCall2 = inttoptr i64 -559038736 to i8* - %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2) - %resolveCall3 = inttoptr i64 -559038737 to i8* - tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result) - ret void -} - -; Test if the arguments are properly aligned and that we don't store undef arguments. -define i64 @jscall_patchpoint_codegen2(i64 %callee) { -entry: -; CHECK-LABEL: jscall_patchpoint_codegen2: -; CHECK: Ltmp -; CHECK: movq $6, 24(%rsp) -; CHECK-NEXT: movl $4, 16(%rsp) -; CHECK-NEXT: movq $2, (%rsp) -; CHECK: Ltmp -; CHECK-NEXT: movabsq $-559038736, %r11 -; CHECK-NEXT: callq *%r11 - %call = inttoptr i64 -559038736 to i8* - %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6) - ret i64 %result -} - -; Test if the arguments are properly aligned and that we don't store undef arguments. 
-define i64 @jscall_patchpoint_codegen3(i64 %callee) { -entry: -; CHECK-LABEL: jscall_patchpoint_codegen3: -; CHECK: Ltmp -; CHECK: movq $10, 48(%rsp) -; CHECK-NEXT: movl $8, 36(%rsp) -; CHECK-NEXT: movq $6, 24(%rsp) -; CHECK-NEXT: movl $4, 16(%rsp) -; CHECK-NEXT: movq $2, (%rsp) -; CHECK: Ltmp -; CHECK-NEXT: movabsq $-559038736, %r11 -; CHECK-NEXT: callq *%r11 - %call = inttoptr i64 -559038736 to i8* - %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10) - ret i64 %result -} - ; Test patchpoints reusing the same TargetConstant. ; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4) ; There is no way to verify this, since it depends on memory allocation. @@ -125,6 +71,17 @@ entry: ret void } +; Test large target address. +define i64 @large_target_address_patchpoint_codegen() { +entry: +; CHECK-LABEL: large_target_address_patchpoint_codegen: +; CHECK: movabsq $6153737369414576827, %r11 +; CHECK-NEXT: callq *%r11 + %resolveCall2 = inttoptr i64 6153737369414576827 to i8* + %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 0) + ret i64 %result +} + declare void @llvm.experimental.stackmap(i64, i32, ...) declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) 
diff --git a/test/CodeGen/X86/peep-vector-extract-concat.ll b/test/CodeGen/X86/peep-vector-extract-concat.ll deleted file mode 100644 index f73ebb9..0000000 --- a/test/CodeGen/X86/peep-vector-extract-concat.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2,-sse4.1 | FileCheck %s -; CHECK: pshufd $3, %xmm0, %xmm0 - -; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2,-sse4.1 | FileCheck %s -check-prefix=WIN64 -; %a is passed indirectly on Win64. -; WIN64: movss 12(%rcx), %xmm0 - -define float @foo(<8 x float> %a) nounwind { - %c = extractelement <8 x float> %a, i32 3 - ret float %c -} diff --git a/test/CodeGen/X86/peep-vector-extract-insert.ll b/test/CodeGen/X86/peep-vector-extract-insert.ll deleted file mode 100644 index f958b6b..0000000 --- a/test/CodeGen/X86/peep-vector-extract-insert.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llc < %s -march=x86-64 | grep "xorps %xmm0, %xmm0" | count 2 - -define float @foo(<4 x float> %a) { - %b = insertelement <4 x float> %a, float 0.0, i32 3 - %c = extractelement <4 x float> %b, i32 3 - ret float %c -} -define float @bar(float %a) { - %b = insertelement <4 x float> <float 0x400B333340000000, float 4.5, float 0.0, float 0x4022666660000000>, float %a, i32 3 - %c = extractelement <4 x float> %b, i32 2 - ret float %c -} diff --git a/test/CodeGen/X86/peephole-fold-movsd.ll b/test/CodeGen/X86/peephole-fold-movsd.ll new file mode 100644 index 0000000..09d9328 --- /dev/null +++ b/test/CodeGen/X86/peephole-fold-movsd.ll @@ -0,0 +1,31 @@ +; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s +; +; Check that x86's peephole optimization doesn't fold a 64-bit load (movsd) into +; addpd. 
+; rdar://problem/18236850 + +%struct.S1 = type { double, double } + +@g = common global %struct.S1 zeroinitializer, align 8 + +declare void @foo3(%struct.S1*) + +; CHECK: movsd {{[0-9]*}}(%rsp), [[R0:%xmm[0-9]+]] +; CHECK: addpd [[R0]], %xmm{{[0-9]+}} + +define void @foo1(double %a.coerce0, double %a.coerce1, double %b.coerce0, double %b.coerce1) { + %1 = alloca <2 x double>, align 16 + %tmpcast = bitcast <2 x double>* %1 to %struct.S1* + call void @foo3(%struct.S1* %tmpcast) #2 + %p2 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 0 + %2 = load double* %p2, align 16 + %p3 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 1 + %3 = load double* %p3, align 8 + %4 = insertelement <2 x double> undef, double %2, i32 0 + %5 = insertelement <2 x double> %4, double 0.000000e+00, i32 1 + %6 = insertelement <2 x double> undef, double %3, i32 1 + %7 = insertelement <2 x double> %6, double 1.000000e+00, i32 0 + %8 = fadd <2 x double> %5, %7 + store <2 x double> %8, <2 x double>* bitcast (%struct.S1* @g to <2 x double>*), align 16 + ret void +} diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 7bf8a61..8937d6a 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -1,32 +1,96 @@ -; RUN: llc < %s -march=x86 -mattr=sse4.1 -mcpu=nehalem -stack-alignment=16 > %t -; RUN: grep pmul %t | count 12 -; RUN: grep mov %t | count 14 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41 define <4 x i32> @a(<4 x i32> %i) nounwind { - %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 > - ret <4 x i32> %A +; SSE2-LABEL: a: +; SSE2: movdqa {{.*}}, %[[X1:xmm[0-9]+]] +; SSE2-NEXT: pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %[[X1]], %xmm0 +; SSE2-NEXT: pmuludq %[[X1]], %[[X2]] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2] +; 
SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSE41-LABEL: a: +; SSE41: pmulld +; SSE41-NEXT: retq +entry: + %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 > + ret <4 x i32> %A } + define <2 x i64> @b(<2 x i64> %i) nounwind { - %A = mul <2 x i64> %i, < i64 117, i64 117 > - ret <2 x i64> %A +; ALL-LABEL: b: +; ALL: pmuludq +; ALL: pmuludq +; ALL: pmuludq +entry: + %A = mul <2 x i64> %i, < i64 117, i64 117 > + ret <2 x i64> %A } + define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind { - %A = mul <4 x i32> %i, %j - ret <4 x i32> %A +; SSE2-LABEL: c: +; SSE2: pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %[[X2]], %xmm1 +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSE41-LABEL: c: +; SSE41: pmulld +; SSE41-NEXT: retq +entry: + %A = mul <4 x i32> %i, %j + ret <4 x i32> %A } + define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind { - %A = mul <2 x i64> %i, %j - ret <2 x i64> %A +; ALL-LABEL: d: +; ALL: pmuludq +; ALL: pmuludq +; ALL: pmuludq +entry: + %A = mul <2 x i64> %i, %j + ret <2 x i64> %A } -; Use a call to force spills. 
+ declare void @foo() + define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind { - call void @foo() - %A = mul <4 x i32> %i, %j - ret <4 x i32> %A +; SSE2-LABEL: e: +; SSE2: movdqa {{[0-9]*}}(%rsp), %xmm0 +; SSE2-NEXT: pshufd {{.*}} # [[X1:xmm[0-9]+]] = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa {{[0-9]*}}(%rsp), %[[X2:xmm[0-9]+]] +; SSE2-NEXT: pmuludq %[[X2]], %xmm0 +; SSE2-NEXT: pshufd {{.*}} # [[X2]] = [[X2]][1,1,3,3] +; SSE2-NEXT: pmuludq %[[X1]], %[[X2]] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: addq ${{[0-9]+}}, %rsp +; SSE2-NEXT: retq +; +; SSE41-LABEL: e: +; SSE41: pmulld {{[0-9]+}}(%rsp), %xmm +; SSE41-NEXT: addq ${{[0-9]+}}, %rsp +; SSE41-NEXT: retq +entry: + ; Use a call to force spills. + call void @foo() + %A = mul <4 x i32> %i, %j + ret <4 x i32> %A } + define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind { - call void @foo() - %A = mul <2 x i64> %i, %j - ret <2 x i64> %A +; ALL-LABEL: f: +; ALL: pmuludq +; ALL: pmuludq +; ALL: pmuludq +entry: + ; Use a call to force spills. 
+ call void @foo() + %A = mul <2 x i64> %i, %j + ret <2 x i64> %A } diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll index e7e29e0..0bdb0ec 100644 --- a/test/CodeGen/X86/pr11334.ll +++ b/test/CodeGen/X86/pr11334.ll @@ -15,7 +15,7 @@ define <3 x double> @v3f2d_ext_vec(<3 x float> %v1) nounwind { entry: ; CHECK: v3f2d_ext_vec ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd ; AVX: v3f2d_ext_vec ; AVX: vcvtps2pd @@ -28,7 +28,7 @@ define <4 x double> @v4f2d_ext_vec(<4 x float> %v1) nounwind { entry: ; CHECK: v4f2d_ext_vec ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd ; AVX: v4f2d_ext_vec ; AVX: vcvtps2pd @@ -42,9 +42,9 @@ entry: ; CHECK: v8f2d_ext_vec ; CHECK: cvtps2pd ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd ; AVX: v8f2d_ext_vec ; AVX: vcvtps2pd diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll deleted file mode 100644 index 024b163..0000000 --- a/test/CodeGen/X86/pr12359.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s -define <16 x i8> @shuf(<16 x i8> %inval1) { -entry: - %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4> - ret <16 x i8> %0 -; CHECK: shuf -; CHECK: # BB#0: # %entry -; CHECK-NEXT: pshufb -; CHECK-NEXT: ret -} diff --git a/test/CodeGen/X86/pr14161.ll b/test/CodeGen/X86/pr14161.ll index ff4532e..c2bb8d3 100644 --- a/test/CodeGen/X86/pr14161.ll +++ b/test/CodeGen/X86/pr14161.ll @@ -3,6 +3,12 @@ declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) define <2 x i16> @good(<4 x i32>*, <4 x i8>*) { +; CHECK-LABEL: good: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movdqa (%rdi), %xmm0 +; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pmovzxwq %xmm0, %xmm0 +; 
CHECK-NEXT: retq entry: %2 = load <4 x i32>* %0, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>) @@ -13,13 +19,17 @@ entry: %8 = bitcast i32 %4 to <2 x i16> %9 = bitcast i32 %5 to <2 x i16> ret <2 x i16> %8 -; CHECK: good -; CHECK: pminud -; CHECK-NEXT: pmovzxwq -; CHECK: ret } define <2 x i16> @bad(<4 x i32>*, <4 x i8>*) { +; CHECK-LABEL: bad: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movdqa (%rdi), %xmm0 +; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pextrd $1, %xmm0, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: pmovzxwq %xmm0, %xmm0 +; CHECK-NEXT: retq entry: %2 = load <4 x i32>* %0, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>) @@ -30,9 +40,4 @@ entry: %8 = bitcast i32 %4 to <2 x i16> %9 = bitcast i32 %5 to <2 x i16> ret <2 x i16> %9 -; CHECK: bad -; CHECK: pminud -; CHECK: pextrd -; CHECK: pmovzxwq -; CHECK: ret } diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll index c8aaf32..b4dc5fd 100644 --- a/test/CodeGen/X86/pr15267.ll +++ b/test/CodeGen/X86/pr15267.ll @@ -48,19 +48,22 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind { ; CHECK: test3 ; CHECK: movzbl -; CHECK: shrl -; CHECK: andl $1 -; CHECK: andl $1 -; CHECK: vmovd -; CHECK: pinsrd $1 -; CHECK: shrl $2 -; CHECK: andl $1 -; CHECK: pinsrd $2 -; CHECK: shrl $3 -; CHECK: andl $1 -; CHECK: pinsrd $3 -; CHECK: pslld -; CHECK: psrad -; CHECK: pmovsxdq -; CHECK: pmovsxdq +; CHECK: movq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: movq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: vpunpcklqdq +; CHECK: movq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: vpunpcklqdq +; CHECK: vinsertf128 ; CHECK: ret diff --git a/test/CodeGen/X86/pr18846.ll b/test/CodeGen/X86/pr18846.ll new file mode 100644 index 0000000..27801be --- /dev/null +++ b/test/CodeGen/X86/pr18846.ll @@ -0,0 
+1,139 @@ +; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; pr18846 - needless avx spill/reload +; Test for unnecessary repeated spills due to eliminateRedundantSpills failing +; to recognise unaligned ymm load/stores to the stack. +; Bugpoint reduced testcase. + +;CHECK-LABEL: _Z16opt_kernel_cachePfS_S_ +;CHECK-NOT: vmovups {{.*#+}} 32-byte Folded Spill +;CHECK-NOT: vmovups {{.*#+}} 32-byte Folded Reload + +; Function Attrs: uwtable +define void @_Z16opt_kernel_cachePfS_S_() #0 { +entry: + br label %for.body29 + +for.body29: ; preds = %for.body29, %entry + br i1 undef, label %for.body29, label %for.body65 + +for.body65: ; preds = %for.body29 + %0 = load float* undef, align 4, !tbaa !1 + %vecinit7.i4448 = insertelement <8 x float> undef, float %0, i32 7 + %1 = load float* null, align 4, !tbaa !1 + %vecinit7.i4304 = insertelement <8 x float> undef, float %1, i32 7 + %2 = load float* undef, align 4, !tbaa !1 + %vecinit7.i4196 = insertelement <8 x float> undef, float %2, i32 7 + %3 = or i64 0, 16 + %add.ptr111.sum4096 = add i64 %3, 0 + %4 = load <8 x float>* null, align 16, !tbaa !5 + %add.ptr162 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr111.sum4096 + %__v.i4158 = bitcast float* %add.ptr162 to <8 x float>* + %5 = load <8 x float>* %__v.i4158, align 16, !tbaa !5 + %add.ptr158.sum40975066 = or i64 %add.ptr111.sum4096, 8 + %add.ptr183 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr158.sum40975066 + %__v.i4162 = bitcast float* %add.ptr183 to <8 x float>* + %6 = load <8 x float>* %__v.i4162, align 16, !tbaa !5 + %add.ptr200.sum40995067 = or i64 undef, 8 + %add.ptr225 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr200.sum40995067 + %__v.i4167 = bitcast float* %add.ptr225 to <8 x float>* + %7 = load <8 x float>* %__v.i4167, align 4, !tbaa !5 + %8 = load <8 x float>* 
undef, align 16, !tbaa !5 + %add.ptr242.sum41015068 = or i64 0, 8 + %add.ptr267 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr242.sum41015068 + %__v.i4171 = bitcast float* %add.ptr267 to <8 x float>* + %9 = load <8 x float>* %__v.i4171, align 4, !tbaa !5 + %mul.i4690 = fmul <8 x float> %7, undef + %add.i4665 = fadd <8 x float> undef, undef + %mul.i4616 = fmul <8 x float> %8, undef + %mul.i4598 = fmul <8 x float> undef, undef + %add.i4597 = fadd <8 x float> undef, %mul.i4598 + %mul.i4594 = fmul <8 x float> %6, undef + %add.i4593 = fadd <8 x float> undef, %mul.i4594 + %mul.i4578 = fmul <8 x float> %9, undef + %add.i4577 = fadd <8 x float> %add.i4593, %mul.i4578 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4577) #1 + %10 = load <8 x float>* null, align 16, !tbaa !5 + %11 = load <8 x float>* undef, align 16, !tbaa !5 + %mul.i4564 = fmul <8 x float> %4, undef + %add.i4563 = fadd <8 x float> %10, %mul.i4564 + %mul.i4560 = fmul <8 x float> %5, undef + %add.i4559 = fadd <8 x float> %11, %mul.i4560 + %add.i4547 = fadd <8 x float> %add.i4563, undef + %mul.i4546 = fmul <8 x float> %7, undef + %add.i4545 = fadd <8 x float> undef, %mul.i4546 + %mul.i4544 = fmul <8 x float> %8, undef + %add.i4543 = fadd <8 x float> %add.i4559, %mul.i4544 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4547) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4545) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4543) #1 + %add.i4455 = fadd <8 x float> undef, undef + %mul.i4454 = fmul <8 x float> undef, undef + %add.i4453 = fadd <8 x float> undef, %mul.i4454 + %mul.i4440 = fmul <8 x float> zeroinitializer, %vecinit7.i4448 + %add.i4439 = fadd <8 x float> %add.i4455, %mul.i4440 + %mul.i4438 = fmul <8 x float> %7, %vecinit7.i4448 + %add.i4437 = fadd <8 x float> %add.i4453, %mul.i4438 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4439) #1 + call void 
@llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4437) #1 + %add.i4413 = fadd <8 x float> zeroinitializer, undef + %mul.i4400 = fmul <8 x float> %8, undef + %add.i4399 = fadd <8 x float> undef, %mul.i4400 + %add.i4397 = fadd <8 x float> %add.i4413, zeroinitializer + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> zeroinitializer) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4399) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4397) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1 + %mul.i4330 = fmul <8 x float> %7, undef + %add.i4329 = fadd <8 x float> undef, %mul.i4330 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4329) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1 + %mul.i4312 = fmul <8 x float> %4, undef + %add.i4311 = fadd <8 x float> undef, %mul.i4312 + %mul.i4306 = fmul <8 x float> %6, undef + %add.i4305 = fadd <8 x float> undef, %mul.i4306 + %add.i4295 = fadd <8 x float> %add.i4311, undef + %mul.i4294 = fmul <8 x float> %7, %vecinit7.i4304 + %add.i4293 = fadd <8 x float> undef, %mul.i4294 + %mul.i4292 = fmul <8 x float> %8, %vecinit7.i4304 + %add.i4291 = fadd <8 x float> undef, %mul.i4292 + %mul.i4290 = fmul <8 x float> %9, %vecinit7.i4304 + %add.i4289 = fadd <8 x float> %add.i4305, %mul.i4290 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4295) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4293) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4291) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4289) #1 + %12 = load <8 x float>* undef, align 16, !tbaa !5 + %mul.i4274 = fmul <8 x float> undef, undef + %add.i4273 = fadd <8 x float> %12, %mul.i4274 + %mul.i4258 = fmul <8 x float> %7, undef + %add.i4257 = fadd <8 x float> %add.i4273, %mul.i4258 
+ %mul.i4254 = fmul <8 x float> %9, undef + %add.i4253 = fadd <8 x float> undef, %mul.i4254 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4257) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4253) #1 + %mul.i = fmul <8 x float> %9, %vecinit7.i4196 + %add.i = fadd <8 x float> undef, %mul.i + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> zeroinitializer) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i) #1 + unreachable +} + +; Function Attrs: nounwind +declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) #1 + +attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind } + +!llvm.ident = !{!0} + +!0 = metadata !{metadata !"clang version 3.5 "} +!1 = metadata !{metadata !2, metadata !2, i64 0} +!2 = metadata !{metadata !"float", metadata !3, i64 0} +!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0} +!4 = metadata !{metadata !"Simple C/C++ TBAA"} +!5 = metadata !{metadata !3, metadata !3, i64 0} diff --git a/test/CodeGen/X86/pr21099.ll b/test/CodeGen/X86/pr21099.ll new file mode 100644 index 0000000..07292c1 --- /dev/null +++ b/test/CodeGen/X86/pr21099.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s -O2 -march=x86-64 -verify-machineinstrs | FileCheck %s + +define void @pr21099(i64* %p) { +; CHECK-LABEL: pr21099 +; CHECK: lock +; CHECK-NEXT: addq $-2147483648 +; This number is INT32_MIN: 0x80000000UL + %1 = atomicrmw add i64* %p, i64 -2147483648 seq_cst + ret void +} diff --git a/test/CodeGen/X86/pr21529.ll b/test/CodeGen/X86/pr21529.ll new file mode 100644 index 0000000..655bc84 --- /dev/null +++ b/test/CodeGen/X86/pr21529.ll @@ -0,0 +1,15 @@ +; RUN: llc -show-mc-encoding < %s | FileCheck %s + +; Test that the direct object emission selects the and variant with 8 bit +; immediate. 
+; We used to get this wrong when using direct object emission, but not when +; reading assembly. + +; CHECK: andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0] + +target triple = "x86_64-pc-linux" + +define void @f() { + %foo = alloca i8, align 32 + ret void +} diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll new file mode 100644 index 0000000..7fc9890 --- /dev/null +++ b/test/CodeGen/X86/pshufb-mask-comments.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -march=x86-64 -mattr=+ssse3 | FileCheck %s + +; Test that the pshufb mask comment is correct. + +define <16 x i8> @test1(<16 x i8> %V) { +; CHECK-LABEL: test1: +; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,0,0,2,0,0,0,0,3,0,0,0,0,4] + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 2, i8 0, i8 0, i8 0, i8 0, i8 3, i8 0, i8 0, i8 0, i8 0, i8 4>) + ret <16 x i8> %1 +} + +; Test that indexes larger than the size of the vector are shown masked (bottom 4 bits). + +define <16 x i8> @test2(<16 x i8> %V) { +; CHECK-LABEL: test2: +; CHECK: pshufb {{.*}}# xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2] + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 15, i8 0, i8 0, i8 0, i8 0, i8 16, i8 0, i8 0, i8 0, i8 0, i8 17, i8 0, i8 0, i8 0, i8 0, i8 50>) + ret <16 x i8> %1 +} + +; Test that indexes with bit seven set are shown as zero. 
+ +define <16 x i8> @test3(<16 x i8> %V) { +; CHECK-LABEL: test3: +; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,15,0,2,0,0],zero,xmm0[0,3,0,0],zero,xmm0[0,4] + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 127, i8 0, i8 2, i8 0, i8 0, i8 128, i8 0, i8 3, i8 0, i8 0, i8 255, i8 0, i8 4>) + ret <16 x i8> %1 +} + +declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone diff --git a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll index d8e4572..49d58f4 100644 --- a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll +++ b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll @@ -2,10 +2,12 @@ ; Without the last chance recoloring, this test fails with: ; "ran out of registers". -; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH +; NOTE: With the fix to PR18883, we don't actually run out of registers here +; any more, and so those checks are disabled. This test remains only for general coverage. 
+; XXX: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH ; Test whether failure due to cutoff for depth is reported -; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF +; XXX: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF ; Test whether failure due to cutoff for interference is reported ; RUN: llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 -lcr-max-depth=0 -exhaustive-register-search < %s > %t 2>&1 diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll new file mode 100644 index 0000000..83b86ac --- /dev/null +++ b/test/CodeGen/X86/recip-fastmath.ll @@ -0,0 +1,109 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+use-recip-est,+avx -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE + +; If the target's divss/divps instructions are substantially +; slower than rcpss/rcpps with a Newton-Raphson refinement, +; we should generate the estimate sequence. + +; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 ) +; for details about the accuracy, speed, and implementation +; differences of x86 reciprocal estimates. 
+ +define float @reciprocal_estimate(float %x) #0 { + %div = fdiv fast float 1.0, %x + ret float %div + +; CHECK-LABEL: reciprocal_estimate: +; CHECK: movss +; CHECK-NEXT: divss +; CHECK-NEXT: movaps +; CHECK-NEXT: retq + +; BTVER2-LABEL: reciprocal_estimate: +; BTVER2: vrcpss +; BTVER2: vmulss +; BTVER2: vsubss +; BTVER2: vmulss +; BTVER2: vaddss +; BTVER2-NEXT: retq + +; REFINE-LABEL: reciprocal_estimate: +; REFINE: vrcpss +; REFINE: vmulss +; REFINE: vsubss +; REFINE: vmulss +; REFINE: vaddss +; REFINE: vmulss +; REFINE: vsubss +; REFINE: vmulss +; REFINE: vaddss +; REFINE-NEXT: retq +} + +define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 { + %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x + ret <4 x float> %div + +; CHECK-LABEL: reciprocal_estimate_v4f32: +; CHECK: movaps +; CHECK-NEXT: divps +; CHECK-NEXT: movaps +; CHECK-NEXT: retq + +; BTVER2-LABEL: reciprocal_estimate_v4f32: +; BTVER2: vrcpps +; BTVER2: vmulps +; BTVER2: vsubps +; BTVER2: vmulps +; BTVER2: vaddps +; BTVER2-NEXT: retq + +; REFINE-LABEL: reciprocal_estimate_v4f32: +; REFINE: vrcpps +; REFINE: vmulps +; REFINE: vsubps +; REFINE: vmulps +; REFINE: vaddps +; REFINE: vmulps +; REFINE: vsubps +; REFINE: vmulps +; REFINE: vaddps +; REFINE-NEXT: retq +} + +define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 { + %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x + ret <8 x float> %div + +; CHECK-LABEL: reciprocal_estimate_v8f32: +; CHECK: movaps +; CHECK: movaps +; CHECK-NEXT: divps +; CHECK-NEXT: divps +; CHECK-NEXT: movaps +; CHECK-NEXT: movaps +; CHECK-NEXT: retq + +; BTVER2-LABEL: reciprocal_estimate_v8f32: +; BTVER2: vrcpps +; BTVER2: vmulps +; BTVER2: vsubps +; BTVER2: vmulps +; BTVER2: vaddps +; BTVER2-NEXT: retq + +; REFINE-LABEL: reciprocal_estimate_v8f32: +; REFINE: vrcpps +; REFINE: vmulps +; REFINE: vsubps +; REFINE: vmulps +; REFINE: vaddps +; REFINE: vmulps +; 
REFINE: vsubps +; REFINE: vmulps +; REFINE: vaddps +; REFINE-NEXT: retq +} + +attributes #0 = { "unsafe-fp-math"="true" } diff --git a/test/CodeGen/X86/return_zeroext_i2.ll b/test/CodeGen/X86/return_zeroext_i2.ll new file mode 100644 index 0000000..d535b0c --- /dev/null +++ b/test/CodeGen/X86/return_zeroext_i2.ll @@ -0,0 +1,7 @@ +; RUN: llc -mtriple=i386-pc-win32 < %s | FileCheck %s +; Check that the testcase does not crash +define zeroext i2 @crash () { + ret i2 0 +} +; CHECK: xorl %eax, %eax +; CHECK-NEXT: retl diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll index b82be41..e34ba54 100644 --- a/test/CodeGen/X86/segmented-stacks-dynamic.ll +++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll @@ -1,7 +1,9 @@ ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=X32ABI ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj ; Just to prevent the alloca from being optimized away declare void @dummy_use(i32*, i32) @@ -61,6 +63,26 @@ false: ; X64-NEXT: callq __morestack_allocate_stack_space ; X64: movq %rax, %rdi +; X32ABI-LABEL: test_basic: + +; X32ABI: cmpl %fs:64, %esp +; X32ABI-NEXT: ja .LBB0_2 + +; X32ABI: movl $24, %r10d +; X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret + +; X32ABI: movl %esp, %[[EDI:edi|eax]] +; X32ABI: subl %{{.*}}, %[[EDI]] +; X32ABI-NEXT: cmpl %[[EDI]], %fs:64 + +; X32ABI: movl %[[EDI]], %esp + +; X32ABI: movl %{{.*}}, %edi +; X32ABI-NEXT: callq __morestack_allocate_stack_space +; X32ABI: movl %eax, %edi + } attributes #0 = { "split-stack" } diff 
--git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll index 9dab3cd..2db7c11 100644 --- a/test/CodeGen/X86/segmented-stacks.ll +++ b/test/CodeGen/X86/segmented-stacks.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=X32ABI ; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X32-Darwin ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin ; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW @@ -9,6 +10,7 @@ ; We used to crash with filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj @@ -51,6 +53,16 @@ define void @test_basic() #0 { ; X64-Linux-NEXT: callq __morestack ; X64-Linux-NEXT: ret +; X32ABI-LABEL: test_basic: + +; X32ABI: cmpl %fs:64, %esp +; X32ABI-NEXT: ja .LBB0_2 + +; X32ABI: movl $40, %r10d +; X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret + ; X32-Darwin-LABEL: test_basic: ; X32-Darwin: movl $432, %ecx @@ -129,6 +141,16 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 { ; X64-Linux-NEXT: ret ; X64-Linux-NEXT: movq %rax, %r10 +; X32ABI: cmpl %fs:64, %esp +; X32ABI-NEXT: ja .LBB1_2 + +; X32ABI: movl %r10d, %eax +; X32ABI-NEXT: movl $56, %r10d +; 
X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret +; X32ABI-NEXT: movq %rax, %r10 + ; X32-Darwin: movl $432, %edx ; X32-Darwin-NEXT: cmpl %gs:(%edx), %esp ; X32-Darwin-NEXT: ja LBB1_2 @@ -202,6 +224,15 @@ define void @test_large() #0 { ; X64-Linux-NEXT: callq __morestack ; X64-Linux-NEXT: ret +; X32ABI: leal -40008(%rsp), %r11d +; X32ABI-NEXT: cmpl %fs:64, %r11d +; X32ABI-NEXT: ja .LBB2_2 + +; X32ABI: movl $40008, %r10d +; X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret + ; X32-Darwin: leal -40012(%esp), %ecx ; X32-Darwin-NEXT: movl $432, %eax ; X32-Darwin-NEXT: cmpl %gs:(%eax), %ecx @@ -276,6 +307,16 @@ define fastcc void @test_fastcc() #0 { ; X64-Linux-NEXT: callq __morestack ; X64-Linux-NEXT: ret +; X32ABI-LABEL: test_fastcc: + +; X32ABI: cmpl %fs:64, %esp +; X32ABI-NEXT: ja .LBB3_2 + +; X32ABI: movl $40, %r10d +; X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret + ; X32-Darwin-LABEL: test_fastcc: ; X32-Darwin: movl $432, %eax @@ -356,6 +397,17 @@ define fastcc void @test_fastcc_large() #0 { ; X64-Linux-NEXT: callq __morestack ; X64-Linux-NEXT: ret +; X32ABI-LABEL: test_fastcc_large: + +; X32ABI: leal -40008(%rsp), %r11d +; X32ABI-NEXT: cmpl %fs:64, %r11d +; X32ABI-NEXT: ja .LBB4_2 + +; X32ABI: movl $40008, %r10d +; X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret + ; X32-Darwin-LABEL: test_fastcc_large: ; X32-Darwin: leal -40012(%esp), %eax @@ -446,6 +498,9 @@ define void @test_nostack() #0 { ; X64-Linux-LABEL: test_nostack: ; X32-Linux-NOT: callq __morestack +; X32ABI-LABEL: test_nostack: +; X32ABI-NOT: callq __morestack + ; X32-Darwin-LABEL: test_nostack: ; X32-Darwin-NOT: calll __morestack diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll index cdd258d..7e6f153 100644 --- a/test/CodeGen/X86/select.ll +++ b/test/CodeGen/X86/select.ll @@ -357,3 +357,47 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) 
nounwind { ; ATOM: cmpl $15, %edi ; ATOM: cmovgel %edx } + +; CHECK-LABEL: @trunc_select_miscompile +; CHECK-NOT: sarb +define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) { + %tmp1 = select i1 %cc, i32 3, i32 2 + %tmp2 = shl i32 %a, %tmp1 + ret i32 %tmp2 +} + +define void @test19() { +; This is a massive reduction of an llvm-stress test case that generates +; interesting chains feeding setcc and eventually a f32 select operation. This +; is intended to exercise the SELECT formation in the DAG combine simplifying +; a simplified select_cc node. If it it regresses and is no longer triggering +; that code path, it can be deleted. +; +; CHECK-LABEL: @test19 +; CHECK: testb +; CHECK: cmpl +; CHECK: ucomiss + +BB: + br label %CF + +CF: + %Cmp10 = icmp ule i8 undef, undef + br i1 %Cmp10, label %CF, label %CF250 + +CF250: + %E12 = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 2 + %Cmp32 = icmp ugt i1 %Cmp10, false + br i1 %Cmp32, label %CF, label %CF242 + +CF242: + %Cmp38 = icmp uge i32 %E12, undef + %FC = uitofp i1 %Cmp38 to float + %Sl59 = select i1 %Cmp32, float %FC, float undef + %Cmp60 = fcmp ugt float undef, undef + br i1 %Cmp60, label %CF242, label %CF244 + +CF244: + %B122 = fadd float %Sl59, undef + ret void +} diff --git a/test/CodeGen/X86/sext-i1.ll b/test/CodeGen/X86/sext-i1.ll index 64de0ae..1a575db 100644 --- a/test/CodeGen/X86/sext-i1.ll +++ b/test/CodeGen/X86/sext-i1.ll @@ -61,3 +61,36 @@ if.end: ; preds = %if.then, %entry %xor27 = xor i32 undef, %cond ; <i32> [#uses=0] ret i32 0 } + +define i32 @t4(i64 %x) nounwind readnone ssp { +entry: +; 32-LABEL: t4: +; 32: movl +; 32: orl +; 32: movl +; 32: je +; 32: xorl + +; 64-LABEL: t4: +; 64: cmpq $1 +; 64: sbbl + %0 = icmp eq i64 %x, 0 + %1 = sext i1 %0 to i32 + ret i32 %1 +} + +define i64 @t5(i32 %x) nounwind readnone ssp { +entry: +; 32-LABEL: t5: +; 32: cmpl $1 +; 32: sbbl +; 32: movl + +; 64-LABEL: t5: +; 64: cmpl $1 +; 64: sbbq + %0 = icmp eq i32 %x, 0 + %1 = sext i1 %0 to i64 + ret 
i64 %1 +} + diff --git a/test/CodeGen/X86/shift-parts.ll b/test/CodeGen/X86/shift-parts.ll index ddad307..763da63 100644 --- a/test/CodeGen/X86/shift-parts.ll +++ b/test/CodeGen/X86/shift-parts.ll @@ -7,13 +7,13 @@ ; CHECK: shrdq -define i32 @int87(i32 %uint64p_8) nounwind { +define i32 @int87(i32 %uint64p_8, i1 %cond) nounwind { entry: %srcval4 = load i320* bitcast (%0* @g_144 to i320*), align 8 ; <i320> [#uses=1] br label %for.cond for.cond: ; preds = %for.cond, %entry - %call3.in.in.in.v = select i1 undef, i320 192, i320 128 ; <i320> [#uses=1] + %call3.in.in.in.v = select i1 %cond, i320 192, i320 128 ; <i320> [#uses=1] %call3.in.in.in = lshr i320 %srcval4, %call3.in.in.in.v ; <i320> [#uses=1] %call3.in = trunc i320 %call3.in.in.in to i32 ; <i32> [#uses=1] %tobool = icmp eq i32 %call3.in, 0 ; <i1> [#uses=1] diff --git a/test/CodeGen/X86/shuffle-combine-crash.ll b/test/CodeGen/X86/shuffle-combine-crash.ll new file mode 100644 index 0000000..6ab7b97 --- /dev/null +++ b/test/CodeGen/X86/shuffle-combine-crash.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 + +; Verify that DAGCombiner does not crash when checking if it is +; safe to fold the shuffles in function @sample_test according to rule +; (shuffle (shuffle A, Undef, M0), Undef, M1) -> (shuffle A, Undef, M2) +; +; The DAGCombiner avoids folding shuffles if +; the resulting shuffle dag node is not legal for the target. +; That means, the shuffle must have legal type and legal mask. +; +; Before, the DAGCombiner forgot to check if the resulting shuffle +; was legal. It instead just called method +; 'X86TargetLowering::isShuffleMaskLegal'; however, that was not enough since +; that method always expect to have a valid vector type in input. +; As a consequence, compiling the function below would have caused a crash. 
+ +define void @sample_test() { + br i1 undef, label %5, label %1 + +; <label>:1 ; preds = %0 + %2 = load <4 x i8>* undef + %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <4 x i32> <i32 2, i32 2, i32 0, i32 0> + %4 = shufflevector <4 x i8> %3, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> + store <4 x i8> %4, <4 x i8>* undef + br label %5 + +; <label>:5 ; preds = %1, %0 + ret void +} + diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll index 2dc8816..1e34a2b 100644 --- a/test/CodeGen/X86/sincos-opt.ll +++ b/test/CodeGen/X86/sincos-opt.ll @@ -15,7 +15,8 @@ entry: ; OSX_SINCOS-LABEL: test1: ; OSX_SINCOS: callq ___sincosf_stret -; OSX_SINCOS: pshufd $1, %xmm0, %xmm1 +; OSX_SINCOS: movaps %xmm0, %xmm1 +; OSX_SINCOS: shufps {{.*}} ## xmm1 = xmm1[1,1,2,3] ; OSX_SINCOS: addss %xmm0, %xmm1 ; OSX_NOOPT: test1 diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll new file mode 100644 index 0000000..6e3a003 --- /dev/null +++ b/test/CodeGen/X86/sink-blockfreq.ll @@ -0,0 +1,45 @@ +; RUN: llc -disable-machine-licm -machine-sink-bfi=true -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_BFI +; RUN: llc -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI + +; Test that by changing BlockFrequencyInfo we change the order in which +; machine-sink looks for sucessor blocks. By not using BFI, both G and B +; have the same loop depth and no instructions is sinked - B is selected but +; can't be used as to avoid breaking a non profitable critical edge. By using +; BFI, "mul" is sinked into the less frequent block G. 
+define i32 @sink_freqinfo(i32 %a, i32 %b) nounwind uwtable ssp { +; MSINK_BFI-LABEL: sink_freqinfo +; MSINK_BFI: jl +; MSINK_BFI-NEXT: ## BB# +; MSINK_BFI-NEXT: imull + +; MSINK_NOBFI-LABEL: sink_freqinfo +; MSINK_NOBFI: imull +; MSINK_NOBFI: jl +entry: + br label %B + +B: + %ee = phi i32 [ 0, %entry ], [ %inc, %F ] + %xx = sub i32 %a, %ee + %cond0 = icmp slt i32 %xx, 0 + br i1 %cond0, label %F, label %exit, !prof !0 + +F: + %inc = add nsw i32 %xx, 2 + %aa = mul nsw i32 %b, %inc + %exitcond = icmp slt i32 %inc, %a + br i1 %exitcond, label %B, label %G, !prof !1 + +G: + %ii = add nsw i32 %aa, %a + %ll = add i32 %b, 45 + %exitcond2 = icmp sge i32 %ii, %b + br i1 %exitcond2, label %G, label %exit, !prof !2 + +exit: + ret i32 0 +} + +!0 = metadata !{metadata !"branch_weights", i32 4, i32 1} +!1 = metadata !{metadata !"branch_weights", i32 128, i32 1} +!2 = metadata !{metadata !"branch_weights", i32 1, i32 1} diff --git a/test/CodeGen/X86/sink-out-of-loop.ll b/test/CodeGen/X86/sink-out-of-loop.ll index c600f92..6757f31 100644 --- a/test/CodeGen/X86/sink-out-of-loop.ll +++ b/test/CodeGen/X86/sink-out-of-loop.ll @@ -5,7 +5,7 @@ ; MOV32ri outside the loop. 
; rdar://11980766 define i32 @sink_succ(i32 %argc, i8** nocapture %argv) nounwind uwtable ssp { -; CHECK: sink_succ +; CHECK-LABEL: sink_succ ; CHECK: [[OUTER_LN1:LBB0_[0-9]+]]: ## %preheader ; CHECK: %exit ; CHECK-NOT: movl @@ -52,3 +52,24 @@ for.body2: for.end20: ret i32 0 } + +define i32 @sink_out_of_loop(i32 %n, i32* %output) { +; CHECK-LABEL: sink_out_of_loop: +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i2, %loop ] + %j = mul i32 %i, %i + %addr = getelementptr i32* %output, i32 %i + store i32 %i, i32* %addr + %i2 = add i32 %i, 1 + %exit_cond = icmp sge i32 %i2, %n + br i1 %exit_cond, label %exit, label %loop + +exit: +; CHECK: BB#2 +; CHECK: imull %eax, %eax +; CHECK: retq + ret i32 %j +} diff --git a/test/CodeGen/X86/slow-incdec.ll b/test/CodeGen/X86/slow-incdec.ll new file mode 100644 index 0000000..541d992 --- /dev/null +++ b/test/CodeGen/X86/slow-incdec.ll @@ -0,0 +1,80 @@ +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-slow-incdec < %s | FileCheck -check-prefix=INCDEC %s +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+slow-incdec < %s | FileCheck -check-prefix=ADD %s + +; check -mattr=-slow-incdec +; INCDEC-NOT: addl $-1 +; INCDEC: dec +; INCDEC-NOT: addl $1 +; INCDEC: inc + +; check -mattr=+slow-incdec +; ADD: addl $-1 +; ADD-NOT: dec +; ADD: addl $1 +; ADD-NOT: inc + +; Function Attrs: nounwind readonly +define i32 @slow_1(i32* nocapture readonly %a, i32 %s) #0 { +entry: + %cmp5 = icmp eq i32 %s, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %for.end.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + %i.06 = phi i32 [ %dec, %for.cond ], [ %s, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %a, i32 %i.06 + %0 = load i32* %arrayidx, align 4, !tbaa !1 + %cmp1 = icmp eq i32 %0, 0 +; + %dec = add nsw i32 %i.06, -1 + br i1 %cmp1, 
label %for.end.loopexit, label %for.cond + +for.end.loopexit: ; preds = %for.cond, %for.body + %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Function Attrs: nounwind readonly +define i32 @slow_2(i32* nocapture readonly %a, i32 %s) #0 { +entry: + %cmp5 = icmp eq i32 %s, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i32 %inc, 0 + br i1 %cmp, label %for.end.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + %i.06 = phi i32 [ %inc, %for.cond ], [ %s, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %a, i32 %i.06 + %0 = load i32* %arrayidx, align 4, !tbaa !1 + %cmp1 = icmp eq i32 %0, 0 + %inc = add nsw i32 %i.06, 1 + br i1 %cmp1, label %for.end.loopexit, label %for.cond + +for.end.loopexit: ; preds = %for.cond, %for.body + %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ] + ret i32 %i.0.lcssa +} + +!1 = metadata !{metadata !2, metadata !2, i64 0} +!2 = metadata !{metadata !"int", metadata !3, i64 0} +!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0} +!4 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll new file mode 100644 index 0000000..c052ad2 --- /dev/null +++ b/test/CodeGen/X86/splat-for-size.ll @@ -0,0 +1,141 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX2 + +; Check constant loads of 
every 128-bit and 256-bit vector type +; for size optimization using splat ops available with AVX and AVX2. + +; There is no AVX broadcast from double to 128-bit vector because movddup has been around since SSE3 (grrr). +define <2 x double> @splat_v2f64(<2 x double> %x) #0 { + %add = fadd <2 x double> %x, <double 1.0, double 1.0> + ret <2 x double> %add +; CHECK-LABEL: splat_v2f64 +; CHECK: vmovddup +; CHECK: vaddpd +; CHECK-NEXT: retq +} + +define <4 x double> @splat_v4f64(<4 x double> %x) #0 { + %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0> + ret <4 x double> %add +; CHECK-LABEL: splat_v4f64 +; CHECK: vbroadcastsd +; CHECK-NEXT: vaddpd +; CHECK-NEXT: retq +} + +define <4 x float> @splat_v4f32(<4 x float> %x) #0 { + %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0> + ret <4 x float> %add +; CHECK-LABEL: splat_v4f32 +; CHECK: vbroadcastss +; CHECK-NEXT: vaddps +; CHECK-NEXT: retq +} + +define <8 x float> @splat_v8f32(<8 x float> %x) #0 { + %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0> + ret <8 x float> %add +; CHECK-LABEL: splat_v8f32 +; CHECK: vbroadcastss +; CHECK-NEXT: vaddps +; CHECK-NEXT: retq +} + +; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value. +; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq. +define <2 x i64> @splat_v2i64(<2 x i64> %x) #0 { + %add = add <2 x i64> %x, <i64 1, i64 1> + ret <2 x i64> %add +; CHECK-LABEL: splat_v2i64 +; CHECK: vmovddup +; CHECK: vpaddq +; CHECK-NEXT: retq +} + +; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors, +; and then we fake it: use vmovddup to splat 64-bit value. 
+define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 { + %add = add <4 x i64> %x, <i64 1, i64 1, i64 1, i64 1> + ret <4 x i64> %add +; CHECK-LABEL: splat_v4i64 +; AVX: vmovddup +; AVX: vpaddq +; AVX: vpaddq +; AVX2: vpbroadcastq +; AVX2: vpaddq +; CHECK: retq +} + +; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value. +define <4 x i32> @splat_v4i32(<4 x i32> %x) #0 { + %add = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %add +; CHECK-LABEL: splat_v4i32 +; AVX: vbroadcastss +; AVX2: vpbroadcastd +; CHECK-NEXT: vpaddd +; CHECK-NEXT: retq +} + +; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value. +define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 { + %add = add <8 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + ret <8 x i32> %add +; CHECK-LABEL: splat_v8i32 +; AVX: vbroadcastss +; AVX: vpaddd +; AVX: vpaddd +; AVX2: vpbroadcastd +; AVX2: vpaddd +; CHECK: retq +} + +; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc? +define <8 x i16> @splat_v8i16(<8 x i16> %x) #0 { + %add = add <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + ret <8 x i16> %add +; CHECK-LABEL: splat_v8i16 +; AVX-NOT: broadcast +; AVX2: vpbroadcastw +; CHECK: vpaddw +; CHECK-NEXT: retq +} + +; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc? +define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 { + %add = add <16 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + ret <16 x i16> %add +; CHECK-LABEL: splat_v16i16 +; AVX-NOT: broadcast +; AVX: vpaddw +; AVX: vpaddw +; AVX2: vpbroadcastw +; AVX2: vpaddw +; CHECK: retq +} + +; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc? 
+define <16 x i8> @splat_v16i8(<16 x i8> %x) #0 { + %add = add <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> + ret <16 x i8> %add +; CHECK-LABEL: splat_v16i8 +; AVX-NOT: broadcast +; AVX2: vpbroadcastb +; CHECK: vpaddb +; CHECK-NEXT: retq +} + +; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc? +define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 { + %add = add <32 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> + ret <32 x i8> %add +; CHECK-LABEL: splat_v32i8 +; AVX-NOT: broadcast +; AVX: vpaddb +; AVX: vpaddb +; AVX2: vpbroadcastb +; AVX2: vpaddb +; CHECK: retq +} + +attributes #0 = { optsize } diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll deleted file mode 100644 index 4d59b9c..0000000 --- a/test/CodeGen/X86/splat-scalar-load.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -mcpu=nehalem | FileCheck %s -; rdar://7434544 - -define <2 x i64> @t2() nounwind { -entry: -; CHECK-LABEL: t2: -; CHECK: pshufd $85, (%esp), %xmm0 - %array = alloca [8 x float], align 4 - %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 1 - %tmp2 = load float* %arrayidx - %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0 - %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1 - %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2 - %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3 - %0 = bitcast <4 x float> %vecinit9 to <2 x i64> - ret <2 x i64> %0 -} diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll index fc79e31..24b175e 100644 --- a/test/CodeGen/X86/sqrt-fastmath.ll +++ b/test/CodeGen/X86/sqrt-fastmath.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s 
-mcpu=core2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 ; generated using "clang -S -O2 -ffast-math -emit-llvm sqrt.c" from ; #include <math.h> @@ -52,9 +53,80 @@ entry: ret x86_fp80 %call } -; Function Attrs: nounwind readnone declare x86_fp80 @__sqrtl_finite(x86_fp80) #1 +declare float @llvm.sqrt.f32(float) #1 +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1 +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1 + +; If the target's sqrtss and divss instructions are substantially +; slower than rsqrtss with a Newton-Raphson refinement, we should +; generate the estimate sequence. + +define float @reciprocal_square_root(float %x) #0 { + %sqrt = tail call float @llvm.sqrt.f32(float %x) + %div = fdiv fast float 1.0, %sqrt + ret float %div + +; CHECK-LABEL: reciprocal_square_root: +; CHECK: sqrtss +; CHECK-NEXT: movss +; CHECK-NEXT: divss +; CHECK-NEXT: retq +; BTVER2-LABEL: reciprocal_square_root: +; BTVER2: vrsqrtss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vaddss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: retq +} + +define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 { + %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) + %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt + ret <4 x float> %div + +; CHECK-LABEL: reciprocal_square_root_v4f32: +; CHECK: sqrtps +; CHECK-NEXT: movaps +; CHECK-NEXT: divps +; CHECK-NEXT: retq +; BTVER2-LABEL: reciprocal_square_root_v4f32: +; BTVER2: vrsqrtps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vaddps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: retq +} + +define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 { + %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x) + %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, 
float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt + ret <8 x float> %div + +; CHECK-LABEL: reciprocal_square_root_v8f32: +; CHECK: sqrtps +; CHECK-NEXT: sqrtps +; CHECK-NEXT: movaps +; CHECK-NEXT: movaps +; CHECK-NEXT: divps +; CHECK-NEXT: divps +; CHECK-NEXT: retq +; BTVER2-LABEL: reciprocal_square_root_v8f32: +; BTVER2: vrsqrtps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vaddps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: retq +} + + attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #2 = { nounwind readnone } diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll index 2351fd6..396da0f 100644 --- a/test/CodeGen/X86/sse-align-12.ll +++ b/test/CodeGen/X86/sse-align-12.ll @@ -1,9 +1,11 @@ -; RUN: llc < %s -march=x86-64 -mcpu=nehalem | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=nehalem | FileCheck %s -; CHECK-LABEL: a: -; CHECK: movdqu -; CHECK: pshufd define <4 x float> @a(<4 x float>* %y) nounwind { +; CHECK-LABEL: a: +; CHECK: # BB#0: +; CHECK-NEXT: movups (%rdi), %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: retq %x = load <4 x float>* %y, align 4 %a = extractelement <4 x float> %x, i32 0 %b = extractelement <4 x float> %x, i32 1 @@ -16,10 +18,12 @@ define <4 x float> @a(<4 x float>* %y) nounwind { ret <4 x float> %s } -; CHECK-LABEL: b: -; CHECK: movups -; CHECK: unpckhps define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind { +; CHECK-LABEL: b: +; CHECK: # BB#0: +; CHECK-NEXT: movups (%rdi), %xmm1 +; CHECK-NEXT: unpckhps {{.*#+}} xmm0 = 
xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: retq %x = load <4 x float>* %y, align 4 %a = extractelement <4 x float> %x, i32 2 %b = extractelement <4 x float> %x, i32 3 @@ -32,10 +36,12 @@ define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind { ret <4 x float> %s } -; CHECK-LABEL: c: -; CHECK: movupd -; CHECK: shufpd define <2 x double> @c(<2 x double>* %y) nounwind { +; CHECK-LABEL: c: +; CHECK: # BB#0: +; CHECK-NEXT: movupd (%rdi), %xmm0 +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: retq %x = load <2 x double>* %y, align 8 %a = extractelement <2 x double> %x, i32 0 %c = extractelement <2 x double> %x, i32 1 @@ -44,10 +50,12 @@ define <2 x double> @c(<2 x double>* %y) nounwind { ret <2 x double> %r } -; CHECK-LABEL: d: -; CHECK: movupd -; CHECK: unpckhpd define <2 x double> @d(<2 x double>* %y, <2 x double> %z) nounwind { +; CHECK-LABEL: d: +; CHECK: # BB#0: +; CHECK-NEXT: movupd (%rdi), %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: retq %x = load <2 x double>* %y, align 8 %a = extractelement <2 x double> %x, i32 1 %c = extractelement <2 x double> %z, i32 1 diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll index 5122c44..da36a42 100644 --- a/test/CodeGen/X86/sse-minmax.ll +++ b/test/CodeGen/X86/sse-minmax.ll @@ -138,8 +138,7 @@ define double @ole_inverse(double %x, double %y) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: ogt_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ogt_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -157,8 +156,7 @@ define double @ogt_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: olt_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: olt_x: ; FINITE-NEXT: 
xorp{{[sd]}} %xmm1, %xmm1 @@ -177,8 +175,7 @@ define double @olt_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: ogt_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ogt_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -198,8 +195,7 @@ define double @ogt_inverse_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: olt_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: olt_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -217,8 +213,7 @@ define double @olt_inverse_x(double %x) nounwind { ; CHECK-NEXT: andpd ; UNSAFE-LABEL: oge_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: oge_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -235,8 +230,7 @@ define double @oge_x(double %x) nounwind { ; CHECK-NEXT: andpd ; UNSAFE-LABEL: ole_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ole_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -253,8 +247,7 @@ define double @ole_x(double %x) nounwind { ; CHECK-NEXT: andnpd ; UNSAFE-LABEL: oge_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: oge_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -271,8 +264,7 @@ define double @oge_inverse_x(double %x) nounwind { ; CHECK: cmplesd %xmm ; UNSAFE-LABEL: ole_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, 
%xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ole_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -412,8 +404,7 @@ define double @ule_inverse(double %x, double %y) nounwind { ; CHECK-NEXT: andpd ; UNSAFE-LABEL: ugt_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ugt_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -430,8 +421,7 @@ define double @ugt_x(double %x) nounwind { ; CHECK-NEXT: andpd ; UNSAFE-LABEL: ult_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ult_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -448,8 +438,7 @@ define double @ult_x(double %x) nounwind { ; CHECK-NEXT: andnpd ; UNSAFE-LABEL: ugt_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ugt_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -467,8 +456,7 @@ define double @ugt_inverse_x(double %x) nounwind { ; CHECK-NEXT: andnpd ; UNSAFE-LABEL: ult_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ult_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -488,8 +476,7 @@ define double @ult_inverse_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: uge_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: uge_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -508,8 +495,7 @@ define double 
@uge_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: ule_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ule_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -527,8 +513,7 @@ define double @ule_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: uge_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: uge_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -547,8 +532,7 @@ define double @uge_inverse_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: ule_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ule_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 diff --git a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll deleted file mode 100644 index 600ee1b..0000000 --- a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll +++ /dev/null @@ -1,423 +0,0 @@ -; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s -; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s -; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s - -; Ensure that the backend selects SSE/AVX scalar fp instructions -; from a packed fp instrution plus a vector insert. 
- - -define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { - %1 = fadd <4 x float> %a, %b - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test_add_ss -; SSE2: addss %xmm1, %xmm0 -; AVX: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { - %1 = fsub <4 x float> %a, %b - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test_sub_ss -; SSE2: subss %xmm1, %xmm0 -; AVX: vsubss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { - %1 = fmul <4 x float> %a, %b - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test_mul_ss -; SSE2: mulss %xmm1, %xmm0 -; AVX: vmulss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { - %1 = fdiv <4 x float> %a, %b - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test_div_ss -; SSE2: divss %xmm1, %xmm0 -; AVX: vdivss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { - %1 = fadd <2 x double> %a, %b - %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test_add_sd -; SSE2: addsd %xmm1, %xmm0 -; AVX: vaddsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { - %1 = fsub <2 x double> %a, %b - %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test_sub_sd -; SSE2: subsd %xmm1, %xmm0 -; AVX: 
vsubsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { - %1 = fmul <2 x double> %a, %b - %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test_mul_sd -; SSE2: mulsd %xmm1, %xmm0 -; AVX: vmulsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { - %1 = fdiv <2 x double> %a, %b - %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test_div_sd -; SSE2: divsd %xmm1, %xmm0 -; AVX: vdivsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) { - %1 = fadd <4 x float> %b, %a - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test2_add_ss -; SSE2: addss %xmm0, %xmm1 -; AVX: vaddss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) { - %1 = fsub <4 x float> %b, %a - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test2_sub_ss -; SSE2: subss %xmm0, %xmm1 -; AVX: vsubss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) { - %1 = fmul <4 x float> %b, %a - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test2_mul_ss -; SSE2: mulss %xmm0, %xmm1 -; AVX: vmulss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) { - %1 = fdiv <4 x float> %b, %a - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - 
-; CHECK-LABEL: test2_div_ss -; SSE2: divss %xmm0, %xmm1 -; AVX: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) { - %1 = fadd <2 x double> %b, %a - %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test2_add_sd -; SSE2: addsd %xmm0, %xmm1 -; AVX: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) { - %1 = fsub <2 x double> %b, %a - %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test2_sub_sd -; SSE2: subsd %xmm0, %xmm1 -; AVX: vsubsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) { - %1 = fmul <2 x double> %b, %a - %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test2_mul_sd -; SSE2: mulsd %xmm0, %xmm1 -; AVX: vmulsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) { - %1 = fdiv <2 x double> %b, %a - %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test2_div_sd -; SSE2: divsd %xmm0, %xmm1 -; AVX: vdivsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <4 x float> @test3_add_ss(<4 x float> %a, <4 x float> %b) { - %1 = fadd <4 x float> %a, %b - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test3_add_ss -; SSE2: addss %xmm1, %xmm0 -; AVX: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test3_sub_ss(<4 x float> %a, <4 x float> %b) { - %1 = fsub <4 x float> %a, %b - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, 
<4 x float> %a, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test3_sub_ss -; SSE2: subss %xmm1, %xmm0 -; AVX: vsubss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test3_mul_ss(<4 x float> %a, <4 x float> %b) { - %1 = fmul <4 x float> %a, %b - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test3_mul_ss -; SSE2: mulss %xmm1, %xmm0 -; AVX: vmulss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test3_div_ss(<4 x float> %a, <4 x float> %b) { - %1 = fdiv <4 x float> %a, %b - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test3_div_ss -; SSE2: divss %xmm1, %xmm0 -; AVX: vdivss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <2 x double> @test3_add_sd(<2 x double> %a, <2 x double> %b) { - %1 = fadd <2 x double> %a, %b - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test3_add_sd -; SSE2: addsd %xmm1, %xmm0 -; AVX: vaddsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test3_sub_sd(<2 x double> %a, <2 x double> %b) { - %1 = fsub <2 x double> %a, %b - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test3_sub_sd -; SSE2: subsd %xmm1, %xmm0 -; AVX: vsubsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test3_mul_sd(<2 x double> %a, <2 x double> %b) { - %1 = fmul <2 x double> %a, %b - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test3_mul_sd -; SSE2: mulsd %xmm1, %xmm0 -; AVX: vmulsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test3_div_sd(<2 x double> %a, <2 x double> %b) { - %1 = fdiv <2 x double> %a, %b - 
%2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test3_div_sd -; SSE2: divsd %xmm1, %xmm0 -; AVX: vdivsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <4 x float> @test4_add_ss(<4 x float> %a, <4 x float> %b) { - %1 = fadd <4 x float> %b, %a - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test4_add_ss -; SSE2: addss %xmm0, %xmm1 -; AVX: vaddss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test4_sub_ss(<4 x float> %a, <4 x float> %b) { - %1 = fsub <4 x float> %b, %a - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test4_sub_ss -; SSE2: subss %xmm0, %xmm1 -; AVX: vsubss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test4_mul_ss(<4 x float> %a, <4 x float> %b) { - %1 = fmul <4 x float> %b, %a - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test4_mul_ss -; SSE2: mulss %xmm0, %xmm1 -; AVX: vmulss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test4_div_ss(<4 x float> %a, <4 x float> %b) { - %1 = fdiv <4 x float> %b, %a - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test4_div_ss -; SSE2: divss %xmm0, %xmm1 -; AVX: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <2 x double> @test4_add_sd(<2 x double> %a, <2 x double> %b) { - %1 = fadd <2 x double> %b, %a - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test4_add_sd -; SSE2: addsd %xmm0, %xmm1 -; AVX: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test4_sub_sd(<2 
x double> %a, <2 x double> %b) { - %1 = fsub <2 x double> %b, %a - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test4_sub_sd -; SSE2: subsd %xmm0, %xmm1 -; AVX: vsubsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test4_mul_sd(<2 x double> %a, <2 x double> %b) { - %1 = fmul <2 x double> %b, %a - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test4_mul_sd -; SSE2: mulsd %xmm0, %xmm1 -; AVX: vmulsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test4_div_sd(<2 x double> %a, <2 x double> %b) { - %1 = fdiv <2 x double> %b, %a - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test4_div_sd -; SSE2: divsd %xmm0, %xmm1 -; AVX: vdivsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll index 3949a83..b122ef6 100644 --- a/test/CodeGen/X86/sse-scalar-fp-arith.ll +++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -1,13 +1,23 @@ -; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s -; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s -; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s +; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s +; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s +; RUN: llc -mcpu=x86-64 -mattr=+avx < %s | FileCheck --check-prefix=AVX %s + +target triple = "x86_64-unknown-unknown" ; Ensure that the backend no longer emits unnecessary vector insert ; instructions immediately after SSE scalar fp instructions ; 
like addss or mulss. - define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %add = fadd float %2, %1 @@ -15,14 +25,16 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_add_ss -; SSE2: addss %xmm1, %xmm0 -; AVX: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %sub = fsub float %2, %1 @@ -30,13 +42,16 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_sub_ss -; SSE2: subss %xmm1, %xmm0 -; AVX: vsubss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %mul = fmul float %2, %1 @@ -44,14 +59,16 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_mul_ss -; SSE2: mulss %xmm1, %xmm0 -; AVX: vmulss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: 
retq +; +; AVX-LABEL: test_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %div = fdiv float %2, %1 @@ -59,14 +76,16 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_div_ss -; SSE2: divss %xmm1, %xmm0 -; AVX: vdivss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %add = fadd double %2, %1 @@ -74,14 +93,16 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test_add_sd -; SSE2: addsd %xmm1, %xmm0 -; AVX: vaddsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %sub = fsub double %2, %1 @@ -89,14 +110,16 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test_sub_sd -; SSE2: subsd %xmm1, %xmm0 -; AVX: vsubsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %b, i32 0 
%2 = extractelement <2 x double> %a, i32 0 %mul = fmul double %2, %1 @@ -104,14 +127,16 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test_mul_sd -; SSE2: mulsd %xmm1, %xmm0 -; AVX: vmulsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test_div_sd: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %div = fdiv double %2, %1 @@ -119,14 +144,17 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test_div_sd -; SSE2: divsd %xmm1, %xmm0 -; AVX: vdivsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test2_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %add = fadd float %1, %2 @@ -134,14 +162,17 @@ define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test2_add_ss -; SSE2: addss %xmm0, %xmm1 -; AVX: vaddss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test2_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %sub = fsub float %2, %1 @@ -149,14 
+180,17 @@ define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test2_sub_ss -; SSE2: subss %xmm0, %xmm1 -; AVX: vsubss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test2_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %mul = fmul float %1, %2 @@ -164,14 +198,17 @@ define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test2_mul_ss -; SSE2: mulss %xmm0, %xmm1 -; AVX: vmulss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test2_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %div = fdiv float %2, %1 @@ -179,14 +216,17 @@ define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test2_div_ss -; SSE2: divss %xmm0, %xmm1 -; AVX: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test2_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %add = fadd double %1, %2 @@ -194,14 +234,17 @@ define <2 x double> @test2_add_sd(<2 x 
double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test2_add_sd -; SSE2: addsd %xmm0, %xmm1 -; AVX: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test2_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %sub = fsub double %2, %1 @@ -209,14 +252,17 @@ define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test2_sub_sd -; SSE2: subsd %xmm0, %xmm1 -; AVX: vsubsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test2_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %mul = fmul double %1, %2 @@ -224,14 +270,17 @@ define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test2_mul_sd -; SSE2: mulsd %xmm0, %xmm1 -; AVX: vmulsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test2_div_sd: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %div = fdiv double %2, %1 @@ -239,14 +288,18 @@ define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> 
%b) { ret <2 x double> %3 } -; CHECK-LABEL: test2_div_sd -; SSE2: divsd %xmm0, %xmm1 -; AVX: vdivsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_multiple_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm0, %xmm1 +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_multiple_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %add = fadd float %2, %1 @@ -255,14 +308,19 @@ define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_multiple_add_ss -; CHECK: addss -; CHECK: addss -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_multiple_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: subss %xmm1, %xmm2 +; SSE-NEXT: subss %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_multiple_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %sub = fsub float %2, %1 @@ -271,14 +329,18 @@ define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_multiple_sub_ss -; CHECK: subss -; CHECK: subss -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_multiple_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm0, %xmm1 +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_multiple_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = 
extractelement <4 x float> %a, i32 0 %mul = fmul float %2, %1 @@ -287,13 +349,19 @@ define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_multiple_mul_ss -; CHECK: mulss -; CHECK: mulss -; CHECK-NOT: movss -; CHECK: ret - define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_multiple_div_ss: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: divss %xmm1, %xmm2 +; SSE-NEXT: divss %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_multiple_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %div = fdiv float %2, %1 @@ -302,9 +370,501 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_multiple_div_ss -; CHECK: divss -; CHECK: divss -; CHECK-NOT: movss -; CHECK: ret +; Ensure that the backend selects SSE/AVX scalar fp instructions +; from a packed fp instrution plus a vector insert. 
+ +define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fadd <4 x float> %a, %b + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fsub <4 x float> %a, %b + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fmul <4 x float> %a, %b + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <4 x float> %a, %b + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm1, 
%xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fadd <2 x double> %a, %b + %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fsub <2 x double> %a, %b + %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fmul <2 x double> %a, %b + %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test_div_sd: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <2 x double> %a, %b + %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test2_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fadd <4 x float> %b, %a + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) { +; 
SSE-LABEL: insert_test2_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fsub <4 x float> %b, %a + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test2_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fmul <4 x float> %b, %a + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test2_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <4 x float> %b, %a + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test2_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fadd <2 x double> %b, %a + %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test2_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; 
SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fsub <2 x double> %b, %a + %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test2_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fmul <2 x double> %b, %a + %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test2_div_sd: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <2 x double> %b, %a + %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test3_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fadd <4 x float> %a, %b + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} + +define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test3_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fsub <4 x float> %a, %b + %2 = select <4 x i1> <i1 false, i1 true, i1 
true, i1 true>, <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} + +define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test3_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fmul <4 x float> %a, %b + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} +define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test3_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <4 x float> %a, %b + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} + +define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test3_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fadd <2 x double> %a, %b + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test3_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fsub <2 x double> %a, %b + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test3_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: 
insert_test3_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fmul <2 x double> %a, %b + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test3_div_sd: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <2 x double> %a, %b + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test4_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fadd <4 x float> %b, %a + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test4_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fsub <4 x float> %b, %a + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test4_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fmul <4 x float> %b, %a + %2 = select <4 x i1> <i1 false, i1 true, i1 
true, i1 true>, <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test4_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <4 x float> %b, %a + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test4_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fadd <2 x double> %b, %a + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test4_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fsub <2 x double> %b, %a + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test4_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fmul <2 x double> %b, %a + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) 
{ +; SSE-LABEL: insert_test4_div_sd: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <2 x double> %b, %a + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll index 183297e..fd35e75 100644 --- a/test/CodeGen/X86/sse1.ll +++ b/test/CodeGen/X86/sse1.ll @@ -1,17 +1,6 @@ ; Tests for SSE1 and below, without SSE2+. -; RUN: llc < %s -march=x86 -mcpu=pentium3 -O3 | FileCheck %s -; RUN: llc < %s -march=x86-64 -mattr=-sse2,+sse -O3 | FileCheck %s - -define <8 x i16> @test1(<8 x i32> %a) nounwind { -; CHECK: test1 - ret <8 x i16> zeroinitializer -} - -define <8 x i16> @test2(<8 x i32> %a) nounwind { -; CHECK: test2 - %c = trunc <8 x i32> %a to <8 x i16> ; <<8 x i16>> [#uses=1] - ret <8 x i16> %c -} +; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mcpu=pentium3 -O3 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=-sse2,+sse -O3 | FileCheck %s ; PR7993 ;define <4 x i32> @test3(<4 x i16> %a) nounwind { @@ -23,6 +12,15 @@ define <8 x i16> @test2(<8 x i32> %a) nounwind { ; vector that this ends up returning. 
; rdar://8368414 define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind { +; CHECK-LABEL: test4: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] +; CHECK-NEXT: addss %xmm1, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; CHECK-NEXT: subss %xmm1, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: ret entry: %tmp7 = extractelement <2 x float> %A, i32 0 %tmp5 = extractelement <2 x float> %A, i32 1 @@ -33,15 +31,6 @@ entry: %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0 %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1 ret <2 x float> %tmp9 -; CHECK-LABEL: test4: -; CHECK-NOT: shufps $16 -; CHECK: shufps $1, -; CHECK-NOT: shufps $16 -; CHECK: shufps $1, -; CHECK-NOT: shufps $16 -; CHECK: unpcklps -; CHECK-NOT: shufps $16 -; CHECK: ret } ; We used to get stuck in type legalization for this example when lowering the @@ -50,8 +39,9 @@ entry: ; condition operand and widening the resulting vselect for the v4f32 result. 
; PR18036 -; CHECK-LABEL: vselect define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { +; CHECK-LABEL: vselect: +; CHECK: ret entry: %a1 = icmp eq <4 x i32> %q, zeroinitializer %a14 = select <4 x i1> %a1, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+0> , <4 x float> zeroinitializer diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll deleted file mode 100644 index c63ff72..0000000 --- a/test/CodeGen/X86/sse2-blend.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s - -; CHECK-LABEL: vsel_float -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NOT: orps -; CHECK: ret -define void@vsel_float(<4 x float>* %v1, <4 x float>* %v2) { - %A = load <4 x float>* %v1 - %B = load <4 x float>* %v2 - %vsel = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %A, <4 x float> %B - store <4 x float > %vsel, <4 x float>* %v1 - ret void -} - -; CHECK-LABEL: vsel_i32 -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NOT: orps -; CHECK: ret -define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) { - %A = load <4 x i32>* %v1 - %B = load <4 x i32>* %v2 - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B - store <4 x i32 > %vsel, <4 x i32>* %v1 - ret void -} - -; Without forcing instructions, fall back to the preferred PS domain. -; CHECK-LABEL: vsel_i64 -; CHECK: andnps -; CHECK: orps -; CHECK: ret - -define void@vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) { - %A = load <2 x i64>* %v1 - %B = load <2 x i64>* %v2 - %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %A, <2 x i64> %B - store <2 x i64 > %vsel, <2 x i64>* %v1 - ret void -} - -; Without forcing instructions, fall back to the preferred PS domain. 
-; CHECK-LABEL: vsel_double -; CHECK: andnps -; CHECK: orps -; CHECK: ret - -define void@vsel_double(<2 x double>* %v1, <2 x double>* %v2) { - %A = load <2 x double>* %v1 - %B = load <2 x double>* %v2 - %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %A, <2 x double> %B - store <2 x double > %vsel, <2 x double>* %v1 - ret void -} - - diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index c906ecd..c4d9e6d 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -408,21 +408,21 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) { ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { - ; CHECK: pslldq - %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { - ; CHECK: pslldq - %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} +
+
+; Calls @llvm.x86.sse2.psll.dq on %a0 with an i32 count of 7 and returns the
+; result. The expected pslldq shuffle comment lists xmm0[0..15] in their
+; original order, i.e. no byte of the input is expected to move.
+; NOTE(review): presumably the non-.bs form takes its count in bits, so a
+; count of 7 is less than one whole byte -- confirm against the intrinsic
+; definition.
+; The {{.*#+}} regex skips the instruction operands up to the '#' that starts
+; the printed shuffle comment.
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+ ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+; Calls @llvm.x86.sse2.psll.dq.bs on %a0 with an i32 count of 7 and returns
+; the result. The expected pslldq shuffle comment has seven 'zero' entries
+; followed by xmm0[0..8] (7 + 9 = 16 bytes), i.e. the low nine input bytes
+; land in the high nine result bytes.
+; NOTE(review): consistent with the .bs form shifting by 7 whole bytes --
+; confirm against the intrinsic definition.
+; The {{.*#+}} regex skips the instruction operands up to the '#' that starts
+; the printed shuffle comment.
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+ ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone @@ -504,21 +504,21 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) { ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { - ; CHECK: psrldq - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { - ; CHECK: psrldq - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} +
+
+; Calls @llvm.x86.sse2.psrl.dq on %a0 with an i32 count of 7 and returns the
+; result. The expected psrldq shuffle comment lists xmm0[0..15] in their
+; original order, i.e. no byte of the input is expected to move.
+; NOTE(review): presumably the non-.bs form takes its count in bits, so a
+; count of 7 is less than one whole byte -- confirm against the intrinsic
+; definition.
+; The {{.*#+}} regex skips the instruction operands up to the '#' that starts
+; the printed shuffle comment.
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+ ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+; Calls @llvm.x86.sse2.psrl.dq.bs on %a0 with an i32 count of 7 and returns
+; the result. The expected psrldq shuffle comment has xmm0[7..15] followed by
+; seven 'zero' entries (9 + 7 = 16 bytes), i.e. the high nine input bytes
+; land in the low nine result bytes.
+; NOTE(review): consistent with the .bs form shifting by 7 whole bytes --
+; confirm against the intrinsic definition.
+; The {{.*#+}} regex skips the instruction operands up to the '#' that starts
+; the printed shuffle comment.
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+ ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone diff --git a/test/CodeGen/X86/sse2-mul.ll b/test/CodeGen/X86/sse2-mul.ll deleted file mode 100644 index e066368..0000000 --- a/test/CodeGen/X86/sse2-mul.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s - -define <4 x i32> @test1(<4 x i32> %x, <4 x i32> %y) { - %m = mul <4 x i32> %x, %y - ret <4 x i32> %m -; CHECK-LABEL: test1: -; CHECK: pshufd $49 -; CHECK: pmuludq -; CHECK: pshufd $49 -; CHECK: pmuludq -; CHECK: shufps $-120 -; CHECK: pshufd $-40 -; CHECK: ret -} diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll index e8d3d6f..b7db6cb 100644 --- a/test/CodeGen/X86/sse2.ll +++ b/test/CodeGen/X86/sse2.ll @@ -2,39 +2,48 @@ ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind { +; CHECK-LABEL: test1: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movapd (%ecx), %xmm0 +; CHECK-NEXT: movlpd {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: movapd %xmm0, (%eax) +; CHECK-NEXT: retl %tmp3 = load <2 x double>* %A, align 16 %tmp7 = insertelement <2 x double> undef, double %B, i32 0 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 > store <2 x double> %tmp9, <2 x double>* %r, align 16 ret void - -; CHECK-LABEL: test1: -; CHECK: movl 4(%esp), %eax -; CHECK-NEXT: movl 8(%esp), %ecx -; CHECK-NEXT: movapd (%ecx), %xmm0 -; CHECK-NEXT: movlpd 12(%esp), %xmm0 -; CHECK-NEXT: movapd %xmm0, (%eax) -; CHECK-NEXT: ret } define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind { +; CHECK-LABEL: test2: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movapd (%ecx), %xmm0 +; CHECK-NEXT: movhpd {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: movapd %xmm0, (%eax) +; CHECK-NEXT: retl 
%tmp3 = load <2 x double>* %A, align 16 %tmp7 = insertelement <2 x double> undef, double %B, i32 0 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 > store <2 x double> %tmp9, <2 x double>* %r, align 16 ret void - -; CHECK-LABEL: test2: -; CHECK: movl 4(%esp), %eax -; CHECK: movl 8(%esp), %ecx -; CHECK-NEXT: movapd (%ecx), %xmm0 -; CHECK-NEXT: movhpd 12(%esp), %xmm0 -; CHECK-NEXT: movapd %xmm0, (%eax) -; CHECK-NEXT: ret } define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind { +; CHECK-LABEL: test3: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movaps (%edx), %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, (%eax) +; CHECK-NEXT: retl %tmp = load <4 x float>* %B ; <<4 x float>> [#uses=2] %tmp3 = load <4 x float>* %A ; <<4 x float>> [#uses=2] %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1] @@ -47,24 +56,30 @@ define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1] store <4 x float> %tmp13, <4 x float>* %res ret void -; CHECK: @test3 -; CHECK: unpcklps } define void @test4(<4 x float> %X, <4 x float>* %res) nounwind { +; CHECK-LABEL: test4: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; CHECK-NEXT: movaps %xmm0, (%eax) +; CHECK-NEXT: retl %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] store <4 x float> %tmp5, <4 x float>* %res ret void -; CHECK: @test4 -; CHECK: pshufd $50, %xmm0, %xmm0 } define <4 x i32> @test5(i8** %ptr) nounwind { ; CHECK-LABEL: test5: -; CHECK: pxor -; CHECK: punpcklbw -; CHECK: punpcklwd - +; CHECK: ## BB#0: +; CHECK-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl (%eax), %eax +; CHECK-NEXT: movss (%eax), %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: retl %tmp = load i8** %ptr ; <i8*> [#uses=1] %tmp.upgrd.1 = bitcast i8* %tmp to float* ; <float*> [#uses=1] %tmp.upgrd.2 = load float* %tmp.upgrd.1 ; <float> [#uses=1] @@ -81,30 +96,39 @@ define <4 x i32> @test5(i8** %ptr) nounwind { } define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind { - %tmp1 = load <4 x float>* %A ; <<4 x float>> [#uses=1] - %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1] - store <4 x float> %tmp2, <4 x float>* %res - ret void - ; CHECK-LABEL: test6: -; CHECK: movaps (%ecx), %xmm0 -; CHECK: movaps %xmm0, (%eax) +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movaps (%ecx), %xmm0 +; CHECK-NEXT: movaps %xmm0, (%eax) +; CHECK-NEXT: retl + %tmp1 = load <4 x float>* %A ; <<4 x float>> [#uses=1] + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1] + store <4 x float> %tmp2, <4 x float>* %res + ret void } define void @test7() nounwind { - bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1] - shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1] - store <4 x float> %2, <4 x float>* null - ret void - ; CHECK-LABEL: test7: -; CHECK: xorps %xmm0, %xmm0 -; CHECK: movaps %xmm0, 0 +; CHECK: ## BB#0: +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, 0 +; CHECK-NEXT: retl + bitcast <4 x i32> zeroinitializer to <4 x float> ; 
<<4 x float>>:1 [#uses=1] + shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1] + store <4 x float> %2, <4 x float>* null + ret void } @x = external global [4 x i32] define <2 x i64> @test8() nounwind { +; CHECK-LABEL: test8: +; CHECK: ## BB#0: +; CHECK-NEXT: movl L_x$non_lazy_ptr, %eax +; CHECK-NEXT: movups (%eax), %xmm0 +; CHECK-NEXT: retl %tmp = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1] %tmp3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1) ; <i32> [#uses=1] %tmp5 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2) ; <i32> [#uses=1] @@ -115,90 +139,123 @@ define <2 x i64> @test8() nounwind { %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1] %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1] ret <2 x i64> %tmp16 -; CHECK-LABEL: test8: -; CHECK: movups (%eax), %xmm0 } define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind { +; CHECK-LABEL: test9: +; CHECK: ## BB#0: +; CHECK-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: retl %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1] %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1] %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1] %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1] ret <4 x float> %tmp13 -; CHECK-LABEL: test9: -; CHECK: movups 8(%esp), %xmm0 } define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind { +; CHECK-LABEL: test10: +; CHECK: ## BB#0: +; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: retl %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1] %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1] %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1] %tmp13 = 
insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1] ret <4 x float> %tmp13 -; CHECK-LABEL: test10: -; CHECK: movaps 4(%esp), %xmm0 } define <2 x double> @test11(double %a, double %b) nounwind { +; CHECK-LABEL: test11: +; CHECK: ## BB#0: +; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: retl %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1] %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1] ret <2 x double> %tmp7 -; CHECK-LABEL: test11: -; CHECK: movaps 4(%esp), %xmm0 } define void @test12() nounwind { - %tmp1 = load <4 x float>* null ; <<4 x float>> [#uses=2] - %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1] - %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1] - %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp4, <4 x float>* null - ret void ; CHECK-LABEL: test12: -; CHECK: movhlps -; CHECK: shufps +; CHECK: ## BB#0: +; CHECK-NEXT: movapd 0, %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; CHECK-NEXT: movsd %xmm0, %xmm1 +; CHECK-NEXT: xorpd %xmm2, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; CHECK-NEXT: addps %xmm1, %xmm0 +; CHECK-NEXT: movaps %xmm0, 0 +; CHECK-NEXT: retl + %tmp1 = load <4 x float>* null ; <<4 x float>> [#uses=2] + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1] + %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1] + %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x 
float>> [#uses=1] + store <4 x float> %tmp4, <4 x float>* null + ret void } define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { - %tmp3 = load <4 x float>* %B ; <<4 x float>> [#uses=1] - %tmp5 = load <4 x float>* %C ; <<4 x float>> [#uses=1] - %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - store <4 x float> %tmp11, <4 x float>* %res - ret void -; CHECK: test13 -; CHECK: shufps $69, (%ecx), %xmm0 -; CHECK: pshufd $-40, %xmm0, %xmm0 +; CHECK-LABEL: test13: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movaps (%edx), %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; CHECK-NEXT: movaps %xmm0, (%eax) +; CHECK-NEXT: retl + %tmp3 = load <4 x float>* %B ; <<4 x float>> [#uses=1] + %tmp5 = load <4 x float>* %C ; <<4 x float>> [#uses=1] + %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] + store <4 x float> %tmp11, <4 x float>* %res + ret void } define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind { - %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=2] - %tmp5 = load <4 x float>* %x ; <<4 x float>> [#uses=2] - %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1] - %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1] - %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp27 ; CHECK-LABEL: test14: -; CHECK: addps [[X1:%xmm[0-9]+]], [[X0:%xmm[0-9]+]] -; CHECK: subps [[X1]], [[X2:%xmm[0-9]+]] -; CHECK: movlhps [[X2]], [[X0]] +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movaps 
(%ecx), %xmm1 +; CHECK-NEXT: movaps (%eax), %xmm2 +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: addps %xmm1, %xmm0 +; CHECK-NEXT: subps %xmm1, %xmm2 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: retl + %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=2] + %tmp5 = load <4 x float>* %x ; <<4 x float>> [#uses=2] + %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1] + %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1] + %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1] + ret <4 x float> %tmp27 } define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind { -entry: - %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=1] - %tmp3 = load <4 x float>* %x ; <<4 x float>> [#uses=1] - %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp4 ; CHECK-LABEL: test15: -; CHECK: movhlps %xmm1, %xmm0 +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movapd (%ecx), %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-NEXT: retl +entry: + %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=1] + %tmp3 = load <4 x float>* %x ; <<4 x float>> [#uses=1] + %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1] + ret <4 x float> %tmp4 } ; PR8900 -; CHECK-LABEL: test16: -; CHECK: unpcklpd -; CHECK: ret define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) { +; CHECK-LABEL: test16: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movapd 96(%eax), %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: retl %i5 = getelementptr inbounds <4 x double>* %srcA, i32 3 %i6 = load <4 x double>* %i5, align 32 %i7 = 
shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2> @@ -207,6 +264,11 @@ define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocap ; PR9009 define fastcc void @test17() nounwind { +; CHECK-LABEL: test17: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768> +; CHECK-NEXT: movaps %xmm0, (%eax) +; CHECK-NEXT: retl entry: %0 = insertelement <4 x i32> undef, i32 undef, i32 1 %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3> @@ -217,25 +279,48 @@ entry: ; PR9210 define <4 x float> @f(<4 x double>) nounwind { +; CHECK-LABEL: f: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 +; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retl entry: %double2float.i = fptrunc <4 x double> %0 to <4 x float> ret <4 x float> %double2float.i } define <2 x i64> @test_insert_64_zext(<2 x i64> %i) { -; CHECK-LABEL: test_insert_64_zext -; CHECK-NOT: xor -; CHECK: movq +; CHECK-LABEL: test_insert_64_zext: +; CHECK: ## BB#0: +; CHECK-NEXT: movq %xmm0, %xmm0 +; CHECK-NEXT: retl %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2> ret <2 x i64> %1 } define <4 x i32> @PR19721(<4 x i32> %i) { +; CHECK-LABEL: PR19721: +; CHECK: ## BB#0: +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movss %xmm1, %xmm0 +; CHECK-NEXT: retl %bc = bitcast <4 x i32> %i to i128 %insert = and i128 %bc, -4294967296 %bc2 = bitcast i128 %insert to <4 x i32> ret <4 x i32> %bc2 +} -; CHECK-LABEL: PR19721 -; CHECK: punpckldq +define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: test_mul: +; CHECK: ## BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; 
CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; CHECK-NEXT: retl + %m = mul <4 x i32> %x, %y + ret <4 x i32> %m } diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll index b7706cc..5b2de28 100644 --- a/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX diff --git a/test/CodeGen/X86/sse3-avx-addsub.ll b/test/CodeGen/X86/sse3-avx-addsub.ll index 8b66743..431588f 100644 --- a/test/CodeGen/X86/sse3-avx-addsub.ll +++ b/test/CodeGen/X86/sse3-avx-addsub.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK +; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK ; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX -check-prefix=CHECK ; Test ADDSUB ISel patterns. 
@@ -141,156 +141,3 @@ define <2 x double> @test4b(<2 x double> %A, <2 x double>* %B) { ; AVX: vaddsubpd ; CHECK-NEXT: ret -; Functions below are obtained from the following source: -; -; float4 test1(float4 A, float4 B) { -; float4 X = A + B; -; float4 Y = A - B; -; return (float4){X[0], Y[1], X[2], Y[3]}; -; } -; -; float8 test2(float8 A, float8 B) { -; float8 X = A + B; -; float8 Y = A - B; -; return (float8){X[0], Y[1], X[2], Y[3], X[4], Y[5], X[6], Y[7]}; -; } -; -; double4 test3(double4 A, double4 B) { -; double4 X = A + B; -; double4 Y = A - B; -; return (double4){X[0], Y[1], X[2], Y[3]}; -; } -; -; double2 test4(double2 A, double2 B) { -; double2 X = A + B; -; double2 Y = A - B; -; return (double2){X[0], Y[1]}; -; } - -define <4 x float> @test5(<4 x float> %A, <4 x float> %B) { - %sub = fsub <4 x float> %A, %B - %add = fadd <4 x float> %A, %B - %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x float> %vecinit6 -} -; CHECK-LABEL: test5 -; SSE: xorps -; SSE-NEXT: addsubps -; AVX: vxorps -; AVX-NEXT: vaddsubps -; CHECK: ret - - -define <8 x float> @test6(<8 x float> %A, <8 x float> %B) { - %sub = fsub <8 x float> %A, %B - %add = fadd <8 x float> %A, %B - %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> - ret <8 x float> %vecinit14 -} -; CHECK-LABEL: test6 -; SSE: xorps -; SSE-NEXT: addsubps -; SSE: xorps -; SSE-NEXT: addsubps -; AVX: vxorps -; AVX-NEXT: vaddsubps -; AVX-NOT: vxorps -; AVX-NOT: vaddsubps -; CHECK: ret - - -define <4 x double> @test7(<4 x double> %A, <4 x double> %B) { - %sub = fsub <4 x double> %A, %B - %add = fadd <4 x double> %A, %B - %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x double> %vecinit6 -} -; CHECK-LABEL: test7 -; SSE: xorpd -; SSE-NEXT: addsubpd -; SSE: xorpd -; SSE-NEXT: addsubpd -; AVX: vxorpd -; AVX-NEXT: vaddsubpd -; 
AVX-NOT: vxorpd -; AVX-NOT: vaddsubpd -; CHECK: ret - - -define <2 x double> @test8(<2 x double> %A, <2 x double> %B) #0 { - %add = fadd <2 x double> %A, %B - %sub = fsub <2 x double> %A, %B - %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3> - ret <2 x double> %vecinit2 -} -; CHECK-LABEL: test8 -; SSE: xorpd -; SSE-NEXT: addsubpd -; AVX: vxorpd -; AVX-NEXT: vaddsubpd -; CHECK: ret - - -define <4 x float> @test5b(<4 x float> %A, <4 x float> %B) { - %sub = fsub <4 x float> %A, %B - %add = fadd <4 x float> %B, %A - %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x float> %vecinit6 -} -; CHECK-LABEL: test5 -; SSE: xorps -; SSE-NEXT: addsubps -; AVX: vxorps -; AVX-NEXT: vaddsubps -; CHECK: ret - - -define <8 x float> @test6b(<8 x float> %A, <8 x float> %B) { - %sub = fsub <8 x float> %A, %B - %add = fadd <8 x float> %B, %A - %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> - ret <8 x float> %vecinit14 -} -; CHECK-LABEL: test6 -; SSE: xorps -; SSE-NEXT: addsubps -; SSE: xorps -; SSE-NEXT: addsubps -; AVX: vxorps -; AVX-NEXT: vaddsubps -; AVX-NOT: vxorps -; AVX-NOT: vaddsubps -; CHECK: ret - - -define <4 x double> @test7b(<4 x double> %A, <4 x double> %B) { - %sub = fsub <4 x double> %A, %B - %add = fadd <4 x double> %B, %A - %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x double> %vecinit6 -} -; CHECK-LABEL: test7 -; SSE: xorpd -; SSE-NEXT: addsubpd -; SSE: xorpd -; SSE-NEXT: addsubpd -; AVX: vxorpd -; AVX-NEXT: vaddsubpd -; AVX-NOT: vxorpd -; AVX-NOT: vaddsubpd -; CHECK: ret - - -define <2 x double> @test8b(<2 x double> %A, <2 x double> %B) #0 { - %add = fadd <2 x double> %B, %A - %sub = fsub <2 x double> %A, %B - %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3> - ret <2 x double> 
%vecinit2 -} -; CHECK-LABEL: test8 -; SSE: xorpd -; SSE-NEXT: addsubpd -; AVX: vxorpd -; AVX-NEXT: vaddsubpd -; CHECK: ret - diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll index 18bdcb3..0a5b0ca 100644 --- a/test/CodeGen/X86/sse3.ll +++ b/test/CodeGen/X86/sse3.ll @@ -1,99 +1,120 @@ ; These are tests for SSE3 codegen. -; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 \ -; RUN: | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 | FileCheck %s --check-prefix=X64 ; Test for v8xi16 lowering where we extract the first element of the vector and ; placed it in the second element of the result. define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind { +; X64-LABEL: t0: +; X64: ## BB#0: ## %entry +; X64-NEXT: movl $1, %eax +; X64-NEXT: movd %eax, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: retq entry: %tmp3 = load <8 x i16>* %old %tmp6 = shufflevector <8 x i16> %tmp3, - <8 x i16> < i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >, + <8 x i16> < i16 1, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >, <8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > store <8 x i16> %tmp6, <8 x i16>* %dest ret void - -; X64-LABEL: t0: -; X64: movdqa (%rsi), %xmm0 -; X64: pslldq $2, %xmm0 -; X64: movdqa %xmm0, (%rdi) -; X64: ret } define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind { +; X64-LABEL: t1: +; X64: ## BB#0: +; X64-NEXT: movdqa (%rdi), %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; 
X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: retq %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > ret <8 x i16> %tmp3 -; X64-LABEL: t1: -; X64: movdqa (%rdi), %xmm0 -; X64: pinsrw $0, (%rsi), %xmm0 -; X64: ret } define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t2: +; X64: ## BB#0: +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,3,4,5,6,7] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 > ret <8 x i16> %tmp -; X64-LABEL: t2: -; X64: pextrw $1, %xmm1, %eax -; X64: pinsrw $0, %eax, %xmm0 -; X64: pinsrw $3, %eax, %xmm0 -; X64: ret } define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t3: +; X64: ## BB#0: +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 > ret <8 x i16> %tmp -; X64-LABEL: t3: -; X64: pextrw $5, %xmm0, %eax -; X64: pshuflw $44, %xmm0, %xmm0 -; X64: pshufhw $27, %xmm0, %xmm0 -; X64: pinsrw $3, %eax, %xmm0 -; X64: ret } define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t4: +; X64: ## BB#0: +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,6,5,4,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,4,7] +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 > ret <8 x i16> %tmp -; X64-LABEL: t4: -; X64: pextrw $7, [[XMM0:%xmm[0-9]+]], %eax -; X64: pshufhw $100, [[XMM0]], [[XMM1:%xmm[0-9]+]] -; X64: pinsrw $1, %eax, [[XMM1]] -; X64: pextrw $1, [[XMM0]], %eax -; X64: pinsrw $4, %eax, %xmm{{[0-9]}} -; X64: ret } define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t5: +; X64: ## BB#0: +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 > ret <8 x i16> %tmp -; X64: t5: -; X64: movlhps %xmm1, %xmm0 -; X64: pshufd $114, %xmm0, %xmm0 -; X64: ret } define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t6: +; X64: ## BB#0: +; X64-NEXT: movss %xmm1, %xmm0 +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > ret <8 x i16> %tmp -; X64: t6: -; X64: movss %xmm1, %xmm0 -; X64: ret } define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t7: +; X64: ## BB#0: +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 > ret <8 x i16> %tmp -; X64: t7: -; X64: pshuflw $-80, %xmm0, %xmm0 -; X64: pshufhw $-56, %xmm0, %xmm0 -; X64: ret } define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind { +; X64-LABEL: t8: +; X64: ## BB#0: +; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; X64-NEXT: movdqa 
%xmm0, (%rdi) +; X64-NEXT: retq %tmp = load <2 x i64>* %A %tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16> %tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0 @@ -115,14 +136,15 @@ define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind { %tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64> store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res ret void -; X64: t8: -; X64: pshuflw $-58, (%rsi), %xmm0 -; X64: pshufhw $-58, %xmm0, %xmm0 -; X64: movdqa %xmm0, (%rdi) -; X64: ret } define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind { +; X64-LABEL: t9: +; X64: ## BB#0: +; X64-NEXT: movapd (%rdi), %xmm0 +; X64-NEXT: movhpd (%rsi), %xmm0 +; X64-NEXT: movapd %xmm0, (%rdi) +; X64-NEXT: retq %tmp = load <4 x float>* %r %tmp.upgrd.3 = bitcast <2 x i32>* %A to double* %tmp.upgrd.4 = load double* %tmp.upgrd.3 @@ -139,11 +161,6 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind { %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 store <4 x float> %tmp13, <4 x float>* %r ret void -; X64: t9: -; X64: movaps (%rdi), %xmm0 -; X64: movhps (%rsi), %xmm0 -; X64: movaps %xmm0, (%rdi) -; X64: ret } @@ -154,113 +171,121 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind { @g1 = external constant <4 x i32> @g2 = external constant <4 x i16> -define internal void @t10() nounwind { - load <4 x i32>* @g1, align 16 - bitcast <4 x i32> %1 to <8 x i16> - shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef > - bitcast <8 x i16> %3 to <2 x i64> - extractelement <2 x i64> %4, i32 0 - bitcast i64 %5 to <4 x i16> - store <4 x i16> %6, <4 x i16>* @g2, align 8 - ret void -; X64: t10: -; X64: pextrw $4, [[X0:%xmm[0-9]+]], %e{{..}} -; X64: pextrw $6, [[X0]], %e{{..}} -; X64: movlhps [[X0]], [[X0]] -; X64: pshuflw $8, [[X0]], [[X0]] -; X64: pinsrw $2, %e{{..}}, [[X0]] -; X64: pinsrw $3, %e{{..}}, [[X0]] +define void @t10() nounwind { +; X64-LABEL: t10: +; X64: ## BB#0: +; X64-NEXT: movq 
_g1@{{.*}}(%rip), %rax +; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: movq _g2@{{.*}}(%rip), %rax +; X64-NEXT: movq %xmm0, (%rax) +; X64-NEXT: retq + load <4 x i32>* @g1, align 16 + bitcast <4 x i32> %1 to <8 x i16> + shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef > + bitcast <8 x i16> %3 to <2 x i64> + extractelement <2 x i64> %4, i32 0 + bitcast i64 %5 to <4 x i16> + store <4 x i16> %6, <4 x i16>* @g2, align 8 + ret void } - ; Pack various elements via shuffles. define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X64-LABEL: t11: +; X64: ## BB#0: ## %entry +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; X64-NEXT: retq entry: %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > ret <8 x i16> %tmp7 -; X64-LABEL: t11: -; X64: movd %xmm1, %eax -; X64: movlhps %xmm0, %xmm0 -; X64: pshuflw $1, %xmm0, %xmm0 -; X64: pinsrw $1, %eax, %xmm0 -; X64: ret } - define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X64-LABEL: t12: +; X64: ## BB#0: ## %entry +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7] +; X64-NEXT: retq entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef > ret <8 x i16> %tmp9 -; X64-LABEL: t12: -; X64: pextrw $3, %xmm1, %eax -; X64: movlhps %xmm0, %xmm0 -; X64: pshufhw $3, %xmm0, %xmm0 -; X64: pinsrw $5, %eax, %xmm0 -; X64: ret } - define <8 
x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X64-LABEL: t13: +; X64: ## BB#0: ## %entry +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7] +; X64-NEXT: retq entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef > ret <8 x i16> %tmp9 -; X64-LABEL: t13: -; X64: punpcklqdq %xmm0, %xmm1 -; X64: pextrw $3, %xmm1, %eax -; X64: pshufhw $12, %xmm1, %xmm0 -; X64: pinsrw $4, %eax, %xmm0 -; X64: ret } - define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X64-LABEL: t14: +; X64: ## BB#0: ## %entry +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X64-NEXT: retq entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef > ret <8 x i16> %tmp9 -; X64-LABEL: t14: -; X64: punpcklqdq %xmm0, %xmm1 -; X64: pshufhw $8, %xmm1, %xmm0 -; X64: ret } - ; FIXME: t15 is worse off from disabling of scheduler 2-address hack. 
define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X64-LABEL: t15: +; X64: ## BB#0: ## %entry +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; X64-NEXT: retq entry: - %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > - ret <8 x i16> %tmp8 -; X64: t15: -; X64: pextrw $7, %xmm0, %eax -; X64: punpcklqdq %xmm1, %xmm0 -; X64: pshuflw $-128, %xmm0, %xmm0 -; X64: pinsrw $2, %eax, %xmm0 -; X64: ret + %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > + ret <8 x i16> %tmp8 } - ; Test yonah where we convert a shuffle to pextrw and pinrsw define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone { +; X64-LABEL: t16: +; X64: ## BB#0: ## %entry +; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0] +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; X64-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: packuswb %xmm0, %xmm0 +; X64-NEXT: retq entry: - %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > - %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > - ret <16 x i8> %tmp9 -; X64: t16: -; X64: pextrw $8, %xmm0, %eax -; X64: pslldq $2, %xmm0 -; X64: pextrw $1, %xmm0, %ecx -; X64: movzbl %cl, %ecx -; X64: orl %eax, %ecx -; X64: pinsrw $1, %ecx, %xmm0 -; X64: ret + %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > + ret <16 x i8> %tmp9 } ; rdar://8520311 define <4 x i32> @t17() nounwind { -entry: ; X64-LABEL: t17: -; X64: movddup (%rax), %xmm0 +; X64: ## BB#0: ## %entry +; X64-NEXT: movddup (%rax), %xmm0 +; X64-NEXT: andpd {{.*}}(%rip), %xmm0 +; X64-NEXT: retq +entry: %tmp1 = load <4 x float>* undef, align 16 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 4, i32 1, i32 2, i32 3> %tmp3 = load <4 x float>* undef, align 16 diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll deleted file mode 100644 index 3a48121..0000000 
--- a/test/CodeGen/X86/sse41-blend.ll +++ /dev/null @@ -1,140 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s - -;CHECK-LABEL: vsel_float: -;CHECK: blendps -;CHECK: ret -define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %v1, <4 x float> %v2 - ret <4 x float> %vsel -} - - -;CHECK-LABEL: vsel_4xi8: -;CHECK: blendps -;CHECK: ret -define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { - %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2 - ret <4 x i8> %vsel -} - -;CHECK-LABEL: vsel_4xi16: -;CHECK: blendps -;CHECK: ret -define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2 - ret <4 x i16> %vsel -} - - -;CHECK-LABEL: vsel_i32: -;CHECK: blendps -;CHECK: ret -define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) { - %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2 - ret <4 x i32> %vsel -} - - -;CHECK-LABEL: vsel_double: -;CHECK: movsd -;CHECK: ret -define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> %v1, <4 x double> %v2 - ret <4 x double> %vsel -} - - -;CHECK-LABEL: vsel_i64: -;CHECK: movsd -;CHECK: ret -define <4 x i64> @vsel_i64(<4 x i64> %v1, <4 x i64> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %v1, <4 x i64> %v2 - ret <4 x i64> %vsel -} - - -;CHECK-LABEL: vsel_i8: -;CHECK: pblendvb -;CHECK: ret -define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { - %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2 - ret <16 x i8> %vsel -} - -;; TEST blend + 
compares -; CHECK: A -define <2 x double> @A(<2 x double> %x, <2 x double> %y) { - ; CHECK: cmplepd - ; CHECK: blendvpd - %max_is_x = fcmp oge <2 x double> %x, %y - %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y - ret <2 x double> %max -} - -; CHECK: B -define <2 x double> @B(<2 x double> %x, <2 x double> %y) { - ; CHECK: cmpnlepd - ; CHECK: blendvpd - %min_is_x = fcmp ult <2 x double> %x, %y - %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y - ret <2 x double> %min -} - -; CHECK: float_crash -define void @float_crash() nounwind { -entry: - %merge205vector_func.i = select <4 x i1> undef, <4 x double> undef, <4 x double> undef - %extract214vector_func.i = extractelement <4 x double> %merge205vector_func.i, i32 0 - store double %extract214vector_func.i, double addrspace(1)* undef, align 8 - ret void -} - -; If we can figure out a blend has a constant mask, we should emit the -; blend instruction with an immediate mask -define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) { -; In this case, we emit a simple movss -; CHECK-LABEL: constant_blendvpd -; CHECK: movsd -; CHECK: ret - %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %xy, <2 x double> %ab - ret <2 x double> %1 -} - -define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) { -; CHECK-LABEL: constant_blendvps -; CHECK-NOT: mov -; CHECK: blendps $7 -; CHECK: ret - %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %xyzw, <4 x float> %abcd - ret <4 x float> %1 -} - -define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) { -; CHECK-LABEL: constant_pblendvb: -; CHECK: movaps -; CHECK: pblendvb -; CHECK: ret - %1 = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %xyzw, <16 x i8> %abcd - ret <16 x i8> %1 -} - -declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, 
<16 x i8>, <16 x i8>) -declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) -declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) - -;; 2 tests for shufflevectors that optimize to blend + immediate -; CHECK-LABEL: @blend_shufflevector_4xfloat -; CHECK: blendps $6, %xmm1, %xmm0 -; CHECK: ret -define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3> - ret <4 x float> %1 -} - -; CHECK-LABEL: @blend_shufflevector_8xi16 -; CHECK: pblendw $134, %xmm1, %xmm0 -; CHECK: ret -define <8 x i16> @blend_shufflevector_8xi16(<8 x i16> %a, <8 x i16> %b) { - %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 15> - ret <8 x i16> %1 -} diff --git a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll new file mode 100644 index 0000000..6fab98e --- /dev/null +++ b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll @@ -0,0 +1,61 @@ +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.1 | FileCheck %s +; This test works just like the non-upgrade one except that it only checks +; forms which require auto-upgrading. 
+ +define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) { + ; CHECK: blendpd + %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone + + +define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) { + ; CHECK: blendps + %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone + + +define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) { + ; CHECK: dppd + %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone + + +define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) { + ; CHECK: dpps + %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone + + +define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) { + ; CHECK: insertps + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone + + + +define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) { + ; CHECK: mpsadbw + %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, 
i32) nounwind readnone + + +define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) { + ; CHECK: pblendw + %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone + + diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll index 37eff43..5f25a16 100644 --- a/test/CodeGen/X86/sse41-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll @@ -2,18 +2,18 @@ define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK: blendpd - %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res } -declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: blendps - %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { @@ -34,35 +34,35 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) { ; CHECK: dppd - %res = call <2 x 
double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res } -declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone +declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: dpps - %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: insertps - %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK: mpsadbw - %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1] + %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } -declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone +declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone 
define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) { @@ -83,10 +83,10 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK: pblendw - %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1] + %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } -declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone +declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) { diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index 6726a3e..d5c6f74 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -1,30 +1,47 @@ -; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK -; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK +; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64 @g16 = external global i16 define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind { - %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1 - ret <4 x i32> %tmp1 ; X32-LABEL: pinsrd_1: -; X32: pinsrd $1, 4(%esp), %xmm0 - +; X32: ## BB#0: +; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0 +; X32-NEXT: retl +; ; X64-LABEL: pinsrd_1: -; X64: pinsrd $1, %edi, %xmm0 +; X64: ## BB#0: +; X64-NEXT: pinsrd $1, %edi, %xmm0 +; X64-NEXT: retq + %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1 + ret <4 x i32> %tmp1 } define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind { - %tmp1 = insertelement <16 x i8> %tmp, 
i8 %s, i32 1 - ret <16 x i8> %tmp1 ; X32-LABEL: pinsrb_1: -; X32: pinsrb $1, 4(%esp), %xmm0 - +; X32: ## BB#0: +; X32-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0 +; X32-NEXT: retl +; ; X64-LABEL: pinsrb_1: -; X64: pinsrb $1, %edi, %xmm0 +; X64: ## BB#0: +; X64-NEXT: pinsrb $1, %edi, %xmm0 +; X64-NEXT: retq + %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1 + ret <16 x i8> %tmp1 } - define <2 x i64> @pmovsxbd_1(i32* %p) nounwind { +; X32-LABEL: pmovsxbd_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pmovsxbd (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pmovsxbd_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: pmovsxbd (%rdi), %xmm0 +; X64-NEXT: retq entry: %0 = load i32* %p, align 4 %1 = insertelement <4 x i32> undef, i32 %0, i32 0 @@ -35,16 +52,19 @@ entry: %6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone %7 = bitcast <4 x i32> %6 to <2 x i64> ret <2 x i64> %7 - -; X32: _pmovsxbd_1: -; X32: movl 4(%esp), %eax -; X32: pmovsxbd (%eax), %xmm0 - -; X64: _pmovsxbd_1: -; X64: pmovsxbd (%rdi), %xmm0 } define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly { +; X32-LABEL: pmovsxwd_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pmovsxwd (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pmovsxwd_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: pmovsxwd (%rdi), %xmm0 +; X64-NEXT: retq entry: %0 = load i64* %p ; <i64> [#uses=1] %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1] @@ -52,63 +72,59 @@ entry: %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1] %3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1] ret <2 x i64> %3 - -; X32: _pmovsxwd_1: -; X32: movl 4(%esp), %eax -; X32: pmovsxwd (%eax), %xmm0 - -; X64: _pmovsxwd_1: -; X64: pmovsxwd (%rdi), %xmm0 } - - - define <2 x i64> @pmovzxbq_1() nounwind { +; X32-LABEL: pmovzxbq_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl 
L_g16$non_lazy_ptr, %eax +; X32-NEXT: pmovzxbq (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pmovzxbq_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: movq _g16@{{.*}}(%rip), %rax +; X64-NEXT: pmovzxbq (%rax), %xmm0 +; X64-NEXT: retq entry: %0 = load i16* @g16, align 2 ; <i16> [#uses=1] %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1] %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1] %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1] ret <2 x i64> %3 - -; X32: _pmovzxbq_1: -; X32: movl L_g16$non_lazy_ptr, %eax -; X32: pmovzxbq (%eax), %xmm0 - -; X64: _pmovzxbq_1: -; X64: movq _g16@GOTPCREL(%rip), %rax -; X64: pmovzxbq (%rax), %xmm0 } declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone - - - define i32 @extractps_1(<4 x float> %v) nounwind { +; X32-LABEL: extractps_1: +; X32: ## BB#0: +; X32-NEXT: extractps $3, %xmm0, %eax +; X32-NEXT: retl +; +; X64-LABEL: extractps_1: +; X64: ## BB#0: +; X64-NEXT: extractps $3, %xmm0, %eax +; X64-NEXT: retq %s = extractelement <4 x float> %v, i32 3 %i = bitcast float %s to i32 ret i32 %i - -; X32: _extractps_1: -; X32: extractps $3, %xmm0, %eax - -; X64: _extractps_1: -; X64: extractps $3, %xmm0, %eax } define i32 @extractps_2(<4 x float> %v) nounwind { +; X32-LABEL: extractps_2: +; X32: ## BB#0: +; X32-NEXT: extractps $3, %xmm0, %eax +; X32-NEXT: retl +; +; X64-LABEL: extractps_2: +; X64: ## BB#0: +; X64-NEXT: extractps $3, %xmm0, %eax +; X64-NEXT: retq %t = bitcast <4 x float> %v to <4 x i32> %s = extractelement <4 x i32> %t, i32 3 ret i32 %s - -; X32: _extractps_2: -; X32: extractps $3, %xmm0, %eax - -; X64: _extractps_2: -; X64: extractps $3, %xmm0, %eax } @@ -117,106 +133,152 @@ define i32 @extractps_2(<4 x float> %v) nounwind { ; is bitcasted to i32, but unsuitable for much 
of anything else. define float @ext_1(<4 x float> %v) nounwind { +; X32-LABEL: ext_1: +; X32: ## BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X32-NEXT: addss LCPI7_0, %xmm0 +; X32-NEXT: movss %xmm0, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: ext_1: +; X64: ## BB#0: +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X64-NEXT: addss {{.*}}(%rip), %xmm0 +; X64-NEXT: retq %s = extractelement <4 x float> %v, i32 3 %t = fadd float %s, 1.0 ret float %t - -; X32: _ext_1: -; X32: pshufd $3, %xmm0, %xmm0 -; X32: addss LCPI7_0, %xmm0 - -; X64: _ext_1: -; X64: pshufd $3, %xmm0, %xmm0 -; X64: addss LCPI7_0(%rip), %xmm0 } define float @ext_2(<4 x float> %v) nounwind { +; X32-LABEL: ext_2: +; X32: ## BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X32-NEXT: movss %xmm0, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: ext_2: +; X64: ## BB#0: +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X64-NEXT: retq %s = extractelement <4 x float> %v, i32 3 ret float %s - -; X32: _ext_2: -; X32: pshufd $3, %xmm0, %xmm0 - -; X64: _ext_2: -; X64: pshufd $3, %xmm0, %xmm0 } define i32 @ext_3(<4 x i32> %v) nounwind { +; X32-LABEL: ext_3: +; X32: ## BB#0: +; X32-NEXT: pextrd $3, %xmm0, %eax +; X32-NEXT: retl +; +; X64-LABEL: ext_3: +; X64: ## BB#0: +; X64-NEXT: pextrd $3, %xmm0, %eax +; X64-NEXT: retq %i = extractelement <4 x i32> %v, i32 3 ret i32 %i - -; X32: _ext_3: -; X32: pextrd $3, %xmm0, %eax - -; X64: _ext_3: -; X64: pextrd $3, %xmm0, %eax } define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind { - %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone - ret <4 x float> %tmp1 -; X32: _insertps_1: -; X32: insertps $1, %xmm1, %xmm0 - -; X64: _insertps_1: -; X64: insertps $1, %xmm1, %xmm0 +; X32-LABEL: insertps_1: +; X32: ## BB#0: +; X32-NEXT: insertps 
{{.*#+}} xmm0 = zero,xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: insertps_1: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3] +; X64-NEXT: retq + %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone + ret <4 x float> %tmp1 } declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind { - %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0 - ret <4 x float> %tmp1 -; X32: _insertps_2: -; X32: insertps $0, 4(%esp), %xmm0 - -; X64: _insertps_2: -; X64: insertps $0, %xmm1, %xmm0 +; X32-LABEL: insertps_2: +; X32: ## BB#0: +; X32-NEXT: insertps $0, {{[0-9]+}}(%esp), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_2: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: retq + %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0 + ret <4 x float> %tmp1 } - define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind { - %tmp2 = extractelement <4 x float> %t2, i32 0 - %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0 - ret <4 x float> %tmp1 -; X32: _insertps_3: -; X32: insertps $0, %xmm1, %xmm0 - -; X64: _insertps_3: -; X64: insertps $0, %xmm1, %xmm0 +; X32-LABEL: insertps_3: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: insertps_3: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: retq + %tmp2 = extractelement <4 x float> %t2, i32 0 + %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0 + ret <4 x float> %tmp1 } define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind { - %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone - ret i32 %tmp1 -; X32: _ptestz_1: -; X32: ptest %xmm1, %xmm0 -; X32: sete %al - -; X64: _ptestz_1: -; X64: ptest %xmm1, %xmm0 -; X64: sete %al +; X32-LABEL: 
ptestz_1: +; X32: ## BB#0: +; X32-NEXT: ptest %xmm1, %xmm0 +; X32-NEXT: sete %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: ptestz_1: +; X64: ## BB#0: +; X64-NEXT: ptest %xmm1, %xmm0 +; X64-NEXT: sete %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq + %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone + ret i32 %tmp1 } define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind { - %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone - ret i32 %tmp1 -; X32: _ptestz_2: -; X32: ptest %xmm1, %xmm0 -; X32: sbbl %eax - -; X64: _ptestz_2: -; X64: ptest %xmm1, %xmm0 -; X64: sbbl %eax +; X32-LABEL: ptestz_2: +; X32: ## BB#0: +; X32-NEXT: ptest %xmm1, %xmm0 +; X32-NEXT: sbbl %eax, %eax +; X32-NEXT: andl $1, %eax +; X32-NEXT: retl +; +; X64-LABEL: ptestz_2: +; X64: ## BB#0: +; X64-NEXT: ptest %xmm1, %xmm0 +; X64-NEXT: sbbl %eax, %eax +; X64-NEXT: andl $1, %eax +; X64-NEXT: retq + %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone + ret i32 %tmp1 } define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind { - %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone - ret i32 %tmp1 -; X32: _ptestz_3: -; X32: ptest %xmm1, %xmm0 -; X32: seta %al - -; X64: _ptestz_3: -; X64: ptest %xmm1, %xmm0 -; X64: seta %al +; X32-LABEL: ptestz_3: +; X32: ## BB#0: +; X32-NEXT: ptest %xmm1, %xmm0 +; X32-NEXT: seta %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: ptestz_3: +; X64: ## BB#0: +; X64-NEXT: ptest %xmm1, %xmm0 +; X64-NEXT: seta %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq + %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone + ret i32 %tmp1 } @@ -227,6 +289,25 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone ; This used to compile to insertps $0 + insertps $16. insertps $0 is always ; pointless. 
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { +; X32-LABEL: buildvector: +; X32: ## BB#0: ## %entry +; X32-NEXT: movaps %xmm0, %xmm2 +; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] +; X32-NEXT: addss %xmm1, %xmm0 +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X32-NEXT: addss %xmm2, %xmm1 +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X32-NEXT: retl +; +; X64-LABEL: buildvector: +; X64: ## BB#0: ## %entry +; X64-NEXT: movaps %xmm0, %xmm2 +; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] +; X64-NEXT: addss %xmm1, %xmm0 +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X64-NEXT: addss %xmm2, %xmm1 +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X64-NEXT: retq entry: %tmp7 = extractelement <2 x float> %A, i32 0 %tmp5 = extractelement <2 x float> %A, i32 1 @@ -237,97 +318,124 @@ entry: %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0 %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1 ret <2 x float> %tmp9 -; X32-LABEL: buildvector: -; X32-NOT: insertps $0 -; X32: insertps $16 -; X32-NOT: insertps $0 -; X32: ret -; X64-LABEL: buildvector: -; X64-NOT: insertps $0 -; X64: insertps $16 -; X64-NOT: insertps $0 -; X64: ret } define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) { +; X32-LABEL: insertps_from_shufflevector_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $48, (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_shufflevector_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: insertps $48, (%rdi), %xmm0 +; X64-NEXT: retq entry: %0 = load <4 x float>* %pb, align 16 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> ret <4 x float> %vecinit6 -; CHECK-LABEL: insertps_from_shufflevector_1: -; CHECK-NOT: movss -; CHECK-NOT: shufps -; CHECK: insertps $48, -; CHECK: ret } define <4 x float> @insertps_from_shufflevector_2(<4 x 
float> %a, <4 x float> %b) { +; X32-LABEL: insertps_from_shufflevector_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3] +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_shufflevector_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3] +; X64-NEXT: retq entry: %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3> ret <4 x float> %vecinit6 -; CHECK-LABEL: insertps_from_shufflevector_2: -; CHECK-NOT: shufps -; CHECK: insertps $96, -; CHECK: ret } ; For loading an i32 from memory into an xmm register we use pinsrd ; instead of insertps define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) { +; X32-LABEL: pinsrd_from_shufflevector_i32: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $48, (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pinsrd_from_shufflevector_i32: +; X64: ## BB#0: ## %entry +; X64-NEXT: insertps $48, (%rdi), %xmm0 +; X64-NEXT: retq entry: %0 = load <4 x i32>* %pb, align 16 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> ret <4 x i32> %vecinit6 -; CHECK-LABEL: pinsrd_from_shufflevector_i32: -; CHECK-NOT: movss -; CHECK-NOT: shufps -; CHECK: pinsrd $3, -; CHECK: ret } define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) { +; X32-LABEL: insertps_from_shufflevector_i32_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[3],xmm0[2,3] +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_shufflevector_i32_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[3],xmm0[2,3] +; X64-NEXT: retq entry: %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3> ret <4 x i32> %vecinit6 -; CHECK-LABEL: insertps_from_shufflevector_i32_2: -; CHECK-NOT: shufps -; CHECK-NOT: movaps -; CHECK: insertps $208, 
-; CHECK: ret } define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) { -; CHECK-LABEL: insertps_from_load_ins_elt_undef: -; CHECK-NOT: movss -; CHECK-NOT: shufps -; CHECK: insertps $16, -; CHECK: ret +; X32-LABEL: insertps_from_load_ins_elt_undef: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $16, (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_load_ins_elt_undef: +; X64: ## BB#0: +; X64-NEXT: insertps $16, (%rdi), %xmm0 +; X64-NEXT: retq %1 = load float* %b, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3> ret <4 x float> %result } -define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) { -; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32: ; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr -;; aCHECK-NOT: movd -; CHECK-NOT: shufps -; CHECK: insertps $32, -; CHECK: ret +define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) { +; X32-LABEL: insertps_from_load_ins_elt_undef_i32: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movd (%eax), %xmm1 +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_load_ins_elt_undef_i32: +; X64: ## BB#0: +; X64-NEXT: movd (%rdi), %xmm1 +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X64-NEXT: retq %1 = load i32* %b, align 4 %2 = insertelement <4 x i32> undef, i32 %1, i32 0 %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3> ret <4 x i32> %result } -;;;;;; Shuffles optimizable with a single insertps instruction +;;;;;; Shuffles optimizable with a single insertps or blend instruction define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_XYZ0: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $8 -; CHECK: ret 
+; X32-LABEL: shuf_XYZ0: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm1, %xmm1 +; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; X32-NEXT: retl +; +; X64-LABEL: shuf_XYZ0: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecext1 = extractelement <4 x float> %x, i32 1 @@ -339,11 +447,15 @@ define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_XY00: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $12 -; CHECK: ret +; X32-LABEL: shuf_XY00: +; X32: ## BB#0: +; X32-NEXT: movq %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: shuf_XY00: +; X64: ## BB#0: +; X64-NEXT: movq %xmm0, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecext1 = extractelement <4 x float> %x, i32 1 @@ -354,11 +466,15 @@ define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_XYY0: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $104 -; CHECK: ret +; X32-LABEL: shuf_XYY0: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero +; X32-NEXT: retl +; +; X64-LABEL: shuf_XYY0: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecext1 = extractelement <4 x float> %x, i32 1 @@ -369,9 +485,15 @@ define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_XYW0: -; CHECK: insertps $232 -; CHECK: ret +; X32-LABEL: shuf_XYW0: +; X32: ## BB#0: +; X32-NEXT: 
insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero +; X32-NEXT: retl +; +; X64-LABEL: shuf_XYW0: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecext1 = extractelement <4 x float> %x, i32 1 @@ -383,11 +505,15 @@ define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_W00W: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $198 -; CHECK: ret +; X32-LABEL: shuf_W00W: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3] +; X32-NEXT: retl +; +; X64-LABEL: shuf_W00W: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3] +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 3 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1 @@ -397,11 +523,19 @@ define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_X00A: -; CHECK-NOT: movaps -; CHECK-NOT: shufps -; CHECK: insertps $48 -; CHECK: ret +; X32-LABEL: shuf_X00A: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm2, %xmm2 +; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0] +; X32-NEXT: retl +; +; X64-LABEL: shuf_X00A: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0] +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 @@ -411,11 +545,21 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) { } define <4 x float> 
@shuf_X00X(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_X00X: -; CHECK-NOT: movaps -; CHECK-NOT: shufps -; CHECK: insertps $48 -; CHECK: ret +; X32-LABEL: shuf_X00X: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm1, %xmm1 +; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0] +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: shuf_X00X: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 @@ -425,12 +569,23 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_X0YC: -; CHECK: shufps -; CHECK-NOT: movhlps -; CHECK-NOT: shufps -; CHECK: insertps $176 -; CHECK: ret +; X32-LABEL: shuf_X0YC: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm2, %xmm2 +; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] +; X32-NEXT: movaps %xmm2, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: shuf_X0YC: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] +; X64-NEXT: movaps %xmm2, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 @@ -440,11 +595,17 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) { } define <4 x i32> 
@i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_XYZ0: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $8 -; CHECK: ret +; X32-LABEL: i32_shuf_XYZ0: +; X32: ## BB#0: +; X32-NEXT: pxor %xmm1, %xmm1 +; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_XYZ0: +; X64: ## BB#0: +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecext1 = extractelement <4 x i32> %x, i32 1 @@ -456,11 +617,15 @@ define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_XY00: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $12 -; CHECK: ret +; X32-LABEL: i32_shuf_XY00: +; X32: ## BB#0: +; X32-NEXT: movq %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_XY00: +; X64: ## BB#0: +; X64-NEXT: movq %xmm0, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecext1 = extractelement <4 x i32> %x, i32 1 @@ -471,11 +636,15 @@ define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_XYY0: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $104 -; CHECK: ret +; X32-LABEL: i32_shuf_XYY0: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_XYY0: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecext1 = extractelement <4 x i32> %x, i32 1 @@ -486,11 +655,15 @@ define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> 
@i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_XYW0: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $232 -; CHECK: ret +; X32-LABEL: i32_shuf_XYW0: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_XYW0: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecext1 = extractelement <4 x i32> %x, i32 1 @@ -502,11 +675,15 @@ define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_W00W: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $198 -; CHECK: ret +; X32-LABEL: i32_shuf_W00W: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3] +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_W00W: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3] +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 3 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1 @@ -516,11 +693,19 @@ define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_X00A: -; CHECK-NOT: movaps -; CHECK-NOT: shufps -; CHECK: insertps $48 -; CHECK: ret +; X32-LABEL: i32_shuf_X00A: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm2, %xmm2 +; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_X00A: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x 
i32> undef, i32 %vecext, i32 0 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 @@ -530,11 +715,21 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_X00X: -; CHECK-NOT: movaps -; CHECK-NOT: shufps -; CHECK: insertps $48 -; CHECK: ret +; X32-LABEL: i32_shuf_X00X: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm1, %xmm1 +; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_X00X: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 @@ -544,12 +739,23 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_X0YC: -; CHECK: shufps -; CHECK-NOT: movhlps -; CHECK-NOT: shufps -; CHECK: insertps $176 -; CHECK: ret +; X32-LABEL: i32_shuf_X0YC: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm2, %xmm2 +; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] +; X32-NEXT: movaps %xmm2, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_X0YC: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] +; X64-NEXT: movaps %xmm2, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 
%vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 @@ -560,11 +766,19 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { ;; Test for a bug in the first implementation of LowerBuildVectorv4x32 define < 4 x float> @test_insertps_no_undef(<4 x float> %x) { -; CHECK-LABEL: test_insertps_no_undef: -; CHECK: movaps %xmm0, %xmm1 -; CHECK-NEXT: insertps $8, %xmm1, %xmm1 -; CHECK-NEXT: maxps %xmm1, %xmm0 -; CHECK-NEXT: ret +; X32-LABEL: test_insertps_no_undef: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm1, %xmm1 +; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; X32-NEXT: maxps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_insertps_no_undef: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; X64-NEXT: maxps %xmm1, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecext1 = extractelement <4 x float> %x, i32 1 @@ -578,48 +792,75 @@ define < 4 x float> @test_insertps_no_undef(<4 x float> %x) { } define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) { -; CHECK-LABEL: blendvb_fallback -; CHECK: blendvb -; CHECK: ret +; X32-LABEL: blendvb_fallback: +; X32: ## BB#0: +; X32-NEXT: psllw $15, %xmm0 +; X32-NEXT: psraw $15, %xmm0 +; X32-NEXT: pblendvb %xmm1, %xmm2 +; X32-NEXT: movdqa %xmm2, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: blendvb_fallback: +; X64: ## BB#0: +; X64-NEXT: psllw $15, %xmm0 +; X64-NEXT: psraw $15, %xmm0 +; X64-NEXT: pblendvb %xmm1, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: retq %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y ret <8 x i16> %ret } -define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) { -; CHECK-LABEL: insertps_from_vector_load: ; On X32, account for the argument's move to registers -; X32: movl 4(%esp), %eax -; CHECK-NOT: mov -; CHECK: insertps $48 -; CHECK-NEXT: ret +define <4 x 
float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) { +; X32-LABEL: insertps_from_vector_load: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $48, (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_vector_load: +; X64: ## BB#0: +; X64-NEXT: insertps $48, (%rdi), %xmm0 +; X64-NEXT: retq %1 = load <4 x float>* %pb, align 16 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48) ret <4 x float> %2 } ;; Use a non-zero CountS for insertps -define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) { -; CHECK-LABEL: insertps_from_vector_load_offset: -; On X32, account for the argument's move to registers -; X32: movl 4(%esp), %eax -; CHECK-NOT: mov ;; Try to match a bit more of the instr, since we need the load's offset. -; CHECK: insertps $96, 4(%{{...}}), % -; CHECK-NEXT: ret +define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) { +; X32-LABEL: insertps_from_vector_load_offset: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $96, 4(%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_vector_load_offset: +; X64: ## BB#0: +; X64-NEXT: insertps $96, 4(%rdi), %xmm0 +; X64-NEXT: retq %1 = load <4 x float>* %pb, align 16 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96) ret <4 x float> %2 } -define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) { -; CHECK-LABEL: insertps_from_vector_load_offset_2: -; On X32, account for the argument's move to registers -; X32: movl 4(%esp), %eax -; X32: movl 8(%esp), %ecx -; CHECK-NOT: mov ;; Try to match a bit more of the instr, since we need the load's offset. 
-; CHECK: insertps $192, 12(%{{...}},%{{...}}), % -; CHECK-NEXT: ret +define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) { +; X32-LABEL: insertps_from_vector_load_offset_2: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: shll $4, %ecx +; X32-NEXT: insertps $-64, 12(%eax,%ecx), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_vector_load_offset_2: +; X64: ## BB#0: +; X64-NEXT: shlq $4, %rsi +; X64-NEXT: insertps $-64, 12(%rdi,%rsi), %xmm0 +; X64-NEXT: retq %1 = getelementptr inbounds <4 x float>* %pb, i64 %index %2 = load <4 x float>* %1, align 16 %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192) @@ -627,13 +868,21 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa } define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) { -; CHECK-LABEL: insertps_from_broadcast_loadf32: -; On X32, account for the arguments' move to registers -; X32: movl 8(%esp), %eax -; X32: movl 4(%esp), %ecx -; CHECK-NOT: mov -; CHECK: insertps $48 -; CHECK-NEXT: ret +; X32-LABEL: insertps_from_broadcast_loadf32: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movss (%ecx,%eax,4), %xmm1 +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_broadcast_loadf32: +; X64: ## BB#0: +; X64-NEXT: movss (%rdi,%rsi,4), %xmm1 +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: retq %1 = getelementptr inbounds float* %fb, i64 %index %2 = load float* %1, align 4 %3 = insertelement <4 x float> undef, float %2, i32 0 @@ -645,12 +894,20 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* 
nocap } define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) { -; CHECK-LABEL: insertps_from_broadcast_loadv4f32: -; On X32, account for the arguments' move to registers -; X32: movl 4(%esp), %{{...}} -; CHECK-NOT: mov -; CHECK: insertps $48 -; CHECK-NEXT: ret +; X32-LABEL: insertps_from_broadcast_loadv4f32: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movups (%eax), %xmm1 +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_broadcast_loadv4f32: +; X64: ## BB#0: +; X64-NEXT: movups (%rdi), %xmm1 +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: retq %1 = load <4 x float>* %b, align 4 %2 = extractelement <4 x float> %1, i32 0 %3 = insertelement <4 x float> undef, float %2, i32 0 @@ -663,20 +920,33 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float ;; FIXME: We're emitting an extraneous pshufd/vbroadcast. 
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) { -; CHECK-LABEL: insertps_from_broadcast_multiple_use: -; On X32, account for the arguments' move to registers -; X32: movl 8(%esp), %eax -; X32: movl 4(%esp), %ecx -; CHECK: movss -; CHECK-NOT: mov -; CHECK: insertps $48 -; CHECK: insertps $48 -; CHECK: insertps $48 -; CHECK: insertps $48 -; CHECK: addps -; CHECK: addps -; CHECK: addps -; CHECK-NEXT: ret +; X32-LABEL: insertps_from_broadcast_multiple_use: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movss (%ecx,%eax,4), %xmm4 +; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] +; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] +; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] +; X32-NEXT: addps %xmm1, %xmm0 +; X32-NEXT: addps %xmm2, %xmm3 +; X32-NEXT: addps %xmm3, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_broadcast_multiple_use: +; X64: ## BB#0: +; X64-NEXT: movss (%rdi,%rsi,4), %xmm4 +; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] +; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] +; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] +; X64-NEXT: addps %xmm1, %xmm0 +; X64-NEXT: addps %xmm2, %xmm3 +; X64-NEXT: addps %xmm3, %xmm0 +; X64-NEXT: retq %1 = getelementptr inbounds float* %fb, i64 %index %2 = load float* %1, align 4 %3 = insertelement <4 x float> undef, float %2, i32 0 @@ -694,10 +964,20 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl } define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) { -; CHECK-LABEL: insertps_with_undefs: -; CHECK-NOT: shufps -; CHECK: 
insertps $32, %xmm0 -; CHECK: ret +; X32-LABEL: insertps_with_undefs: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movss (%eax), %xmm1 +; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3] +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_with_undefs: +; X64: ## BB#0: +; X64-NEXT: movss (%rdi), %xmm1 +; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq %1 = load float* %b, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7> @@ -707,10 +987,162 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) { ; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using ; the destination index to change the load, instead of the source index. define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) { -; CHECK-LABEL: pr20087: -; CHECK: insertps $48 -; CHECK: ret +; X32-LABEL: pr20087: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $-78, 8(%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pr20087: +; X64: ## BB#0: +; X64-NEXT: insertps $-78, 8(%rdi), %xmm0 +; X64-NEXT: retq %load = load <4 x float> *%ptr %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2> ret <4 x float> %ret } + +; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1> +define void @insertps_pr20411(i32* noalias nocapture %RET) #1 { +; X32-LABEL: insertps_pr20411: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; X32-NEXT: insertps $-36, LCPI49_1+12, %xmm0 +; X32-NEXT: movups %xmm0, (%eax) +; X32-NEXT: retl +; +; X64-LABEL: insertps_pr20411: +; X64: ## BB#0: +; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; X64-NEXT: insertps $-36, LCPI49_1+{{.*}}(%rip), %xmm0 +; X64-NEXT: movups 
%xmm0, (%rdi) +; X64-NEXT: retq + %gather_load = shufflevector <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %shuffle109 = shufflevector <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; 4 5 6 7 + %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; 3 x x x + %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 4, i32 3, i32 undef, i32 undef> ; 3 7 x x + %ptrcast = bitcast i32* %RET to <4 x i32>* + store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4 + ret void +} + +define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_4: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero +; X32-NEXT: retl +; +; X64-LABEL: insertps_4: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1 + %vecext2 = extractelement <4 x float> %B, i32 2 + %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_5: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero +; X32-NEXT: retl +; +; X64-LABEL: insertps_5: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %B, i32 1 + %vecinit2 = 
insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_6: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero +; X32-NEXT: retl +; +; X64-LABEL: insertps_6: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 1 + %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1 + %vecext1 = extractelement <4 x float> %B, i32 2 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 + ret <4 x float> %vecinit3 +} + +define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_7: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero +; X32-NEXT: retl +; +; X64-LABEL: insertps_7: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1 + %vecext2 = extractelement <4 x float> %B, i32 1 + %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_8: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; X32-NEXT: retl +; +; X64-LABEL: insertps_8: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 
= xmm0[0],xmm1[0],zero,zero +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %B, i32 0 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_9: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_9: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 0 + %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1 + %vecext1 = extractelement <4 x float> %B, i32 2 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 + ret <4 x float> %vecinit3 +} diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll index cf88ade..cf0f999 100644 --- a/test/CodeGen/X86/stack-protector-dbginfo.ll +++ b/test/CodeGen/X86/stack-protector-dbginfo.ll @@ -10,88 +10,88 @@ ; Function Attrs: nounwind sspreq define i32 @_Z18read_response_sizev() #0 { entry: - tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23), !dbg !39 + tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !39 %0 = load i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0), align 8, !dbg !40 - tail call void @llvm.dbg.value(metadata !63, i64 0, metadata !64), !dbg !71 + tail call void 
@llvm.dbg.value(metadata !63, i64 0, metadata !64, metadata !{metadata !"0x102"}), !dbg !71 %1 = trunc i64 %0 to i32 ret i32 %1 } ; Function Attrs: nounwind readnone -declare void @llvm.dbg.value(metadata, i64, metadata) +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) attributes #0 = { sspreq } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!21, !72} -!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus] +!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \001\00\000\00\001", metadata !1, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus] !1 = metadata !{metadata !"<unknown>", metadata !"/Users/matt/ryan_bug"} !2 = metadata !{metadata !3} -!3 = metadata !{i32 786436, metadata !1, metadata !4, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ] -!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 19, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ] +!3 = metadata !{metadata !"0x4\00\0020\0032\0032\000\000\000", metadata !1, metadata !4, null, metadata !6, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ] +!4 = metadata !{metadata !"0x13\00C\0019\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ] !5 = metadata !{} !6 = metadata !{metadata !7} -!7 = metadata !{i32 786472, metadata !"max_frame_size", i64 0} ; [ DW_TAG_enumerator ] 
[max_frame_size :: 0] +!7 = metadata !{metadata !"0x28\00max_frame_size\000"} ; [ DW_TAG_enumerator ] [max_frame_size :: 0] !8 = metadata !{metadata !9, metadata !24, metadata !41, metadata !65} -!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"read_response_size", metadata !"read_response_size", metadata !"_Z18read_response_sizev", i32 27, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_Z18read_response_sizev, null, null, metadata !14, i32 27} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size] -!10 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>] -!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!9 = metadata !{metadata !"0x2e\00read_response_size\00read_response_size\00_Z18read_response_sizev\0027\000\001\000\006\00256\001\0027", metadata !1, metadata !10, metadata !11, null, i32 ()* @_Z18read_response_sizev, null, null, metadata !14} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size] +!10 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>] +!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !12 = metadata !{metadata !13} -!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] !14 = metadata !{metadata !15, metadata !19} -!15 = metadata !{i32 786688, metadata !9, metadata !"b", metadata !10, i32 28, metadata !16, i32 0, 
i32 0} ; [ DW_TAG_auto_variable ] [b] [line 28] -!16 = metadata !{i32 786451, metadata !1, null, metadata !"B", i32 16, i64 32, i64 32, i32 0, i32 0, null, metadata !17, i32 0, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ] +!15 = metadata !{metadata !"0x100\00b\0028\000", metadata !9, metadata !10, metadata !16} ; [ DW_TAG_auto_variable ] [b] [line 28] +!16 = metadata !{metadata !"0x13\00B\0016\0032\0032\000\000\000", metadata !1, null, null, metadata !17, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ] !17 = metadata !{metadata !18} -!18 = metadata !{i32 786445, metadata !1, metadata !16, metadata !"end_of_file", i32 17, i64 32, i64 32, i64 0, i32 0, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int] -!19 = metadata !{i32 786688, metadata !9, metadata !"c", metadata !10, i32 29, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [c] [line 29] +!18 = metadata !{metadata !"0xd\00end_of_file\0017\0032\0032\000\000", metadata !1, metadata !16, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int] +!19 = metadata !{metadata !"0x100\00c\0029\000", metadata !9, metadata !10, metadata !13} ; [ DW_TAG_auto_variable ] [c] [line 29] !20 = metadata !{} !21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2} !22 = metadata !{i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0)} -!23 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12] -!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long>", metadata !"min<unsigned long long>", metadata !"_ZN3__13minIyEERKT_S3_RS1_", i32 12, metadata !27, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !33, null, metadata !35, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] 
[min<unsigned long long>] -!25 = metadata !{i32 786489, metadata !26, null, metadata !"__1", i32 1} ; [ DW_TAG_namespace ] [__1] [line 1] +!23 = metadata !{metadata !"0x101\00p2\0033554444\000", metadata !24, metadata !10, metadata !32, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12] +!24 = metadata !{metadata !"0x2e\00min<unsigned long long>\00min<unsigned long long>\00_ZN3__13minIyEERKT_S3_RS1_\0012\000\001\000\006\00256\001\0012", metadata !1, metadata !25, metadata !27, null, null, metadata !33, null, metadata !35} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>] +!25 = metadata !{metadata !"0x39\00__1\001", metadata !26, null} ; [ DW_TAG_namespace ] [__1] [line 1] !26 = metadata !{metadata !"main.cpp", metadata !"/Users/matt/ryan_bug"} -!27 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!27 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !28 = metadata !{metadata !29, metadata !29, metadata !32} -!29 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ] -!30 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int] -!31 = metadata !{i32 786468, null, null, metadata !"long long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned] -!32 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long 
long unsigned int] +!29 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ] +!30 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int] +!31 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned] +!32 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int] !33 = metadata !{metadata !34} -!34 = metadata !{i32 786479, null, metadata !"_Tp", metadata !31, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ] +!34 = metadata !{metadata !"0x2f\00_Tp\000\000", null, metadata !31, null} ; [ DW_TAG_template_type_parameter ] !35 = metadata !{metadata !36, metadata !37} -!36 = metadata !{i32 786689, metadata !24, metadata !"p1", metadata !10, i32 16777228, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 12] -!37 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 12] +!36 = metadata !{metadata !"0x101\00p1\0016777228\000", metadata !24, metadata !10, metadata !29} ; [ DW_TAG_arg_variable ] [p1] [line 12] +!37 = metadata !{metadata !"0x101\00p2\0033554444\000", metadata !24, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ] [p2] [line 12] !38 = metadata !{i32 33, i32 0, metadata !9, null} !39 = metadata !{i32 12, i32 0, metadata !24, metadata !38} !40 = metadata !{i32 9, i32 0, metadata !41, metadata !59} -!41 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long, __1::A>", metadata !"min<unsigned long long, __1::A>", metadata 
!"_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_", i32 7, metadata !42, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !53, null, metadata !55, i32 8} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>] -!42 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !43, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!41 = metadata !{metadata !"0x2e\00min<unsigned long long, __1::A>\00min<unsigned long long, __1::A>\00_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_\007\000\001\000\006\00256\001\008", metadata !1, metadata !25, metadata !42, null, null, metadata !53, null, metadata !55} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>] +!42 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !43, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !43 = metadata !{metadata !29, metadata !29, metadata !32, metadata !44} -!44 = metadata !{i32 786451, metadata !1, metadata !25, metadata !"A", i32 0, i64 8, i64 8, i32 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ] +!44 = metadata !{metadata !"0x13\00A\000\008\008\000\000\000", metadata !1, metadata !25, null, metadata !45, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ] !45 = metadata !{metadata !46} -!46 = metadata !{i32 786478, metadata !1, metadata !44, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !52, i32 1} ; [ DW_TAG_subprogram ] [line 1] [operator()] -!47 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !48, i32 0, null, null, null} ; [ 
DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!46 = metadata !{metadata !"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\000\000\006\00256\001\001", metadata !1, metadata !44, metadata !47, null, null, null, i32 0, metadata !52} ; [ DW_TAG_subprogram ] [line 1] [operator()] +!47 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !48, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !48 = metadata !{metadata !13, metadata !49, metadata !50, metadata !50} -!49 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A] -!50 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ] -!51 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int] +!49 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A] +!50 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ] +!51 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int] !52 = metadata !{i32 786468} !53 = metadata !{metadata !34, metadata !54} -!54 = metadata !{i32 786479, null, metadata !"_Compare", metadata !44, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ] +!54 = metadata !{metadata !"0x2f\00_Compare\000\000", null, metadata !44, null} ; [ DW_TAG_template_type_parameter ] !55 = metadata !{metadata !56, metadata !57, metadata !58} -!56 = metadata 
!{i32 786689, metadata !41, metadata !"p1", metadata !10, i32 16777223, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 7] -!57 = metadata !{i32 786689, metadata !41, metadata !"p2", metadata !10, i32 33554439, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 7] -!58 = metadata !{i32 786689, metadata !41, metadata !"p3", metadata !10, i32 50331656, metadata !44, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p3] [line 8] +!56 = metadata !{metadata !"0x101\00p1\0016777223\000", metadata !41, metadata !10, metadata !29} ; [ DW_TAG_arg_variable ] [p1] [line 7] +!57 = metadata !{metadata !"0x101\00p2\0033554439\000", metadata !41, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ] [p2] [line 7] +!58 = metadata !{metadata !"0x101\00p3\0050331656\000", metadata !41, metadata !10, metadata !44} ; [ DW_TAG_arg_variable ] [p3] [line 8] !59 = metadata !{i32 13, i32 0, metadata !24, metadata !38} !63 = metadata !{i32 undef} -!64 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1] -!65 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !46, metadata !66, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()] +!64 = metadata !{metadata !"0x101\00p1\0033554433\000", metadata !65, metadata !10, metadata !50, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1] +!65 = metadata !{metadata !"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\001\000\006\00256\001\002", metadata !1, metadata !25, metadata !47, null, null, null, metadata !46, metadata !66} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()] !66 = metadata !{metadata !67, metadata !69, metadata !70} -!67 = metadata !{i32 786689, metadata !65, metadata !"this", null, i32 
16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0] -!68 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A] -!69 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 1] -!70 = metadata !{i32 786689, metadata !65, metadata !"", metadata !10, i32 50331650, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 2] +!67 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !65, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0] +!68 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A] +!69 = metadata !{metadata !"0x101\00p1\0033554433\000", metadata !65, metadata !10, metadata !50} ; [ DW_TAG_arg_variable ] [p1] [line 1] +!70 = metadata !{metadata !"0x101\00\0050331650\000", metadata !65, metadata !10, metadata !50} ; [ DW_TAG_arg_variable ] [line 2] !71 = metadata !{i32 1, i32 0, metadata !65, metadata !40} -!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/stack_guard_remat.ll b/test/CodeGen/X86/stack_guard_remat.ll new file mode 100644 index 0000000..dd639a7 --- /dev/null +++ b/test/CodeGen/X86/stack_guard_remat.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s -check-prefix=CHECK + +;CHECK: foo2 +;CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), [[R0:%[a-z0-9]+]] +;CHECK: movq ([[R0]]), {{%[a-z0-9]+}} + +; Function Attrs: nounwind ssp uwtable +define i32 @test_stack_guard_remat() #0 { +entry: + %a1 = alloca [256 x i32], align 16 + %0 = bitcast [256 x i32]* %a1 to i8* + call void @llvm.lifetime.start(i64 1024, i8* %0) + %arraydecay = 
getelementptr inbounds [256 x i32]* %a1, i64 0, i64 0 + call void @foo3(i32* %arraydecay) + call void asm sideeffect "foo2", "~{r12},~{r13},~{r14},~{r15},~{ebx},~{esi},~{edi},~{dirflag},~{fpsr},~{flags}"() + call void @llvm.lifetime.end(i64 1024, i8* %0) + ret i32 0 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) + +declare void @foo3(i32*) + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/X86/stackmap-fast-isel.ll b/test/CodeGen/X86/stackmap-fast-isel.ll index 0b7e6db..dfb16ad 100644 --- a/test/CodeGen/X86/stackmap-fast-isel.ll +++ b/test/CodeGen/X86/stackmap-fast-isel.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim -fast-isel -fast-isel-abort | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort | FileCheck %s ; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps ; CHECK-NEXT: __LLVM_StackMaps: diff --git a/test/CodeGen/X86/stackmap-large-constants.ll b/test/CodeGen/X86/stackmap-large-constants.ll new file mode 100644 index 0000000..73ee4f3 --- /dev/null +++ b/test/CodeGen/X86/stackmap-large-constants.ll @@ -0,0 +1,83 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s + +; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps +; CHECK-NEXT: __LLVM_StackMaps: +; version +; CHECK-NEXT: .byte 1 +; reserved +; CHECK-NEXT: .byte 0 +; reserved +; CHECK-NEXT: .short 0 +; # functions +; CHECK-NEXT: .long 2 +; # constants +; 
CHECK-NEXT: .long 2 +; # records +; CHECK-NEXT: .long 2 +; function address & stack size +; CHECK-NEXT: .quad _foo +; CHECK-NEXT: .quad 8 +; function address & stack size +; CHECK-NEXT: .quad _bar +; CHECK-NEXT: .quad 8 + +; Constants Array: +; CHECK-NEXT: .quad 9223372036854775807 +; CHECK-NEXT: .quad -9223372036854775808 + +; Patchpoint ID +; CHECK-NEXT: .quad 0 +; Instruction offset +; CHECK-NEXT: .long L{{.*}}-_foo +; reserved +; CHECK-NEXT: .short 0 +; # locations +; CHECK-NEXT: .short 1 +; ConstantIndex +; CHECK-NEXT: .byte 5 +; reserved +; CHECK-NEXT: .byte 8 +; Dwarf RegNum +; CHECK-NEXT: .short 0 +; Offset +; CHECK-NEXT: .long 0 +; padding +; CHECK-NEXT: .short 0 +; NumLiveOuts +; CHECK-NEXT: .short 0 + +; CHECK-NEXT: .align 3 + +declare void @llvm.experimental.stackmap(i64, i32, ...) + +define void @foo() { + tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 9223372036854775807) + ret void +} + +; Patchpoint ID +; CHECK-NEXT: .quad 0 +; Instruction Offset +; CHECK-NEXT: .long L{{.*}}-_bar +; reserved +; CHECK-NEXT: .short 0 +; # locations +; CHECK-NEXT: .short 1 +; ConstantIndex +; CHECK-NEXT: .byte 5 +; reserved +; CHECK-NEXT: .byte 8 +; Dwarf RegNum +; CHECK-NEXT: .short 0 +; Offset +; CHECK-NEXT: .long 1 +; padding +; CHECK-NEXT: .short 0 +; NumLiveOuts +; CHECK-NEXT: .short 0 + + +define void @bar() { + tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 -9223372036854775808) + ret void +} diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll index 897595d..31553c0 100644 --- a/test/CodeGen/X86/stackmap-liveness.ll +++ b/test/CodeGen/X86/stackmap-liveness.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-patchpoint-liveness=false | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim | FileCheck -check-prefix=PATCH %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin 
-mcpu=corei7-avx -enable-patchpoint-liveness=false | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck -check-prefix=PATCH %s ; ; Note: Print verbose stackmaps using -debug-only=stackmaps. diff --git a/test/CodeGen/X86/stackmap-nops.ll b/test/CodeGen/X86/stackmap-nops.ll index 5a78f24..7932c0d 100644 --- a/test/CodeGen/X86/stackmap-nops.ll +++ b/test/CodeGen/X86/stackmap-nops.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s define void @nop_test() { entry: @@ -224,6 +224,10 @@ entry: tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 28, i32 28) tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 29, i32 29) tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 30, i32 30) +; Add an extra stackmap with a zero-length shadow to thwart the shadow +; optimization. This will force all 15 bytes of the previous shadow to be +; padded with nops. + tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 31, i32 0) ret void } diff --git a/test/CodeGen/X86/stackmap-shadow-optimization.ll b/test/CodeGen/X86/stackmap-shadow-optimization.ll new file mode 100644 index 0000000..a3725f2 --- /dev/null +++ b/test/CodeGen/X86/stackmap-shadow-optimization.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s + +; Check that the X86 stackmap shadow optimization is only outputting a 3-byte +; nop here. 8-bytes are requested, but 5 are covered by the code for the call to +; bar. However, the frame teardown and the return do not count towards the +; stackmap shadow as the call return counts as a branch target so must flush +; the shadow. +; Note that in order for a thread to not return in to the patched space +; the call must be at the end of the shadow, so the required nop must be +; before the call, not after. 
+define void @shadow_optimization_test() { +entry: +; CHECK-LABEL: shadow_optimization_test: +; CHECK: callq _bar +; CHECK: nop +; CHECK: callq _bar +; CHECK-NOT: nop +; CHECK: callq _bar +; CHECK-NOT: nop + call void @bar() + tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 8) + call void @bar() + call void @bar() + ret void +} + +declare void @llvm.experimental.stackmap(i64, i32, ...) +declare void @bar() diff --git a/test/CodeGen/X86/stackmap.ll b/test/CodeGen/X86/stackmap.ll index 8567037..5e356f3 100644 --- a/test/CodeGen/X86/stackmap.ll +++ b/test/CodeGen/X86/stackmap.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s ; ; Note: Print verbose stackmaps using -debug-only=stackmaps. @@ -9,11 +9,11 @@ ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .short 0 ; Num Functions -; CHECK-NEXT: .long 15 +; CHECK-NEXT: .long 16 ; Num LargeConstants ; CHECK-NEXT: .long 3 ; Num Callsites -; CHECK-NEXT: .long 19 +; CHECK-NEXT: .long 20 ; Functions and stack size ; CHECK-NEXT: .quad _constantargs @@ -46,6 +46,8 @@ ; CHECK-NEXT: .quad 8 ; CHECK-NEXT: .quad _clobberScratch ; CHECK-NEXT: .quad 56 +; CHECK-NEXT: .quad _needsStackRealignment +; CHECK-NEXT: .quad -1 ; Large Constants ; CHECK-NEXT: .quad 2147483648 @@ -464,6 +466,23 @@ define void @clobberScratch(i32 %a) { ret void } +; A stack frame which needs to be realigned at runtime (to meet alignment +; criteria for values on the stack) does not have a fixed frame size. +; CHECK-LABEL: .long L{{.*}}-_needsStackRealignment +; CHECK-NEXT: .short 0 +; 0 locations +; CHECK-NEXT: .short 0 +define void @needsStackRealignment() { + %val = alloca i64, i32 3, align 128 + tail call void (...)* @escape_values(i64* %val) +; Note: Adding any non-constant to the stackmap would fail because we +; expected to be able to address off the frame pointer. 
In a realigned +; frame, we must use the stack pointer instead. This is a separate bug. + tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0) + ret void +} +declare void @escape_values(...) + declare void @llvm.experimental.stackmap(i64, i32, ...) declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll index 7557f25..e3cc2fa 100644 --- a/test/CodeGen/X86/store-narrow.ll +++ b/test/CodeGen/X86/store-narrow.ll @@ -34,8 +34,8 @@ entry: ; X64: movb %sil, 1(%rdi) ; X32-LABEL: test2: -; X32: movb 8(%esp), %[[REG:[abcd]l]] -; X32: movb %[[REG]], 1(%{{.*}}) +; X32: movb 8(%esp), %[[REG:[abcd]]]l +; X32: movb %[[REG]]l, 1(%{{.*}}) } define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp { @@ -67,8 +67,8 @@ entry: ; X64: movw %si, 2(%rdi) ; X32-LABEL: test4: -; X32: movl 8(%esp), %e[[REG:[abcd]x]] -; X32: movw %[[REG]], 2(%{{.*}}) +; X32: movw 8(%esp), %[[REG:[abcd]]]x +; X32: movw %[[REG]]x, 2(%{{.*}}) } define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp { @@ -84,8 +84,8 @@ entry: ; X64: movw %si, 2(%rdi) ; X32-LABEL: test5: -; X32: movzwl 8(%esp), %e[[REG:[abcd]x]] -; X32: movw %[[REG]], 2(%{{.*}}) +; X32: movw 8(%esp), %[[REG:[abcd]]]x +; X32: movw %[[REG]]x, 2(%{{.*}}) } define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp { diff --git a/test/CodeGen/X86/swizzle-2.ll b/test/CodeGen/X86/swizzle-2.ll index 4b1f903..697af84 100644 --- a/test/CodeGen/X86/swizzle-2.ll +++ b/test/CodeGen/X86/swizzle-2.ll @@ -8,508 +8,433 @@ ; illegal shuffle that is expanded into a sub-optimal sequence of instructions ; during lowering stage. 
- define <4 x i32> @swizzle_1(<4 x i32> %v) { +; CHECK-LABEL: swizzle_1: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_1 -; Mask: [1,0,3,2] -; CHECK: pshufd $-79 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_2(<4 x i32> %v) { +; CHECK-LABEL: swizzle_2: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_2 -; Mask: [2,1,3,0] -; CHECK: pshufd $54 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_3(<4 x i32> %v) { +; CHECK-LABEL: swizzle_3: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_3 -; Mask: [1,0,3,2] -; CHECK: pshufd $-79 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_4(<4 x i32> %v) { +; CHECK-LABEL: swizzle_4: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_4 -; Mask: [3,1,0,2] -; CHECK: pshufd $-121 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_5(<4 x i32> %v) { +; CHECK-LABEL: swizzle_5: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[2,3,0,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_5 -; Mask: [2,3,0,1] -; CHECK: pshufd $78 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_6(<4 x i32> %v) { +; CHECK-LABEL: swizzle_6: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_6 -; Mask: [2,0,1,3] -; CHECK: pshufd $-46 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_7(<4 x i32> %v) { +; CHECK-LABEL: swizzle_7: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_7 -; Mask: [0,2,3,1] -; CHECK: pshufd $120 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_8(<4 x i32> %v) { +; CHECK-LABEL: swizzle_8: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_8 -; Mask: [1,3,2,0] -; CHECK: pshufd $45 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_9(<4 x i32> %v) { +; CHECK-LABEL: swizzle_9: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> %2 = 
shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_9 -; Mask: [2,3,0,1] -; CHECK: pshufd $78 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_10(<4 x i32> %v) { +; CHECK-LABEL: swizzle_10: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,0,3] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_10 -; Mask: [1,2,0,3] -; CHECK: pshufd $-55 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_11(<4 x i32> %v) { +; CHECK-LABEL: swizzle_11: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_11 -; Mask: [3,2,1,0] -; CHECK: pshufd $27 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_12(<4 x i32> %v) { +; CHECK-LABEL: swizzle_12: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_12 -; Mask: [0,3,1,2] -; CHECK: pshufd $-100 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_13(<4 x i32> %v) { +; CHECK-LABEL: swizzle_13: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2> ret <4 x i32> %2 } -; CHECK-LABEL: 
swizzle_13 -; Mask: [3,2,1,0] -; CHECK: pshufd $27 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_14(<4 x i32> %v) { +; CHECK-LABEL: swizzle_14: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,2,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_14 -; Mask: [3,0,2,1] -; CHECK: pshufd $99 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_15(<4 x float> %v) { +; CHECK-LABEL: swizzle_15: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_15 -; Mask: [1,0,3,2] -; CHECK: pshufd $-79 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_16(<4 x float> %v) { +; CHECK-LABEL: swizzle_16: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_16 -; Mask: [2,1,3,0] -; CHECK: pshufd $54 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_17(<4 x float> %v) { +; CHECK-LABEL: swizzle_17: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_17 -; Mask: [1,0,3,2] -; CHECK: pshufd $-79 -; CHECK-NOT: pshufd -; 
CHECK-NEXT: ret - define <4 x float> @swizzle_18(<4 x float> %v) { +; CHECK-LABEL: swizzle_18: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,0,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_18 -; Mask: [3,1,0,2] -; CHECK: pshufd $-121 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_19(<4 x float> %v) { +; CHECK-LABEL: swizzle_19: +; CHECK: # BB#0: +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_19 -; Mask: [2,3,0,1] -; CHECK: pshufd $78 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_20(<4 x float> %v) { +; CHECK-LABEL: swizzle_20: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_20 -; Mask: [2,0,1,3] -; CHECK: pshufd $-46 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_21(<4 x float> %v) { +; CHECK-LABEL: swizzle_21: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_21 -; Mask: [0,2,3,1] -; CHECK: pshufd $120 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_22(<4 x float> %v) { 
+; CHECK-LABEL: swizzle_22: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_22 -; Mask: [1,3,2,0] -; CHECK: pshufd $45 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_23(<4 x float> %v) { +; CHECK-LABEL: swizzle_23: +; CHECK: # BB#0: +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_23 -; Mask: [2,3,0,1] -; CHECK: pshufd $78 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_24(<4 x float> %v) { +; CHECK-LABEL: swizzle_24: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2,0,3] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_24 -; Mask: [1,2,0,3] -; CHECK: pshufd $-55 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_25(<4 x float> %v) { +; CHECK-LABEL: swizzle_25: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_25 -; Mask: [3,2,1,0] -; CHECK: pshufd $27 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_26(<4 x float> %v) { +; CHECK-LABEL: swizzle_26: +; CHECK: # BB#0: +; CHECK-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[0,3,1,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_26 -; Mask: [0,3,1,2] -; CHECK: pshufd $-100 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_27(<4 x float> %v) { +; CHECK-LABEL: swizzle_27: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_27 -; Mask: [3,2,1,0] -; CHECK: pshufd $27 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_28(<4 x float> %v) { +; CHECK-LABEL: swizzle_28: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,2,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_28 -; Mask: [3,0,2,1] -; CHECK: pshufd $99 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_29(<4 x float> %v) { +; CHECK-LABEL: swizzle_29: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_29 -; Mask: [1,3,2,0] -; CHECK: pshufd $45 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret ; Make sure that we combine the shuffles from each function below into a single ; legal shuffle (either pshuflw or pshufb depending on the masks). 
define <8 x i16> @swizzle_30(<8 x i16> %v) { +; CHECK-LABEL: swizzle_30: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_30 -; Mask: [1,3,2,0,5,7,6,4] -; CHECK: pshuflw $45 -; CHECK-NOT: pshufb -; CHECK-NEXT: ret - define <8 x i16> @swizzle_31(<8 x i16> %v) { +; CHECK-LABEL: swizzle_31: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_31 -; Mask: [1,3,2,0,4,5,6,7] -; CHECK: pshuflw $45 -; CHECK-NOT: pshufb -; CHECK: ret - define <8 x i16> @swizzle_32(<8 x i16> %v) { +; CHECK-LABEL: swizzle_32: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_32 -; Mask: [2,3,0,1,4,5,6,7] --> equivalent to pshufd mask [1,0,2,3] -; CHECK: pshufd $-31 -; CHECK-NOT: pshufb -; CHECK: ret define <8 x i16> @swizzle_33(<8 x i16> %v) { +; CHECK-LABEL: swizzle_33: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0> 
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_33 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_34(<8 x i16> %v) { +; CHECK-LABEL: swizzle_34: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,0,2,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_34 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_35(<8 x i16> %v) { +; CHECK-LABEL: swizzle_35: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_35 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK: ret - define <8 x i16> @swizzle_36(<8 x i16> %v) { +; CHECK-LABEL: swizzle_36: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_36 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> 
@swizzle_37(<8 x i16> %v) { +; CHECK-LABEL: swizzle_37: +; CHECK: # BB#0: +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 5, i32 6, i32 4> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 4, i32 6, i32 5> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_37 -; Mask: [0,1,2,3,4,7,6,5] -; CHECK: pshufhw $108 -; CHECK-NOT: pshufb -; CHECK: ret - define <8 x i16> @swizzle_38(<8 x i16> %v) { +; CHECK-LABEL: swizzle_38: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_38 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_39(<8 x i16> %v) { +; CHECK-LABEL: swizzle_39: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,1,0,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_39 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_40(<8 x i16> %v) { +; CHECK-LABEL: swizzle_40: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, 
i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_40 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_41(<8 x i16> %v) { +; CHECK-LABEL: swizzle_41: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_41 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_42(<8 x i16> %v) { +; CHECK-LABEL: swizzle_42: +; CHECK: # BB#0: +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_42 -; Mask: [0,1,2,3,5,4,7,6] -; CHECK: pshufhw $-79 -; CHECK-NOT: pshufb -; CHECK: ret - - diff --git a/test/CodeGen/X86/swizzle.ll b/test/CodeGen/X86/swizzle.ll deleted file mode 100644 index 23e0c24..0000000 --- a/test/CodeGen/X86/swizzle.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movlps -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movsd -; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep movups -; rdar://6523650 - - %struct.vector4_t = type { <4 x float> } - -define void @swizzle(i8* nocapture %a, %struct.vector4_t* nocapture %b, %struct.vector4_t* nocapture %c) nounwind { -entry: - %0 = getelementptr %struct.vector4_t* %b, i32 0, 
i32 0 ; <<4 x float>*> [#uses=2] - %1 = load <4 x float>* %0, align 4 ; <<4 x float>> [#uses=1] - %tmp.i = bitcast i8* %a to double* ; <double*> [#uses=1] - %tmp1.i = load double* %tmp.i ; <double> [#uses=1] - %2 = insertelement <2 x double> undef, double %tmp1.i, i32 0 ; <<2 x double>> [#uses=1] - %tmp2.i = bitcast <2 x double> %2 to <4 x float> ; <<4 x float>> [#uses=1] - %3 = shufflevector <4 x float> %1, <4 x float> %tmp2.i, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1] - store <4 x float> %3, <4 x float>* %0, align 4 - ret void -} diff --git a/test/CodeGen/X86/tailcall-multiret.ll b/test/CodeGen/X86/tailcall-multiret.ll new file mode 100644 index 0000000..a77a59c --- /dev/null +++ b/test/CodeGen/X86/tailcall-multiret.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core2 | FileCheck %s +; See PR19530 +declare double @llvm.powi.f64(double %Val, i32 %power) +define <3 x double> @julia_foo17589(i32 %arg) { + %tmp1 = call double @llvm.powi.f64(double 1.000000e+00, i32 %arg) +; CHECK: callq __powidf2 + %tmp2 = insertelement <3 x double> undef, double %tmp1, i32 0 + %tmp3 = call double @llvm.powi.f64(double 2.000000e+00, i32 %arg) +; CHECK: callq __powidf2 + %tmp4 = insertelement <3 x double> %tmp2, double %tmp3, i32 1 + %tmp5 = call double @llvm.powi.f64(double 3.000000e+00, i32 %arg) +; CHECK: callq __powidf2 + %tmp6 = insertelement <3 x double> %tmp4, double %tmp5, i32 2 +; CHECK-NOT: TAILCALL + ret <3 x double> %tmp6 +} diff --git a/test/CodeGen/X86/tls-addr-non-leaf-function.ll b/test/CodeGen/X86/tls-addr-non-leaf-function.ll new file mode 100644 index 0000000..ec47232 --- /dev/null +++ b/test/CodeGen/X86/tls-addr-non-leaf-function.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -relocation-model=pic -O2 -disable-fp-elim -o - | FileCheck %s +; RUN: llc < %s -relocation-model=pic -O2 -o - | FileCheck %s + +; This test runs twice with different options regarding the frame pointer: +; first the elimination is disabled, then it is 
enabled. The disabled case is +; the "control group". +; The function 'foo' below is marked with the "no-frame-pointer-elim-non-leaf" +; attribute which dictates that the frame pointer should not be eliminated +; unless the function is a leaf (i.e. it doesn't call any other function). +; Now, 'foo' is not a leaf function, because it performs a TLS access which on +; X86 ELF in PIC mode is expanded as a library call. +; This call is represented with a pseudo-instruction which doesn't appear to be +; a call when inspected by the analysis passes (it doesn't have the "isCall" +; flag), and the ISel lowering code creating the pseudo was not informing the +; MachineFrameInfo that the function contained calls. This affected the decision +; whether to eliminate the frame pointer. +; With the fix, the "hasCalls" flag is set in the MFI for the function whenever +; a TLS access pseudo-instruction is created, so 'foo' appears to be a non-leaf +; function, and the difference in the options does not affect codegen: both +; versions will have a frame pointer. + +; Test that there's some frame pointer usage in 'foo'... +; CHECK: foo: +; CHECK: pushq %rbp +; CHECK: movq %rsp, %rbp +; ... and the TLS library call is also present. 
+; CHECK: leaq x@TLSGD(%rip), %rdi +; CHECK: callq __tls_get_addr@PLT + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@x = thread_local global i32 0 +define i32 @foo() "no-frame-pointer-elim-non-leaf" { + %a = load i32* @x, align 4 + ret i32 %a +} diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll index d230f1f..8de6297 100644 --- a/test/CodeGen/X86/trunc-ext-ld-st.ll +++ b/test/CodeGen/X86/trunc-ext-ld-st.ll @@ -20,7 +20,7 @@ define void @load_2_i8(<2 x i8>* %A) { ; Read 32-bits ;CHECK: pmovzxwq ;CHECK: paddq -;CHECK: pshufb +;CHECK: pshufd ;CHECK: movd ;CHECK: ret define void @load_2_i16(<2 x i16>* %A) { @@ -32,7 +32,7 @@ define void @load_2_i16(<2 x i16>* %A) { ;CHECK-LABEL: load_2_i32: ;CHECK: pmovzxdq -;CHECK: paddq +;CHECK: paddd ;CHECK: pshufd ;CHECK: ret define void @load_2_i32(<2 x i32>* %A) { @@ -56,7 +56,7 @@ define void @load_4_i8(<4 x i8>* %A) { ;CHECK-LABEL: load_4_i16: ;CHECK: pmovzxwd -;CHECK: paddd +;CHECK: paddw ;CHECK: pshufb ;CHECK: ret define void @load_4_i16(<4 x i16>* %A) { @@ -68,7 +68,7 @@ define void @load_4_i16(<4 x i16>* %A) { ;CHECK-LABEL: load_8_i8: ;CHECK: pmovzxbw -;CHECK: paddw +;CHECK: paddb ;CHECK: pshufb ;CHECK: ret define void @load_8_i8(<8 x i8>* %A) { diff --git a/test/CodeGen/X86/uint_to_fp-2.ll b/test/CodeGen/X86/uint_to_fp-2.ll index c5a61c3..e47f154 100644 --- a/test/CodeGen/X86/uint_to_fp-2.ll +++ b/test/CodeGen/X86/uint_to_fp-2.ll @@ -1,15 +1,20 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mattr=+sse2 | FileCheck %s ; rdar://6504833 define float @test1(i32 %x) nounwind readnone { -; CHECK: test1 -; CHECK: movd -; CHECK: orps -; CHECK: subsd -; CHECK: cvtsd2ss -; CHECK: movss -; CHECK: flds -; CHECK: ret +; CHECK-LABEL: test1: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movsd .LCPI0_0, %xmm0 +; CHECK-NEXT: movd {{[0-9]+}}(%esp), 
%xmm1 +; CHECK-NEXT: orps %xmm0, %xmm1 +; CHECK-NEXT: subsd %xmm0, %xmm1 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsd2ss %xmm1, %xmm0 +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl entry: %0 = uitofp i32 %x to float ret float %0 @@ -17,15 +22,20 @@ entry: ; PR10802 define float @test2(<4 x i32> %x) nounwind readnone ssp { -; CHECK: test2 -; CHECK: xorps [[ZERO:%xmm[0-9]+]] -; CHECK: movss {{.*}}, [[ZERO]] -; CHECK: orps -; CHECK: subsd -; CHECK: cvtsd2ss -; CHECK: movss -; CHECK: flds -; CHECK: ret +; CHECK-LABEL: test2: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movss %xmm0, %xmm1 +; CHECK-NEXT: movsd .LCPI1_0, %xmm0 +; CHECK-NEXT: orps %xmm0, %xmm1 +; CHECK-NEXT: subsd %xmm0, %xmm1 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsd2ss %xmm1, %xmm0 +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl entry: %vecext = extractelement <4 x i32> %x, i32 0 %conv = uitofp i32 %vecext to float diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll index d7ae469..ca9ea4a 100644 --- a/test/CodeGen/X86/unknown-location.ll +++ b/test/CodeGen/X86/unknown-location.ll @@ -21,16 +21,16 @@ entry: !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!12} -!0 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !2, i32 1, metadata !6} ; [ DW_TAG_arg_variable ] -!1 = metadata !{i32 786478, metadata !10, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 1, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, i32, i32, i32)* @foo, null, null, null, i32 1} ; [ DW_TAG_subprogram ] -!2 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ] -!3 = metadata !{i32 786449, metadata !10, i32 12, metadata !"producer", i1 false, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null, null, metadata !""} ; [ 
DW_TAG_compile_unit ] -!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!0 = metadata !{metadata !"0x101\00x\001\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ] +!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\001\000\001\000\006\000\000\001", metadata !10, metadata !2, metadata !4, null, i32 (i32, i32, i32, i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] +!2 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ] +!3 = metadata !{metadata !"0x11\0012\00producer\000\00\000\00\000", metadata !10, metadata !11, metadata !11, metadata !9, null, null} ; [ DW_TAG_compile_unit ] +!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !5 = metadata !{metadata !6} -!6 = metadata !{i32 786468, metadata !10, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!7 = metadata !{i32 786443, metadata !2, metadata !1, i32 1, i32 30, i32 0} ; [ DW_TAG_lexical_block ] +!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !10, metadata !2} ; [ DW_TAG_base_type ] +!7 = metadata !{metadata !"0xb\001\0030\000", metadata !2, metadata !1} ; [ DW_TAG_lexical_block ] !8 = metadata !{i32 4, i32 3, metadata !7, null} !9 = metadata !{metadata !1} !10 = metadata !{metadata !"test.c", metadata !"/dir"} !11 = metadata !{i32 0} -!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/v-binop-widen.ll b/test/CodeGen/X86/v-binop-widen.ll deleted file mode 100644 index fca4da6..0000000 --- a/test/CodeGen/X86/v-binop-widen.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc -mcpu=generic 
-march=x86 -mattr=+sse < %s | FileCheck %s -; CHECK: divps -; CHECK: divps -; CHECK: divss - -%vec = type <9 x float> -define %vec @vecdiv( %vec %p1, %vec %p2) -{ - %result = fdiv %vec %p1, %p2 - ret %vec %result -} diff --git a/test/CodeGen/X86/v-binop-widen2.ll b/test/CodeGen/X86/v-binop-widen2.ll deleted file mode 100644 index 3342111..0000000 --- a/test/CodeGen/X86/v-binop-widen2.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -march=x86 -mcpu=generic -mattr=+sse < %s | FileCheck %s -; RUN: llc -march=x86 -mcpu=atom -mattr=+sse < %s | FileCheck -check-prefix=ATOM %s - -%vec = type <6 x float> -; CHECK: divps -; CHECK: divss -; CHECK: divss - -; Scheduler causes a different instruction order to be produced on Intel Atom -; ATOM: divps -; ATOM: divss -; ATOM: divss - -define %vec @vecdiv( %vec %p1, %vec %p2) -{ - %result = fdiv %vec %p1, %p2 - ret %vec %result -} - -@a = constant %vec < float 2.0, float 4.0, float 8.0, float 16.0, float 32.0, float 64.0 > -@b = constant %vec < float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0 > - -; Expected result: < 1.0, 2.0, 4.0, ..., 2.0^(n-1) > -; main() returns 0 if the result is expected and 1 otherwise -; to execute, use llvm-as < %s | lli -define i32 @main() nounwind { -entry: - %avec = load %vec* @a - %bvec = load %vec* @b - - %res = call %vec @vecdiv(%vec %avec, %vec %bvec) - br label %loop -loop: - %idx = phi i32 [0, %entry], [%nextInd, %looptail] - %expected = phi float [1.0, %entry], [%nextExpected, %looptail] - %elem = extractelement %vec %res, i32 %idx - %expcmp = fcmp oeq float %elem, %expected - br i1 %expcmp, label %looptail, label %return -looptail: - %nextExpected = fmul float %expected, 2.0 - %nextInd = add i32 %idx, 1 - %cmp = icmp slt i32 %nextInd, 6 - br i1 %cmp, label %loop, label %return -return: - %retval = phi i32 [0, %looptail], [1, %loop] - ret i32 %retval -} diff --git a/test/CodeGen/X86/v2f32.ll b/test/CodeGen/X86/v2f32.ll index dab5e7b..b9bd80f9 100644 --- a/test/CodeGen/X86/v2f32.ll 
+++ b/test/CodeGen/X86/v2f32.ll @@ -1,115 +1,94 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -asm-verbose=0 -o - | FileCheck %s -check-prefix=X64 -; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -asm-verbose=0 -o - | FileCheck %s -check-prefix=W64 -; RUN: llc < %s -mcpu=yonah -march=x86 -mtriple=i386-linux-gnu -asm-verbose=0 -o - | FileCheck %s -check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -o - | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mcpu=yonah -march=x86 -mtriple=i386-linux-gnu -o - | FileCheck %s --check-prefix=X32 ; PR7518 define void @test1(<2 x float> %Q, float *%P2) nounwind { +; X64-LABEL: test1: +; X64: # BB#0: +; X64-NEXT: movaps %xmm0, %xmm1 +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X64-NEXT: addss %xmm0, %xmm1 +; X64-NEXT: movss %xmm1, (%rdi) +; X64-NEXT: retq +; +; X32-LABEL: test1: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movaps %xmm0, %xmm1 +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X32-NEXT: addss %xmm0, %xmm1 +; X32-NEXT: movss %xmm1, (%eax) +; X32-NEXT: retl %a = extractelement <2 x float> %Q, i32 0 %b = extractelement <2 x float> %Q, i32 1 %c = fadd float %a, %b - store float %c, float* %P2 ret void -; X64-LABEL: test1: -; X64-NEXT: pshufd $1, %xmm0, %xmm1 -; X64-NEXT: addss %xmm0, %xmm1 -; X64-NEXT: movss %xmm1, (%rdi) -; X64-NEXT: ret - -; W64-LABEL: test1: -; W64-NEXT: movdqa (%rcx), %xmm0 -; W64-NEXT: pshufd $1, %xmm0, %xmm1 -; W64-NEXT: addss %xmm0, %xmm1 -; W64-NEXT: movss %xmm1, (%rdx) -; W64-NEXT: ret - -; X32-LABEL: test1: -; X32-NEXT: movl 4(%esp), %eax -; X32-NEXT: pshufd $1, %xmm0, %xmm1 -; X32-NEXT: addss %xmm0, %xmm1 -; X32-NEXT: movss %xmm1, (%eax) -; X32-NEXT: ret } - define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, <2 x float> *%P) nounwind { - %Z = fadd <2 x float> %Q, %R - ret <2 x float> %Z - ; X64-LABEL: test2: -; X64-NEXT: addps %xmm1, %xmm0 -; X64-NEXT: ret - -; W64-LABEL: test2: -; W64-NEXT: movaps (%rcx), %xmm0 -; 
W64-NEXT: addps (%rdx), %xmm0 -; W64-NEXT: ret - +; X64: # BB#0: +; X64-NEXT: addps %xmm1, %xmm0 +; X64-NEXT: retq +; ; X32-LABEL: test2: -; X32: addps %xmm1, %xmm0 +; X32: # BB#0: +; X32-NEXT: addps %xmm1, %xmm0 +; X32-NEXT: retl + %Z = fadd <2 x float> %Q, %R + ret <2 x float> %Z } - define <2 x float> @test3(<4 x float> %A) nounwind { +; X64-LABEL: test3: +; X64: # BB#0: +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test3: +; X32: # BB#0: +; X32-NEXT: addps %xmm0, %xmm0 +; X32-NEXT: retl %B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1> %C = fadd <2 x float> %B, %B ret <2 x float> %C -; X64-LABEL: test3: -; X64-NEXT: addps %xmm0, %xmm0 -; X64-NEXT: ret - -; W64-LABEL: test3: -; W64-NEXT: movaps (%rcx), %xmm0 -; W64-NEXT: addps %xmm0, %xmm0 -; W64-NEXT: ret - -; X32-LABEL: test3: -; X32-NEXT: addps %xmm0, %xmm0 -; X32-NEXT: ret } define <2 x float> @test4(<2 x float> %A) nounwind { - %C = fadd <2 x float> %A, %A - ret <2 x float> %C ; X64-LABEL: test4: -; X64-NEXT: addps %xmm0, %xmm0 -; X64-NEXT: ret - -; W64-LABEL: test4: -; W64-NEXT: movaps (%rcx), %xmm0 -; W64-NEXT: addps %xmm0, %xmm0 -; W64-NEXT: ret - +; X64: # BB#0: +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: retq +; ; X32-LABEL: test4: -; X32-NEXT: addps %xmm0, %xmm0 -; X32-NEXT: ret +; X32: # BB#0: +; X32-NEXT: addps %xmm0, %xmm0 +; X32-NEXT: retl + %C = fadd <2 x float> %A, %A + ret <2 x float> %C } define <4 x float> @test5(<4 x float> %A) nounwind { +; X64-LABEL: test5: +; X64: # BB#0: +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test5: +; X32: # BB#0: +; X32-NEXT: addps %xmm0, %xmm0 +; X32-NEXT: addps %xmm0, %xmm0 +; X32-NEXT: retl %B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1> %C = fadd <2 x float> %B, %B - br label %BB - + br label %BB + BB: - %D = fadd <2 x float> %C, %C + %D = fadd <2 x float> %C, %C %E = shufflevector <2 x float> %D, <2 x float> undef, <4 x i32> 
<i32 0, i32 1, i32 undef, i32 undef> ret <4 x float> %E - -; X64-LABEL: test5: -; X64-NEXT: addps %xmm0, %xmm0 -; X64-NEXT: addps %xmm0, %xmm0 -; X64-NEXT: ret - -; W64-LABEL: test5: -; W64-NEXT: movaps (%rcx), %xmm0 -; W64-NEXT: addps %xmm0, %xmm0 -; W64-NEXT: addps %xmm0, %xmm0 -; W64-NEXT: ret - -; X32-LABEL: test5: -; X32-NEXT: addps %xmm0, %xmm0 -; X32-NEXT: addps %xmm0, %xmm0 -; X32-NEXT: ret } diff --git a/test/CodeGen/X86/vararg-callee-cleanup.ll b/test/CodeGen/X86/vararg-callee-cleanup.ll new file mode 100644 index 0000000..2dcf319 --- /dev/null +++ b/test/CodeGen/X86/vararg-callee-cleanup.ll @@ -0,0 +1,54 @@ +; RUN: llc -mtriple=i686-pc-windows < %s | FileCheck %s + +target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" + +declare x86_thiscallcc void @thiscall_thunk(i8* %this, ...) +define i32 @call_varargs_thiscall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) { + call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2) + call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2) + %t1 = add i32 %b, %c + %r = add i32 %t1, %d + ret i32 %r +} + +; CHECK: _call_varargs_thiscall_thunk: +; CHECK: calll _thiscall_thunk +; CHECK-NEXT: subl $8, %esp + +; We don't mangle the argument size into variadic callee cleanup functions. + +declare x86_stdcallcc void @stdcall_thunk(i8* %this, ...) +define i32 @call_varargs_stdcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) { + call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2) + call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2) + %t1 = add i32 %b, %c + %r = add i32 %t1, %d + ret i32 %r +} + +; CHECK: _call_varargs_stdcall_thunk: +; CHECK: calll _stdcall_thunk{{$}} +; CHECK-NEXT: subl $12, %esp + +declare x86_fastcallcc void @fastcall_thunk(i8* %this, ...) 
+define i32 @call_varargs_fastcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) { + call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2) + call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2) + %t1 = add i32 %b, %c + %r = add i32 %t1, %d + ret i32 %r +} + +; CHECK: _call_varargs_fastcall_thunk: +; CHECK: calll @fastcall_thunk{{$}} +; CHECK-NEXT: subl $4, %esp + +; If you actually return from such a thunk, it will only pop the non-variadic +; portion of the arguments, which is different from what the callee passes. + +define x86_stdcallcc void @varargs_stdcall_return(i32, i32, ...) { + ret void +} + +; CHECK: _varargs_stdcall_return: +; CHECK: retl $8 diff --git a/test/CodeGen/X86/vararg_no_start.ll b/test/CodeGen/X86/vararg_no_start.ll new file mode 100644 index 0000000..ab5c6fc --- /dev/null +++ b/test/CodeGen/X86/vararg_no_start.ll @@ -0,0 +1,9 @@ +; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s + +define void @foo(i8*, ...) 
{ + ret void +} +; CHECK-LABEL: {{^_?}}foo: +; CHECK-NOT: movq +; CHECK: retq diff --git a/test/CodeGen/X86/vastart-defs-eflags.ll b/test/CodeGen/X86/vastart-defs-eflags.ll index 6017753..d0c5150 100644 --- a/test/CodeGen/X86/vastart-defs-eflags.ll +++ b/test/CodeGen/X86/vastart-defs-eflags.ll @@ -14,6 +14,7 @@ entry: br i1 %tobool, label %if.end, label %if.then if.then: ; preds = %entry + call void @llvm.va_start(i8* null) br label %if.end if.end: ; preds = %entry, %if.then @@ -21,3 +22,4 @@ if.end: ; preds = %entry, %if.then ret i32 %hasflag } +declare void @llvm.va_start(i8*) nounwind diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll index 1a6c05d..8600c48 100644 --- a/test/CodeGen/X86/vec_cast2.ll +++ b/test/CodeGen/X86/vec_cast2.ll @@ -1,75 +1,177 @@ ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE -;CHECK-LABEL: foo1_8: -;CHECK: vcvtdq2ps -;CHECK: ret -; -;CHECK-WIDE-LABEL: foo1_8: -;CHECK-WIDE: vpmovzxbd %xmm0, %xmm1 -;CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1 -;CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1 -;CHECK-WIDE-NEXT: vpshufb {{.*}}, %xmm0, %xmm0 -;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0 -;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0 -;CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -;CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 -;CHECK-WIDE-NEXT: ret define <8 x float> @foo1_8(<8 x i8> %src) { +; CHECK-LABEL: foo1_8: +; CHECK: ## BB#0: +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0 +; CHECK-NEXT: vpslld $24, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-NEXT: vpslld $24, %xmm1, %xmm1 +; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo1_8: 
+; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm1 +; CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-WIDE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-WIDE-NEXT: retl %res = sitofp <8 x i8> %src to <8 x float> ret <8 x float> %res } -;CHECK-LABEL: foo1_4: -;CHECK: vcvtdq2ps -;CHECK: ret -; -;CHECK-WIDE-LABEL: foo1_4: -;CHECK-WIDE: vpmovzxbd %xmm0, %xmm0 -;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0 -;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0 -;CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 -;CHECK-WIDE-NEXT: ret define <4 x float> @foo1_4(<4 x i8> %src) { +; CHECK-LABEL: foo1_4: +; CHECK: ## BB#0: +; CHECK-NEXT: vpslld $24, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo1_4: +; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = sitofp <4 x i8> %src to <4 x float> ret <4 x float> %res } -;CHECK-LABEL: foo2_8: -;CHECK: vcvtdq2ps -;CHECK: ret -; -;CHECK-WIDE-LABEL: foo2_8: -;CHECK-WIDE: vcvtdq2ps %ymm{{.*}}, %ymm{{.*}} -;CHECK-WIDE: ret define <8 x float> @foo2_8(<8 x i8> %src) { +; CHECK-LABEL: foo2_8: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vandps LCPI2_0, %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo2_8: +; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vxorps %xmm1, 
%xmm1, %xmm1 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-WIDE-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; CHECK-WIDE-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; CHECK-WIDE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; CHECK-WIDE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; CHECK-WIDE-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; CHECK-WIDE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-WIDE-NEXT: retl %res = uitofp <8 x i8> %src to <8 x float> ret <8 x float> %res } -;CHECK-LABEL: foo2_4: -;CHECK: vcvtdq2ps -;CHECK: ret -; -;CHECK-WIDE-LABEL: foo2_4: -;CHECK-WIDE: vcvtdq2ps %xmm{{.*}}, %xmm{{.*}} -;CHECK-WIDE: ret define <4 x float> @foo2_4(<4 x i8> %src) { +; CHECK-LABEL: foo2_4: +; CHECK: ## BB#0: +; CHECK-NEXT: vandps LCPI3_0, %xmm0, %xmm0 +; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo2_4: +; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = uitofp <4 x i8> %src to <4 x float> ret <4 x float> %res } -;CHECK-LABEL: foo3_8: -;CHECK: vcvttps2dq -;CHECK: ret define 
<8 x i8> @foo3_8(<8 x float> %src) { +; CHECK-LABEL: foo3_8: +; CHECK: ## BB#0: +; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo3_8: +; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx +; CHECK-WIDE-NEXT: movzbl %cl, %ecx +; CHECK-WIDE-NEXT: orl %eax, %ecx +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx +; CHECK-WIDE-NEXT: movzbl %dl, %edx +; CHECK-WIDE-NEXT: orl %eax, %edx +; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1 +; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx +; CHECK-WIDE-NEXT: movzbl %cl, %ecx +; CHECK-WIDE-NEXT: orl %eax, %ecx +; CHECK-WIDE-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx +; CHECK-WIDE-NEXT: movzbl %cl, %ecx +; CHECK-WIDE-NEXT: orl %eax, %ecx +; CHECK-WIDE-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: retl %res = fptosi <8 x float> %src to <8 x i8> ret <8 x i8> %res } -;CHECK-LABEL: 
foo3_4: -;CHECK: vcvttps2dq -;CHECK: ret + define <4 x i8> @foo3_4(<4 x float> %src) { +; CHECK-LABEL: foo3_4: +; CHECK: ## BB#0: +; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo3_4: +; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx +; CHECK-WIDE-NEXT: movzbl %cl, %ecx +; CHECK-WIDE-NEXT: orl %eax, %ecx +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx +; CHECK-WIDE-NEXT: movzbl %dl, %edx +; CHECK-WIDE-NEXT: orl %eax, %edx +; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = fptosi <4 x float> %src to <4 x i8> ret <4 x i8> %res } diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll deleted file mode 100644 index 4da7953..0000000 --- a/test/CodeGen/X86/vec_compare-2.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc < %s -mtriple=i686-linux -mcpu=penryn | FileCheck %s - -declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - -declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone - -declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone - -define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) { -entry: -; CHECK: cfi_def_cfa_offset -; CHECK-NOT: set -; CHECK: pmovzxwq -; CHECK: pshufb - %shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1] - %cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1] - %sub322.i = sub <4 x i32> %shr.i, zeroinitializer ; <<4 x i32>> [#uses=1] - %cmp323.x = icmp slt <4 x i32> 
zeroinitializer, %sub322.i ; <<4 x i1>> [#uses=1] - %cmp323.i = sext <4 x i1> %cmp323.x to <4 x i32> ; <<4 x i32>> [#uses=1] - %or.i = or <4 x i32> %cmp318.i, %cmp323.i ; <<4 x i32>> [#uses=1] - %tmp10.i83.i = bitcast <4 x i32> %or.i to <4 x float> ; <<4 x float>> [#uses=1] - %0 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> undef, <4 x float> undef, <4 x float> %tmp10.i83.i) nounwind ; <<4 x float>> [#uses=1] - %conv.i.i15.i = bitcast <4 x float> %0 to <4 x i32> ; <<4 x i32>> [#uses=1] - %swz.i.i28.i = shufflevector <4 x i32> %conv.i.i15.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1> ; <<2 x i32>> [#uses=1] - %tmp6.i29.i = bitcast <2 x i32> %swz.i.i28.i to <4 x i16> ; <<4 x i16>> [#uses=1] - %swz.i30.i = shufflevector <4 x i16> %tmp6.i29.i, <4 x i16> undef, <2 x i32> <i32 0, i32 1> ; <<2 x i16>> [#uses=1] - store <2 x i16> %swz.i30.i, <2 x i16>* undef - unreachable - ret void -} diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll index 0aa72b1..318aca1 100644 --- a/test/CodeGen/X86/vec_ctbits.ll +++ b/test/CodeGen/X86/vec_ctbits.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll index 3cb519a..530911a 100644 --- a/test/CodeGen/X86/vec_extract-sse4.ll +++ b/test/CodeGen/X86/vec_extract-sse4.ll @@ -1,10 +1,14 @@ -; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 -o %t -; RUN: not grep extractps %t -; RUN: not grep pextrd %t -; RUN: not grep pshufd %t -; RUN: not grep movss %t +; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 | FileCheck %s define void @t1(float* %R, <4 x float>* %P1) nounwind { +; CHECK-LABEL: t1: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movss 
12(%ecx), %xmm0 +; CHECK-NEXT: movss %xmm0, (%eax) +; CHECK-NEXT: retl + %X = load <4 x float>* %P1 %tmp = extractelement <4 x float> %X, i32 3 store float %tmp, float* %R @@ -12,12 +16,31 @@ define void @t1(float* %R, <4 x float>* %P1) nounwind { } define float @t2(<4 x float>* %P1) nounwind { +; CHECK-LABEL: t2: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movapd (%eax), %xmm0 +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl + %X = load <4 x float>* %P1 %tmp = extractelement <4 x float> %X, i32 2 ret float %tmp } define void @t3(i32* %R, <4 x i32>* %P1) nounwind { +; CHECK-LABEL: t3: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl 12(%ecx), %ecx +; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: retl + %X = load <4 x i32>* %P1 %tmp = extractelement <4 x i32> %X, i32 3 store i32 %tmp, i32* %R @@ -25,6 +48,12 @@ define void @t3(i32* %R, <4 x i32>* %P1) nounwind { } define i32 @t4(<4 x i32>* %P1) nounwind { +; CHECK-LABEL: t4: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl 12(%eax), %eax +; CHECK-NEXT: retl + %X = load <4 x i32>* %P1 %tmp = extractelement <4 x i32> %X, i32 3 ret i32 %tmp diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll index 88f5a58..6df7be7 100644 --- a/test/CodeGen/X86/vec_extract.ll +++ b/test/CodeGen/X86/vec_extract.ll @@ -1,10 +1,17 @@ -; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 -o %t -; RUN: grep movss %t | count 4 -; RUN: grep movhlps %t | count 1 -; RUN: not grep pshufd %t -; RUN: grep unpckhpd %t | count 1 +; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" define void @test1(<4 x float>* %F, float* %f) nounwind { +; CHECK-LABEL: test1: +; CHECK: # BB#0: # 
%entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movaps (%ecx), %xmm0 +; CHECK-NEXT: addps %xmm0, %xmm0 +; CHECK-NEXT: movss %xmm0, (%eax) +; CHECK-NEXT: retl +entry: %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=2] %tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1] %tmp2 = extractelement <4 x float> %tmp7, i32 0 ; <float> [#uses=1] @@ -13,6 +20,18 @@ define void @test1(<4 x float>* %F, float* %f) nounwind { } define float @test2(<4 x float>* %F, float* %f) nounwind { +; CHECK-LABEL: test2: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps (%eax), %xmm0 +; CHECK-NEXT: addps %xmm0, %xmm0 +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl +entry: %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=2] %tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1] %tmp2 = extractelement <4 x float> %tmp7, i32 2 ; <float> [#uses=1] @@ -20,6 +39,14 @@ define float @test2(<4 x float>* %F, float* %f) nounwind { } define void @test3(float* %R, <4 x float>* %P1) nounwind { +; CHECK-LABEL: test3: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movss 12(%ecx), %xmm0 +; CHECK-NEXT: movss %xmm0, (%eax) +; CHECK-NEXT: retl +entry: %X = load <4 x float>* %P1 ; <<4 x float>> [#uses=1] %tmp = extractelement <4 x float> %X, i32 3 ; <float> [#uses=1] store float %tmp, float* %R @@ -27,6 +54,17 @@ define void @test3(float* %R, <4 x float>* %P1) nounwind { } define double @test4(double %A) nounwind { +; CHECK-LABEL: test4: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: subl $12, %esp +; CHECK-NEXT: calll foo +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: addsd {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: movsd %xmm0, (%esp) +; CHECK-NEXT: fldl (%esp) +; 
CHECK-NEXT: addl $12, %esp +; CHECK-NEXT: retl +entry: %tmp1 = call <2 x double> @foo( ) ; <<2 x double>> [#uses=1] %tmp2 = extractelement <2 x double> %tmp1, i32 1 ; <double> [#uses=1] %tmp3 = fadd double %tmp2, %A ; <double> [#uses=1] diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll index 82517cb..ac02acf 100644 --- a/test/CodeGen/X86/vec_fabs.ll +++ b/test/CodeGen/X86/vec_fabs.ll @@ -1,9 +1,9 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s define <2 x double> @fabs_v2f64(<2 x double> %p) { - ; CHECK: fabs_v2f64 + ; CHECK-LABEL: fabs_v2f64 ; CHECK: vandps %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p) ret <2 x double> %t @@ -12,7 +12,7 @@ declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p) define <4 x float> @fabs_v4f32(<4 x float> %p) { - ; CHECK: fabs_v4f32 + ; CHECK-LABEL: fabs_v4f32 ; CHECK: vandps %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p) ret <4 x float> %t @@ -21,7 +21,7 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p) define <4 x double> @fabs_v4f64(<4 x double> %p) { - ; CHECK: fabs_v4f64 + ; CHECK-LABEL: fabs_v4f64 ; CHECK: vandps %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p) ret <4 x double> %t @@ -30,9 +30,46 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p) define <8 x float> @fabs_v8f32(<8 x float> %p) { - ; CHECK: fabs_v8f32 + ; CHECK-LABEL: fabs_v8f32 ; CHECK: vandps %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p) ret <8 x float> %t } declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p) + +; PR20354: when generating code for a vector fabs op, +; make sure that we're only turning off the sign bit of each float value. +; No constant pool loads or vector ops are needed for the fabs of a +; bitcasted integer constant; we should just return an integer constant +; that has the sign bits turned off. 
+; +; So instead of something like this: +; movabsq (constant pool load of mask for sign bits) +; vmovq (move from integer register to vector/fp register) +; vandps (mask off sign bits) +; vmovq (move vector/fp register back to integer return register) +; +; We should generate: +; mov (put constant value in return register) + +define i64 @fabs_v2f32_1() { +; CHECK-LABEL: fabs_v2f32_1: +; CHECK: movabsq $9223372032559808512, %rax # imm = 0x7FFFFFFF00000000 +; CHECK-NEXT: retq + %bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000 + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast) + %ret = bitcast <2 x float> %fabs to i64 + ret i64 %ret +} + +define i64 @fabs_v2f32_2() { +; CHECK-LABEL: fabs_v2f32_2: +; CHECK: movl $2147483647, %eax # imm = 0x7FFFFFFF +; CHECK-NEXT: retq + %bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast) + %ret = bitcast <2 x float> %fabs to i64 + ret i64 %ret +} + +declare <2 x float> @llvm.fabs.v2f32(<2 x float> %p) diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll index d49c70e..9743f71 100644 --- a/test/CodeGen/X86/vec_fneg.ll +++ b/test/CodeGen/X86/vec_fneg.ll @@ -1,11 +1,45 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s +; FNEG is defined as subtraction from -0.0. + +; This test verifies that we use an xor with a constant to flip the sign bits; no subtraction needed. 
define <4 x float> @t1(<4 x float> %Q) { - %tmp15 = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q - ret <4 x float> %tmp15 +; CHECK-LABEL: t1: +; CHECK: xorps {{.*}}LCPI0_0{{.*}}, %xmm0 +; CHECK-NEXT: retq + %tmp = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q + ret <4 x float> %tmp } +; This test verifies that we generate an FP subtraction because "0.0 - x" is not an fneg. define <4 x float> @t2(<4 x float> %Q) { - %tmp15 = fsub <4 x float> zeroinitializer, %Q - ret <4 x float> %tmp15 +; CHECK-LABEL: t2: +; CHECK: xorps %[[X:xmm[0-9]+]], %[[X]] +; CHECK-NEXT: subps %xmm0, %[[X]] +; CHECK-NEXT: movaps %[[X]], %xmm0 +; CHECK-NEXT: retq + %tmp = fsub <4 x float> zeroinitializer, %Q + ret <4 x float> %tmp +} + +; If we're bitcasting an integer to an FP vector, we should avoid the FPU/vector unit entirely. +; Make sure that we're flipping the sign bit and only the sign bit of each float. 
+; So instead of something like this: +; movd %rdi, %xmm0 +; xorps .LCPI2_0(%rip), %xmm0 +; +; We should generate: +; movabsq (put sign bit mask in integer register)) +; xorq (flip sign bits) +; movd (move to xmm return register) + +define <2 x float> @fneg_bitcast(i64 %i) { +; CHECK-LABEL: fneg_bitcast: +; CHECK: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000 +; CHECK-NEXT: xorq %rdi, %rax +; CHECK-NEXT: movd %rax, %xmm0 +; CHECK-NEXT: retq + %bitcast = bitcast i64 %i to <2 x float> + %fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast + ret <2 x float> %fneg } diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll index 7ec07ae..b882a5e 100644 --- a/test/CodeGen/X86/vec_fpext.ll +++ b/test/CodeGen/X86/vec_fpext.ll @@ -3,6 +3,8 @@ ; PR11674 define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) { +; CHECK-LABEL: fpext_frommem: +; AVX-LABEL: fpext_frommem: entry: ; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}} ; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}} @@ -13,6 +15,8 @@ entry: } define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) { +; CHECK-LABEL: fpext_frommem4: +; AVX-LABEL: fpext_frommem4: entry: ; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}} ; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}} @@ -24,6 +28,8 @@ entry: } define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) { +; CHECK-LABEL: fpext_frommem8: +; AVX-LABEL: fpext_frommem8: entry: ; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}} ; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}} diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll index 5cb9f69..b72044a 100644 --- a/test/CodeGen/X86/vec_insert-5.ll +++ b/test/CodeGen/X86/vec_insert-5.ll @@ -2,66 +2,87 @@ ; There are no MMX operations in @t1 define void @t1(i32 %a, x86_mmx* %P) nounwind { - %tmp12 = shl i32 %a, 12 - %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1 - %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0 - %tmp23 = bitcast <2 x i32> %tmp22 to 
x86_mmx - store x86_mmx %tmp23, x86_mmx* %P - ret void - ; CHECK-LABEL: t1: -; CHECK-NOT: %mm -; CHECK: shll $12 -; CHECK-NOT: %mm +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: shll $12, %ecx +; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1] +; CHECK-NEXT: movlpd %xmm0, (%eax) +; CHECK-NEXT: retl + %tmp12 = shl i32 %a, 12 + %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1 + %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0 + %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx + store x86_mmx %tmp23, x86_mmx* %P + ret void } define <4 x float> @t2(<4 x float>* %P) nounwind { - %tmp1 = load <4 x float>* %P - %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 > - ret <4 x float> %tmp2 - ; CHECK-LABEL: t2: -; CHECK: pslldq $12 +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps (%eax), %xmm1 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; CHECK-NEXT: retl + %tmp1 = load <4 x float>* %P + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 > + ret <4 x float> %tmp2 } define <4 x float> @t3(<4 x float>* %P) nounwind { - %tmp1 = load <4 x float>* %P - %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 > - ret <4 x float> %tmp2 - ; CHECK-LABEL: t3: -; CHECK: psrldq $8 +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps (%eax), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,0] +; CHECK-NEXT: retl + %tmp1 = load <4 x float>* %P + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 > + ret <4 x float> %tmp2 } define <4 x float> 
@t4(<4 x float>* %P) nounwind { - %tmp1 = load <4 x float>* %P - %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 > - ret <4 x float> %tmp2 - ; CHECK-LABEL: t4: -; CHECK: psrldq $12 +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps (%eax), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] +; CHECK-NEXT: retl + %tmp1 = load <4 x float>* %P + %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 > + ret <4 x float> %tmp2 } define <16 x i8> @t5(<16 x i8> %x) nounwind { - %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17> - ret <16 x i8> %s - ; CHECK-LABEL: t5: -; CHECK: psrldq $1 +; CHECK: # BB#0: +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; CHECK-NEXT: retl + %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17> + ret <16 x i8> %s } define <16 x i8> @t6(<16 x i8> %x) nounwind { - %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <16 x i8> %s - ; CHECK-LABEL: t6: -; CHECK: palignr $1 +; CHECK: # BB#0: +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; CHECK-NEXT: retl + %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %s } define <16 x i8> @t7(<16 x i8> %x) nounwind { - %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2> - ret <16 x i8> %s - ; CHECK-LABEL: t7: -; CHECK: pslldq $13 +; CHECK: # BB#0: +; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; CHECK-NEXT: retl + %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2> + ret <16 x i8> %s } diff --git a/test/CodeGen/X86/vec_insert-6.ll b/test/CodeGen/X86/vec_insert-6.ll deleted file mode 100644 index 4583e19..0000000 --- a/test/CodeGen/X86/vec_insert-6.ll +++ /dev/null @@ -1,9 +0,0 @@ -; REQUIRES: asserts -; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | grep pslldq -; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -mtriple=i686-apple-darwin9 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 6 - -define <4 x float> @t3(<4 x float>* %P) nounwind { - %tmp1 = load <4 x float>* %P - %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 4, i32 4, i32 4, i32 0 > - ret <4 x float> %tmp2 -} diff --git a/test/CodeGen/X86/vec_insert.ll b/test/CodeGen/X86/vec_insert.ll deleted file mode 100644 index 0ed8f10..0000000 --- a/test/CodeGen/X86/vec_insert.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep movss | count 1 -; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | not grep pinsrw - -define void @test(<4 x float>* %F, i32 %I) nounwind { - %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=1] - %f = 
sitofp i32 %I to float ; <float> [#uses=1] - %tmp1 = insertelement <4 x float> %tmp, float %f, i32 0 ; <<4 x float>> [#uses=2] - %tmp18 = fadd <4 x float> %tmp1, %tmp1 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp18, <4 x float>* %F - ret void -} - -define void @test2(<4 x float>* %F, i32 %I, float %g) nounwind { - %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=1] - %f = sitofp i32 %I to float ; <float> [#uses=1] - %tmp1 = insertelement <4 x float> %tmp, float %f, i32 2 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp1, <4 x float>* %F - ret void -} diff --git a/test/CodeGen/X86/vec_return.ll b/test/CodeGen/X86/vec_return.ll index 2cf5dc6..f7fcd03 100644 --- a/test/CodeGen/X86/vec_return.ll +++ b/test/CodeGen/X86/vec_return.ll @@ -10,7 +10,7 @@ define <2 x double> @test() { ; Prefer a constant pool load here. ; CHECK: test2 ; CHECK-NOT: shuf -; CHECK: movaps {{.*}}CPI +; CHECK: movaps {{.*}}{{CPI|__xmm@}} define <4 x i32> @test2() nounwind { ret <4 x i32> < i32 0, i32 0, i32 1, i32 0 > } diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll index d1d7608..a13c813 100644 --- a/test/CodeGen/X86/vec_set-3.ll +++ b/test/CodeGen/X86/vec_set-3.ll @@ -1,17 +1,37 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -o %t -; RUN: grep pshufd %t | count 2 +; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s -define <4 x float> @test(float %a) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 ; <<4 x float>> [#uses=1] - %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1] - %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp6 +define <4 x float> @test(float %a) { +; CHECK-LABEL: test: +; CHECK: insertps $29, {{.*}}, %xmm0 +; CHECK-NEXT: retl + +entry: + %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 + %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2 + %tmp6 = 
insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3 + ret <4 x float> %tmp6 } -define <2 x i64> @test2(i32 %a) nounwind { - %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2 ; <<4 x i32>> [#uses=1] - %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3 ; <<4 x i32>> [#uses=1] - %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp10 +define <2 x i64> @test2(i32 %a) { +; CHECK-LABEL: test2: +; CHECK: movd {{.*}}, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; CHECK-NEXT: retl + +entry: + %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2 + %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3 + %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64> + ret <2 x i64> %tmp10 } +define <4 x float> @test3(<4 x float> %A) { +; CHECK-LABEL: test3: +; CHECK: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; CHECK-NEXT: retl + + %tmp0 = extractelement <4 x float> %A, i32 0 + %tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1 + %tmp2 = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 2 + ret <4 x float> %tmp2 +} diff --git a/test/CodeGen/X86/vec_set-5.ll b/test/CodeGen/X86/vec_set-5.ll deleted file mode 100644 index f811a74..0000000 --- a/test/CodeGen/X86/vec_set-5.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t -; RUN: grep movlhps %t | count 1 -; RUN: grep movq %t | count 2 - -define <4 x float> @test1(float %a, float %b) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1] - %tmp6 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1] - %tmp8 = insertelement <4 x float> %tmp6, float %b, i32 2 ; <<4 x float>> [#uses=1] - %tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp9 -} - -define <4 x float> @test2(float %a, float %b) nounwind { - %tmp = insertelement 
<4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1] - %tmp7 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1] - %tmp8 = insertelement <4 x float> %tmp7, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1] - %tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp9 -} - -define <2 x i64> @test3(i32 %a, i32 %b) nounwind { - %tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0 ; <<4 x i32>> [#uses=1] - %tmp6 = insertelement <4 x i32> %tmp, i32 %b, i32 1 ; <<4 x i32>> [#uses=1] - %tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2 ; <<4 x i32>> [#uses=1] - %tmp10 = insertelement <4 x i32> %tmp8, i32 0, i32 3 ; <<4 x i32>> [#uses=1] - %tmp11 = bitcast <4 x i32> %tmp10 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp11 -} diff --git a/test/CodeGen/X86/vec_set-9.ll b/test/CodeGen/X86/vec_set-9.ll deleted file mode 100644 index a739090..0000000 --- a/test/CodeGen/X86/vec_set-9.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=x86-64 -mattr=-avx,-pad-short-functions | FileCheck %s - -; CHECK: test3 -; CHECK: movd -; CHECK-NOT: movd -; CHECK: {{movlhps.*%xmm0, %xmm0}} -; CHECK-NEXT: ret - -define <2 x i64> @test3(i64 %A) nounwind { -entry: - %B = insertelement <2 x i64> undef, i64 %A, i32 1 - ret <2 x i64> %B -} - diff --git a/test/CodeGen/X86/vec_set-E.ll b/test/CodeGen/X86/vec_set-E.ll deleted file mode 100644 index d78be66..0000000 --- a/test/CodeGen/X86/vec_set-E.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movq - -define <4 x float> @t(float %X) nounwind { - %tmp11 = insertelement <4 x float> undef, float %X, i32 0 - %tmp12 = insertelement <4 x float> %tmp11, float %X, i32 1 - %tmp27 = insertelement <4 x float> %tmp12, float 0.000000e+00, i32 2 - %tmp28 = insertelement <4 x float> %tmp27, float 0.000000e+00, i32 3 - ret <4 x float> %tmp28 -} diff --git a/test/CodeGen/X86/vec_set-G.ll 
b/test/CodeGen/X86/vec_set-G.ll deleted file mode 100644 index 4a542fe..0000000 --- a/test/CodeGen/X86/vec_set-G.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss - -define fastcc void @t(<4 x float> %A) nounwind { - %tmp41896 = extractelement <4 x float> %A, i32 0 ; <float> [#uses=1] - %tmp14082 = insertelement <4 x float> < float 0.000000e+00, float undef, float undef, float undef >, float %tmp41896, i32 1 ; <<4 x float>> [#uses=1] - %tmp14083 = insertelement <4 x float> %tmp14082, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp14083, <4 x float>* null, align 16 - ret void -} diff --git a/test/CodeGen/X86/vec_set-I.ll b/test/CodeGen/X86/vec_set-I.ll deleted file mode 100644 index c5d6ab8..0000000 --- a/test/CodeGen/X86/vec_set-I.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s - -; CHECK-NOT: xorp -; CHECK: movd -; CHECK-NOT: xorp - -define void @t1() nounwind { - %tmp298.i.i = load <4 x float>* null, align 16 - %tmp304.i.i = bitcast <4 x float> %tmp298.i.i to <4 x i32> - %tmp305.i.i = and <4 x i32> %tmp304.i.i, < i32 -1, i32 0, i32 0, i32 0 > - store <4 x i32> %tmp305.i.i, <4 x i32>* null, align 16 - unreachable -} diff --git a/test/CodeGen/X86/vec_set-J.ll b/test/CodeGen/X86/vec_set-J.ll deleted file mode 100644 index d90ab85..0000000 --- a/test/CodeGen/X86/vec_set-J.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss -; PR2472 - -define <4 x i32> @a(<4 x i32> %a) nounwind { -entry: - %vecext = extractelement <4 x i32> %a, i32 0 - insertelement <4 x i32> zeroinitializer, i32 %vecext, i32 0 - %add = add <4 x i32> %a, %0 - ret <4 x i32> %add -} diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll index 322dbae..b69f90c 100644 --- a/test/CodeGen/X86/vec_setcc.ll +++ b/test/CodeGen/X86/vec_setcc.ll @@ -62,8 +62,7 @@ define <8 x i16> @v8i16_icmp_ule(<8 x i16> %a, <8 x i16> %b) nounwind readnone s ; 
SSE2-LABEL: v8i16_icmp_ule: ; SSE2: psubusw %xmm1, %xmm0 ; SSE2: pxor %xmm1, %xmm1 -; SSE2: pcmpeqw %xmm0, %xmm1 -; SSE2: movdqa %xmm1, %xmm0 +; SSE2: pcmpeqw %xmm1, %xmm0 ; SSE41-LABEL: v8i16_icmp_ule: ; SSE41: pminuw %xmm0, %xmm1 @@ -106,8 +105,7 @@ define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone s ; SSE2: pxor %xmm2, %xmm0 ; SSE2: pcmpgtd %xmm1, %xmm0 ; SSE2: pcmpeqd %xmm1, %xmm1 -; SSE2: pxor %xmm0, %xmm1 -; SSE2: movdqa %xmm1, %xmm0 +; SSE2: pxor %xmm1, %xmm0 ; SSE41-LABEL: v4i32_icmp_ule: ; SSE41: pminud %xmm0, %xmm1 diff --git a/test/CodeGen/X86/vec_sext.ll b/test/CodeGen/X86/vec_sext.ll deleted file mode 100644 index 776ddec..0000000 --- a/test/CodeGen/X86/vec_sext.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc < %s -march=x86-64 -; PR 9267 - -define<4 x i32> @func_16_32() { - %F = load <4 x i16>* undef - %G = sext <4 x i16> %F to <4 x i32> - %H = load <4 x i16>* undef - %Y = sext <4 x i16> %H to <4 x i32> - %T = add <4 x i32> %Y, %G - store <4 x i32>%T , <4 x i32>* undef - ret <4 x i32> %T -} - -define<4 x i64> @func_16_64() { - %F = load <4 x i16>* undef - %G = sext <4 x i16> %F to <4 x i64> - %H = load <4 x i16>* undef - %Y = sext <4 x i16> %H to <4 x i64> - %T = xor <4 x i64> %Y, %G - store <4 x i64>%T , <4 x i64>* undef - ret <4 x i64> %T -} - -define<4 x i64> @func_32_64() { - %F = load <4 x i32>* undef - %G = sext <4 x i32> %F to <4 x i64> - %H = load <4 x i32>* undef - %Y = sext <4 x i32> %H to <4 x i64> - %T = or <4 x i64> %Y, %G - ret <4 x i64> %T -} - -define<4 x i16> @func_8_16() { - %F = load <4 x i8>* undef - %G = sext <4 x i8> %F to <4 x i16> - %H = load <4 x i8>* undef - %Y = sext <4 x i8> %H to <4 x i16> - %T = add <4 x i16> %Y, %G - ret <4 x i16> %T -} - -define<4 x i32> @func_8_32() { - %F = load <4 x i8>* undef - %G = sext <4 x i8> %F to <4 x i32> - %H = load <4 x i8>* undef - %Y = sext <4 x i8> %H to <4 x i32> - %T = sub <4 x i32> %Y, %G - ret <4 x i32> %T -} - -define<4 x i64> @func_8_64() { - %F = load <4 
x i8>* undef - %G = sext <4 x i8> %F to <4 x i64> - %H = load <4 x i8>* undef - %Y = sext <4 x i8> %H to <4 x i64> - %T = add <4 x i64> %Y, %G - ret <4 x i64> %T -} - -define<4 x i32> @const_16_32() { - %G = sext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i32> - ret <4 x i32> %G -} - -define<4 x i64> @const_16_64() { - %G = sext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i64> - ret <4 x i64> %G -} - diff --git a/test/CodeGen/X86/vec_shuffle-11.ll b/test/CodeGen/X86/vec_shuffle-11.ll deleted file mode 100644 index 640745a..0000000 --- a/test/CodeGen/X86/vec_shuffle-11.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | not grep mov - -define <4 x i32> @test() nounwind { - %tmp131 = call <2 x i64> @llvm.x86.sse2.psrl.dq( <2 x i64> < i64 -1, i64 -1 >, i32 96 ) ; <<2 x i64>> [#uses=1] - %tmp137 = bitcast <2 x i64> %tmp131 to <4 x i32> ; <<4 x i32>> [#uses=1] - %tmp138 = and <4 x i32> %tmp137, bitcast (<2 x i64> < i64 -1, i64 -1 > to <4 x i32>) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %tmp138 -} - -declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) diff --git a/test/CodeGen/X86/vec_shuffle-14.ll b/test/CodeGen/X86/vec_shuffle-14.ll deleted file mode 100644 index 8f25197..0000000 --- a/test/CodeGen/X86/vec_shuffle-14.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-32 -; RUN: llc < %s -march=x86-64 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-64 - -define <4 x i32> @t1(i32 %a) nounwind { -entry: - %tmp = insertelement <4 x i32> undef, i32 %a, i32 0 - %tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp, <4 x i32> < i32 4, i32 1, i32 2, i32 3 > ; <<4 x i32>> [#uses=1] - ret <4 x i32> %tmp6 - -; X86-32-LABEL: t1: -; X86-32: movd 4(%esp), %xmm0 - -; X86-64-LABEL: t1: -; X86-64: movd %e{{..}}, %xmm0 -} - -define <2 x i64> @t2(i64 %a) nounwind { -entry: - %tmp = insertelement <2 x i64> undef, 
i64 %a, i32 0 - %tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %tmp, <2 x i32> < i32 2, i32 1 > ; <<4 x i32>> [#uses=1] - ret <2 x i64> %tmp6 - -; X86-32-LABEL: t2: -; X86-32: movq 4(%esp), %xmm0 - -; X86-64-LABEL: t2: -; X86-64: movd %r{{..}}, %xmm0 -} - -define <2 x i64> @t3(<2 x i64>* %a) nounwind { -entry: - %tmp4 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1] - %tmp6 = bitcast <2 x i64> %tmp4 to <4 x i32> ; <<4 x i32>> [#uses=1] - %tmp7 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp6, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x i32>> [#uses=1] - %tmp8 = bitcast <4 x i32> %tmp7 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp8 - -; X86-32-LABEL: t3: -; X86-32: movl 4(%esp) -; X86-32: movq - -; X86-64-LABEL: t3: -; X86-64: movq ({{.*}}), %xmm0 -} - -define <2 x i64> @t4(<2 x i64> %a) nounwind { -entry: - %tmp5 = bitcast <2 x i64> %a to <4 x i32> ; <<4 x i32>> [#uses=1] - %tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp5, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x i32>> [#uses=1] - %tmp7 = bitcast <4 x i32> %tmp6 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp7 - -; X86-32-LABEL: t4: -; X86-32: movq %xmm0, %xmm0 - -; X86-64-LABEL: t4: -; X86-64: movq {{.*}}, %xmm0 -} - -define <2 x i64> @t5(<2 x i64> %a) nounwind { -entry: - %tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <2 x i32> < i32 2, i32 1 > ; <<4 x i32>> [#uses=1] - ret <2 x i64> %tmp6 - -; X86-32-LABEL: t5: -; X86-32: movq %xmm0, %xmm0 - -; X86-64-LABEL: t5: -; X86-64: movq {{.*}}, %xmm0 -} diff --git a/test/CodeGen/X86/vec_shuffle-15.ll b/test/CodeGen/X86/vec_shuffle-15.ll deleted file mode 100644 index 5a9b8fd..0000000 --- a/test/CodeGen/X86/vec_shuffle-15.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 - -define <2 x i64> @t00(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 0 > - ret <2 x i64> %tmp -} - -define <2 x i64> 
@t01(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t02(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t03(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 3 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t10(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 0 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t11(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t12(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t13(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 3 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t20(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 0 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t21(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t22(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t23(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 3 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t30(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 0 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t31(<2 
x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t32(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t33(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 3 > - ret <2 x i64> %tmp -} diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll deleted file mode 100644 index 9aeb942..0000000 --- a/test/CodeGen/X86/vec_shuffle-16.ll +++ /dev/null @@ -1,43 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2 - -; sse-LABEL: t1: -; sse2-LABEL: t1: -define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind { -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp1 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer - ret <4 x float> %tmp1 -} - -; sse-LABEL: t2: -; sse2-LABEL: t2: -define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind { -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 3, i32 3, i32 3, i32 3 > - ret <4 x float> %tmp -} - -; sse-LABEL: t3: -; sse2-LABEL: t3: -define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind { -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 4, i32 4, i32 4, i32 4 > - ret <4 x float> %tmp -} - -; sse-LABEL: t4: -; sse2-LABEL: t4: -define <4 x float> @t4(<4 x float> %A, <4 x float> %B) nounwind { - -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 1, i32 3, i32 2, i32 0 > - ret <4 x 
float> %tmp -} diff --git a/test/CodeGen/X86/vec_shuffle-17.ll b/test/CodeGen/X86/vec_shuffle-17.ll deleted file mode 100644 index f2f96ba..0000000 --- a/test/CodeGen/X86/vec_shuffle-17.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s -; CHECK-NOT: xor -; CHECK: movd {{%rdi|%rcx}}, %xmm0 -; CHECK-NOT: xor -; PR2108 - -define <2 x i64> @doload64(i64 %x) nounwind { -entry: - %tmp717 = bitcast i64 %x to double ; <double> [#uses=1] - %tmp8 = insertelement <2 x double> undef, double %tmp717, i32 0 ; <<2 x double>> [#uses=1] - %tmp9 = insertelement <2 x double> %tmp8, double 0.000000e+00, i32 1 ; <<2 x double>> [#uses=1] - %tmp11 = bitcast <2 x double> %tmp9 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp11 -} - diff --git a/test/CodeGen/X86/vec_shuffle-18.ll b/test/CodeGen/X86/vec_shuffle-18.ll deleted file mode 100644 index 1104a4a..0000000 --- a/test/CodeGen/X86/vec_shuffle-18.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8.8.0 | grep mov | count 7 - - %struct.vector4_t = type { <4 x float> } - -define void @swizzle(i8* %a, %struct.vector4_t* %b, %struct.vector4_t* %c) nounwind { -entry: - %tmp9 = getelementptr %struct.vector4_t* %b, i32 0, i32 0 ; <<4 x float>*> [#uses=2] - %tmp10 = load <4 x float>* %tmp9, align 16 ; <<4 x float>> [#uses=1] - %tmp14 = bitcast i8* %a to double* ; <double*> [#uses=1] - %tmp15 = load double* %tmp14 ; <double> [#uses=1] - %tmp16 = insertelement <2 x double> undef, double %tmp15, i32 0 ; <<2 x double>> [#uses=1] - %tmp18 = bitcast <2 x double> %tmp16 to <4 x float> ; <<4 x float>> [#uses=1] - %tmp19 = shufflevector <4 x float> %tmp10, <4 x float> %tmp18, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1] - store <4 x float> %tmp19, <4 x float>* %tmp9, align 16 - %tmp28 = getelementptr %struct.vector4_t* %c, i32 0, i32 0 ; <<4 x float>*> 
[#uses=2] - %tmp29 = load <4 x float>* %tmp28, align 16 ; <<4 x float>> [#uses=1] - %tmp26 = getelementptr i8* %a, i32 8 ; <i8*> [#uses=1] - %tmp33 = bitcast i8* %tmp26 to double* ; <double*> [#uses=1] - %tmp34 = load double* %tmp33 ; <double> [#uses=1] - %tmp35 = insertelement <2 x double> undef, double %tmp34, i32 0 ; <<2 x double>> [#uses=1] - %tmp37 = bitcast <2 x double> %tmp35 to <4 x float> ; <<4 x float>> [#uses=1] - %tmp38 = shufflevector <4 x float> %tmp29, <4 x float> %tmp37, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1] - store <4 x float> %tmp38, <4 x float>* %tmp28, align 16 - ret void -} diff --git a/test/CodeGen/X86/vec_shuffle-19.ll b/test/CodeGen/X86/vec_shuffle-19.ll deleted file mode 100644 index 48db8de..0000000 --- a/test/CodeGen/X86/vec_shuffle-19.ll +++ /dev/null @@ -1,9 +0,0 @@ -; REQUIRES: asserts -; RUN: llc < %s -o /dev/null -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4 -; PR2485 - -define <4 x i32> @t(<4 x i32> %a, <4 x i32> %b) nounwind { -entry: - %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> < i32 4, i32 0, i32 0, i32 0 > ; <<4 x i32>> [#uses=1] - ret <4 x i32> %shuffle -} diff --git a/test/CodeGen/X86/vec_shuffle-20.ll b/test/CodeGen/X86/vec_shuffle-20.ll deleted file mode 100644 index 5a2c444..0000000 --- a/test/CodeGen/X86/vec_shuffle-20.ll +++ /dev/null @@ -1,8 +0,0 @@ -; REQUIRES: asserts -; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2 - -define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind { -entry: - shufflevector <4 x float> %fp0, <4 x float> %fp1, <4 x i32> < i32 0, i32 1, i32 2, i32 7 > ; <<4 x float>>:0 [#uses=1] - ret <4 x float> %0 -} diff --git a/test/CodeGen/X86/vec_shuffle-22.ll b/test/CodeGen/X86/vec_shuffle-22.ll deleted file mode 100644 index 6807e4d..0000000 --- a/test/CodeGen/X86/vec_shuffle-22.ll 
+++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=pentium-m | FileCheck %s - -define <4 x float> @t1(<4 x float> %a) nounwind { -; CHECK: movlhps - %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 > ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp1 -} - -define <4 x i32> @t2(<4 x i32>* %a) nounwind { -; CHECK: pshufd -; CHECK: ret - %tmp1 = load <4 x i32>* %a - %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 > ; <<4 x i32>> [#uses=1] - ret <4 x i32> %tmp2 -} diff --git a/test/CodeGen/X86/vec_shuffle-23.ll b/test/CodeGen/X86/vec_shuffle-23.ll deleted file mode 100644 index 2468735..0000000 --- a/test/CodeGen/X86/vec_shuffle-23.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep punpck -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pshufd - -define i32 @t() nounwind { -entry: - %a = alloca <4 x i32> ; <<4 x i32>*> [#uses=2] - %b = alloca <4 x i32> ; <<4 x i32>*> [#uses=5] - store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a - %tmp = load <4 x i32>* %a ; <<4 x i32>> [#uses=1] - store <4 x i32> %tmp, <4 x i32>* %b - %tmp1 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %tmp2 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x i32>> [#uses=1] - store <4 x i32> %punpckldq, <4 x i32>* %b - %tmp3 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %result = extractelement <4 x i32> %tmp3, i32 0 ; <i32> [#uses=1] - ret i32 %result -} diff --git a/test/CodeGen/X86/vec_shuffle-24.ll b/test/CodeGen/X86/vec_shuffle-24.ll deleted file mode 100644 index d038daf..0000000 --- a/test/CodeGen/X86/vec_shuffle-24.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s - -define i32 @t() nounwind optsize { -entry: -; CHECK: punpckldq - %a = alloca <4 x i32> ; <<4 x i32>*> [#uses=2] - %b = alloca 
<4 x i32> ; <<4 x i32>*> [#uses=5] - store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a - %tmp = load <4 x i32>* %a ; <<4 x i32>> [#uses=1] - store <4 x i32> %tmp, <4 x i32>* %b - %tmp1 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %tmp2 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x i32>> [#uses=1] - store <4 x i32> %punpckldq, <4 x i32>* %b - %tmp3 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %result = extractelement <4 x i32> %tmp3, i32 0 ; <i32> [#uses=1] - ret i32 %result -} diff --git a/test/CodeGen/X86/vec_shuffle-25.ll b/test/CodeGen/X86/vec_shuffle-25.ll deleted file mode 100644 index 3f42a13..0000000 --- a/test/CodeGen/X86/vec_shuffle-25.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=sse4.1 -o %t -; RUN: grep unpcklps %t | count 3 -; RUN: grep unpckhps %t | count 1 - -; Transpose example using the more generic vector shuffle. We return -; float8 instead of float16 since x86 can return that in register. 
-; ModuleID = 'transpose2_opt.bc' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i386-apple-cl.1.0" -@r0 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r1 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r2 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r3 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] - -define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind { -entry: - %unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpcklps14a = shufflevector <4 x float> %unpcklps14, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %unpckhps17a = shufflevector <4 x float> %unpckhps17, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %r1 = 
shufflevector <16 x float> %unpcklps14a, <16 x float> %unpckhps17a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> - %unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpcklps20a = shufflevector <4 x float> %unpcklps20, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %r2 = shufflevector <16 x float> %r1, <16 x float> %unpcklps20a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15> - %unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %unpckhps23a = shufflevector <4 x float> %unpckhps23, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %r3 = shufflevector <16 x float> %r2, <16 x float> %unpckhps23a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19> - %r4 = shufflevector <16 x float> %r3, <16 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - ret <8 x float> %r4 -} diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll deleted file mode 100644 index 00e8e73..0000000 --- a/test/CodeGen/X86/vec_shuffle-26.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse4.1 | FileCheck %s -; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s - -; Transpose example using the more generic vector shuffle. 
Return float8 -; instead of float16 -; ModuleID = 'transpose2_opt.bc' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i386-apple-cl.1.0" -@r0 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r1 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r2 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r3 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] - -define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind { -entry: -; CHECK: transpose2 -; CHECK: unpckhps -; CHECK: unpckhps -; CHECK: unpcklps -; CHECK: unpckhps -; Different instruction order for Atom. -; ATOM: transpose2 -; ATOM: unpckhps -; ATOM: unpckhps -; ATOM: unpckhps -; ATOM: unpcklps - %unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %r1 = shufflevector <4 x float> %unpcklps14, <4 x float> %unpckhps17, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > - %unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, 
i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %r2 = shufflevector <4 x float> %unpcklps20, <4 x float> %unpckhps23, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > -; %r3 = shufflevector <8 x float> %r1, <8 x float> %r2, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >; - ret <8 x float> %r2 -} - -define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind { -entry: -; movhps should happen before extractps to assure it gets the correct value. -; CHECK: lo_hi_shift -; CHECK: movhps ([[BASEREG:%[a-z]+]]), -; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) -; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) -; ATOM: lo_hi_shift -; ATOM: movhps ([[BASEREG:%[a-z]+]]), -; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) -; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) - %v.i = bitcast float* %y to <4 x float>* - %0 = load <4 x float>* %v.i, align 1 - %1 = bitcast float* %x to <1 x i64>* - %.val = load <1 x i64>* %1, align 1 - %2 = bitcast <1 x i64> %.val to <2 x float> - %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> - %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %cast.i = bitcast <4 x float> %0 to <2 x i64> - %extract.i = extractelement <2 x i64> %cast.i, i32 1 - %3 = bitcast float* %x to i64* - store i64 %extract.i, i64* %3, align 4 - %4 = bitcast <4 x float> %0 to <16 x i8> - %5 = bitcast <4 x float> %shuffle1.i to <16 x i8> - %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> - %6 = bitcast <16 
x i8> %palignr to <2 x i64> - ret <2 x i64> %6 -} diff --git a/test/CodeGen/X86/vec_shuffle-27.ll b/test/CodeGen/X86/vec_shuffle-27.ll deleted file mode 100644 index c9b2fb5..0000000 --- a/test/CodeGen/X86/vec_shuffle-27.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s - -; ModuleID = 'vec_shuffle-27.bc' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i686-apple-cl.1.0" - -define <8 x float> @my2filter4_1d(<4 x float> %a, <8 x float> %T0, <8 x float> %T1) nounwind readnone { -entry: -; CHECK: subps -; CHECK: subps -; CHECK: mulps -; CHECK: mulps -; CHECK: addps -; CHECK: addps - %tmp7 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3 > ; <<8 x float>> [#uses=1] - %sub = fsub <8 x float> %T1, %T0 ; <<8 x float>> [#uses=1] - %mul = fmul <8 x float> %sub, %tmp7 ; <<8 x float>> [#uses=1] - %add = fadd <8 x float> %mul, %T0 ; <<8 x float>> [#uses=1] - ret <8 x float> %add -} - -; Test case for r122206 -define void @test2(<4 x i64>* %ap, <4 x i64>* %bp) nounwind { -entry: -; CHECK: movdqa - %a = load <4 x i64> * %ap - %b = load <4 x i64> * %bp - %mulaa = mul <4 x i64> %a, %a - %mulbb = mul <4 x i64> %b, %b - %mulab = mul <4 x i64> %a, %b - %vect1271 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef> - %vect1272 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef> - %vect1487 = shufflevector <4 x i64> %vect1271, <4 x i64> %mulab, <4 x i32> <i32 0, i32 1, i32 2, i32 4> - %vect1488 = shufflevector <4 x i64> %vect1272, <4 x i64> %mulab, <4 x i32> <i32 0, i32 1, i32 2, i32 5> - store <4 x i64> %vect1487, <4 x i64>* %ap - store <4 x i64> %vect1488, <4 x i64>* %bp - ret void; -} diff --git a/test/CodeGen/X86/vec_shuffle-28.ll 
b/test/CodeGen/X86/vec_shuffle-28.ll deleted file mode 100644 index ebf5577..0000000 --- a/test/CodeGen/X86/vec_shuffle-28.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s - -; CHECK: pshufb -; CHECK-NOT: pshufb - -; FIXME: this test has a superfluous punpcklqdq pre-pshufb currently. -; Don't XFAIL it because it's still better than the previous code. - -; Pack various elements via shuffles. -define <8 x i16> @shuf1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > - ret <8 x i16> %tmp7 -} diff --git a/test/CodeGen/X86/vec_shuffle-30.ll b/test/CodeGen/X86/vec_shuffle-30.ll deleted file mode 100644 index f5f8842..0000000 --- a/test/CodeGen/X86/vec_shuffle-30.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s - -; CHECK: test -; Test case when creating pshufhw, we incorrectly set the higher order bit -; for an undef, -define void @test(<8 x i16>* %dest, <8 x i16> %in) nounwind { -entry: -; CHECK-NOT: vmovaps -; CHECK: vmovlpd -; CHECK: vpshufhw $-95 - %0 = load <8 x i16>* %dest - %1 = shufflevector <8 x i16> %0, <8 x i16> %in, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 13, i32 undef, i32 14, i32 14> - store <8 x i16> %1, <8 x i16>* %dest - ret void -} - -; CHECK: test2 -; A test case where we shouldn't generate a punpckldq but a pshufd and a pslldq -define void @test2(<4 x i32>* %dest, <4 x i32> %in) nounwind { -entry: -; CHECK-NOT: pslldq -; CHECK: shufps - %0 = shufflevector <4 x i32> %in, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> < i32 undef, i32 5, i32 undef, i32 2> - store <4 x i32> %0, <4 x i32>* %dest - ret void -} diff --git a/test/CodeGen/X86/vec_shuffle-31.ll b/test/CodeGen/X86/vec_shuffle-31.ll deleted file mode 100644 index bb06e15..0000000 --- a/test/CodeGen/X86/vec_shuffle-31.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: 
llc < %s -march=x86 -mcpu=core2 -o %t -; RUN: grep pshufb %t | count 1 - -define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef > - ret <8 x i16> %tmp9 -} diff --git a/test/CodeGen/X86/vec_shuffle-34.ll b/test/CodeGen/X86/vec_shuffle-34.ll deleted file mode 100644 index d057b3f..0000000 --- a/test/CodeGen/X86/vec_shuffle-34.ll +++ /dev/null @@ -1,7 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=core2 | grep pshufb | count 2 - -define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > - ret <8 x i16> %tmp8 -} diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll deleted file mode 100644 index f5083b4..0000000 --- a/test/CodeGen/X86/vec_shuffle-35.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=yonah -stack-alignment=16 -o %t -; RUN: grep pextrw %t | count 12 -; RUN: grep pinsrw %t | count 13 -; RUN: grep rolw %t | count 13 -; RUN: not grep esp %t -; RUN: not grep ebp %t -; RUN: llc < %s -march=x86 -mcpu=core2 -stack-alignment=16 -o %t -; RUN: grep pshufb %t | count 3 - -define <16 x i8> @shuf1(<16 x i8> %T0) nounwind readnone { -entry: - %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> undef, <16 x i32> < i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 > - ret <16 x i8> %tmp8 -} - -define <16 x i8> @shuf2(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { -entry: - %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> < i32 undef, i32 undef, i32 3, i32 2, i32 17, i32 16, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 > - ret <16 x i8> %tmp8 -} diff --git 
a/test/CodeGen/X86/vec_shuffle-36.ll b/test/CodeGen/X86/vec_shuffle-36.ll deleted file mode 100644 index f1d0f93..0000000 --- a/test/CodeGen/X86/vec_shuffle-36.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse4.1 | FileCheck %s - -define <8 x i16> @shuf6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK: ret -entry: - %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 3, i32 2, i32 0, i32 2, i32 1, i32 5, i32 6 , i32 undef > - ret <8 x i16> %tmp9 -} - -define <8 x i16> @shuf7(<8 x i16> %t0) { -; CHECK: pshufd - %tmp10 = shufflevector <8 x i16> %t0, <8 x i16> undef, <8 x i32> < i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef > - ret <8 x i16> %tmp10 -} diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll deleted file mode 100644 index ed285f9..0000000 --- a/test/CodeGen/X86/vec_shuffle-37.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=core2 | FileCheck %s -; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0 - -define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp { -entry: -; CHECK: movaps ({{%rdi|%rcx}}), %[[XMM0:xmm[0-9]+]] -; CHECK: movaps %[[XMM0]], %[[XMM1:xmm[0-9]+]] -; CHECK-NEXT: movss %xmm{{[0-9]+}}, %[[XMM1]] -; CHECK-NEXT: shufps $36, %[[XMM1]], %[[XMM0]] - %0 = load <4 x i32>* undef, align 16 - %1 = load <4 x i32>* %a0, align 16 - %2 = shufflevector <4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> - ret <4 x i32> %2 -} - -define void @t01(double* %a0) nounwind ssp { -entry: -; CHECK_O0: movsd (%eax), %xmm0 -; CHECK_O0: unpcklpd %xmm0, %xmm0 - %tmp93 = load double* %a0, align 8 - %vecinit94 = insertelement <2 x double> undef, double %tmp93, i32 1 - store <2 x double> %vecinit94, <2 x double>* undef - ret void -} - -define void @t02(<8 x i32>* %source, <2 x i32>* 
%dest) nounwind noinline { -entry: -; CHECK: t02 -; CHECK: movaps -; CHECK: shufps -; CHECK: pshufd -; CHECK: movq -; CHECK: ret - %0 = bitcast <8 x i32>* %source to <4 x i32>* - %arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3 - %tmp2 = load <4 x i32>* %arrayidx, align 16 - %tmp3 = extractelement <4 x i32> %tmp2, i32 0 - %tmp5 = insertelement <2 x i32> <i32 undef, i32 0>, i32 %tmp3, i32 0 - %arrayidx7 = getelementptr inbounds <8 x i32>* %source, i64 1 - %1 = bitcast <8 x i32>* %arrayidx7 to <4 x i32>* - %tmp8 = load <4 x i32>* %1, align 16 - %tmp9 = extractelement <4 x i32> %tmp8, i32 1 - %tmp11 = insertelement <2 x i32> %tmp5, i32 %tmp9, i32 1 - store <2 x i32> %tmp11, <2 x i32>* %dest, align 8 - ret void -} diff --git a/test/CodeGen/X86/vec_shuffle-38.ll b/test/CodeGen/X86/vec_shuffle-38.ll deleted file mode 100644 index ec196df..0000000 --- a/test/CodeGen/X86/vec_shuffle-38.ll +++ /dev/null @@ -1,77 +0,0 @@ -; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s - -define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp { -; CHECK: unpcklpd - %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> zeroinitializer - ret <2 x double> %shuffle -} - -define <2 x double> @hd(<2 x double> %p) nounwind optsize ssp { -; CHECK: unpckhpd - %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> <i32 1, i32 1> - ret <2 x double> %shuffle -} - -define <2 x i64> @ldi(<2 x i64> %p) nounwind optsize ssp { -; CHECK: punpcklqdq - %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> zeroinitializer - ret <2 x i64> %shuffle -} - -define <2 x i64> @hdi(<2 x i64> %p) nounwind optsize ssp { -; CHECK: punpckhqdq - %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> <i32 1, i32 1> - ret <2 x i64> %shuffle -} - -; rdar://10050549 -%struct.Float2 = type { float, float } - -define <4 x float> @loadhpi(%struct.Float2* %vPtr, <4 x float> %vecin1) nounwind readonly ssp { -entry: -; CHECK: loadhpi -; CHECK-NOT: movq 
-; CHECK: movhps ( - %tmp1 = bitcast %struct.Float2* %vPtr to <1 x i64>* - %addptr7 = getelementptr inbounds <1 x i64>* %tmp1, i64 0 - %tmp2 = bitcast <1 x i64>* %addptr7 to float* - %tmp3 = load float* %tmp2, align 4 - %vec = insertelement <4 x float> undef, float %tmp3, i32 0 - %addptr.i12 = getelementptr inbounds float* %tmp2, i64 1 - %tmp4 = load float* %addptr.i12, align 4 - %vecin2 = insertelement <4 x float> %vec, float %tmp4, i32 1 - %shuffle = shufflevector <4 x float> %vecin1, <4 x float> %vecin2, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - ret <4 x float> %shuffle -} - -; rdar://10119696 -; CHECK: f -define <4 x float> @f(<4 x float> %x, double* nocapture %y) nounwind readonly ssp { -entry: - ; CHECK: movlps (%{{rdi|rdx}}), %xmm0 - %u110.i = load double* %y, align 1 - %tmp8.i = insertelement <2 x double> undef, double %u110.i, i32 0 - %tmp9.i = bitcast <2 x double> %tmp8.i to <4 x float> - %shuffle.i = shufflevector <4 x float> %x, <4 x float> %tmp9.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3> - ret <4 x float> %shuffle.i -} - -define <4 x float> @loadhpi2(%struct.Float2* nocapture %vHiCoefPtr_0, %struct.Float2* nocapture %vLoCoefPtr_0, i32 %s) nounwind readonly ssp { -entry: -; CHECK: loadhpi2 -; CHECK: movhps ( -; CHECK-NOT: movlhps - %0 = bitcast %struct.Float2* %vHiCoefPtr_0 to <1 x i64>* - %idx.ext = sext i32 %s to i64 - %add.ptr = getelementptr inbounds <1 x i64>* %0, i64 %idx.ext - %add.ptr.val = load <1 x i64>* %add.ptr, align 1 - %1 = bitcast <1 x i64> %add.ptr.val to <2 x float> - %shuffle.i = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> - %2 = bitcast %struct.Float2* %vLoCoefPtr_0 to <1 x i64>* - %add.ptr2 = getelementptr inbounds <1 x i64>* %2, i64 %idx.ext - %add.ptr2.val = load <1 x i64>* %add.ptr2, align 1 - %3 = bitcast <1 x i64> %add.ptr2.val to <2 x float> - %shuffle.i4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> - %shuffle1.i5 = 
shufflevector <4 x float> %shuffle.i, <4 x float> %shuffle.i4, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - ret <4 x float> %shuffle1.i5 -} diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll deleted file mode 100644 index 8fd9a5c..0000000 --- a/test/CodeGen/X86/vec_shuffle-39.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn | FileCheck %s -; rdar://10050222, rdar://10134392 - -define <4 x float> @t1(<4 x float> %a, <1 x i64>* nocapture %p) nounwind { -entry: -; CHECK-LABEL: t1: -; CHECK: movlps (%rdi), %xmm0 -; CHECK: ret - %p.val = load <1 x i64>* %p, align 1 - %0 = bitcast <1 x i64> %p.val to <2 x float> - %shuffle.i = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> - %shuffle1.i = shufflevector <4 x float> %a, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3> - ret <4 x float> %shuffle1.i -} - -define <4 x float> @t1a(<4 x float> %a, <1 x i64>* nocapture %p) nounwind { -entry: -; CHECK-LABEL: t1a: -; CHECK: movlps (%rdi), %xmm0 -; CHECK: ret - %0 = bitcast <1 x i64>* %p to double* - %1 = load double* %0 - %2 = insertelement <2 x double> undef, double %1, i32 0 - %3 = bitcast <2 x double> %2 to <4 x float> - %4 = shufflevector <4 x float> %a, <4 x float> %3, <4 x i32> <i32 4, i32 5, i32 2, i32 3> - ret <4 x float> %4 -} - -define void @t2(<1 x i64>* nocapture %p, <4 x float> %a) nounwind { -entry: -; CHECK-LABEL: t2: -; CHECK: movlps %xmm0, (%rdi) -; CHECK: ret - %cast.i = bitcast <4 x float> %a to <2 x i64> - %extract.i = extractelement <2 x i64> %cast.i, i32 0 - %0 = getelementptr inbounds <1 x i64>* %p, i64 0, i64 0 - store i64 %extract.i, i64* %0, align 8 - ret void -} - -define void @t2a(<1 x i64>* nocapture %p, <4 x float> %a) nounwind { -entry: -; CHECK-LABEL: t2a: -; CHECK: movlps %xmm0, (%rdi) -; CHECK: ret - %0 = bitcast <1 x i64>* %p to double* - %1 = bitcast <4 x float> %a to <2 x double> - %2 = extractelement <2 x double> 
%1, i32 0 - store double %2, double* %0 - ret void -} - -; rdar://10436044 -define <2 x double> @t3() nounwind readonly { -bb: -; CHECK-LABEL: t3: -; CHECK: movq (%rax), %xmm1 -; CHECK: punpcklqdq %xmm2, %xmm0 -; CHECK: movsd %xmm1, %xmm0 - %tmp0 = load i128* null, align 1 - %tmp1 = load <2 x i32>* undef, align 8 - %tmp2 = bitcast i128 %tmp0 to <16 x i8> - %tmp3 = bitcast <2 x i32> %tmp1 to i64 - %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0 - %tmp5 = bitcast <16 x i8> %tmp2 to <2 x double> - %tmp6 = bitcast <2 x i64> %tmp4 to <2 x double> - %tmp7 = shufflevector <2 x double> %tmp5, <2 x double> %tmp6, <2 x i32> <i32 2, i32 1> - ret <2 x double> %tmp7 -} - -; rdar://10450317 -define <2 x i64> @t4() nounwind readonly { -bb: -; CHECK-LABEL: t4: -; CHECK: movq (%rax), %xmm0 -; CHECK: punpcklqdq %{{xmm.}}, %[[XMM:xmm[0-9]]] -; CHECK: movsd %[[XMM]], %xmm0 - %tmp0 = load i128* null, align 1 - %tmp1 = load <2 x i32>* undef, align 8 - %tmp2 = bitcast i128 %tmp0 to <16 x i8> - %tmp3 = bitcast <2 x i32> %tmp1 to i64 - %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0 - %tmp5 = bitcast <16 x i8> %tmp2 to <2 x i64> - %tmp6 = shufflevector <2 x i64> %tmp4, <2 x i64> %tmp5, <2 x i32> <i32 2, i32 1> - ret <2 x i64> %tmp6 -} diff --git a/test/CodeGen/X86/vec_shuffle-40.ll b/test/CodeGen/X86/vec_shuffle-40.ll deleted file mode 100644 index 75b45e3..0000000 --- a/test/CodeGen/X86/vec_shuffle-40.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s - -define void @shuffle_v16i16(<16 x i16>* %a) { -; CHECK-LABEL: shuffle_v16i16: -; CHECK: vpshufb {{.*}}%ymm -; CHECK-NOT: vpshufb {{.*}}%xmm -entry: - %0 = load <16 x i16>* %a, align 32 - %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15> - store <16 x i16> %shuffle, <16 x i16>* %a, align 32 - ret void -} - -define void 
@shuffle_v16i16_lanecrossing(<16 x i16>* %a) { -; CHECK-LABEL: shuffle_v16i16_lanecrossing: -; CHECK-NOT: vpshufb {{.*}}%ymm -entry: - %0 = load <16 x i16>* %a, align 32 - %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 13, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15> - store <16 x i16> %shuffle, <16 x i16>* %a, align 32 - ret void -} diff --git a/test/CodeGen/X86/vec_shuffle-41.ll b/test/CodeGen/X86/vec_shuffle-41.ll deleted file mode 100644 index 28fdd2f..0000000 --- a/test/CodeGen/X86/vec_shuffle-41.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s - -; Use buildFromShuffleMostly which allows this to be generated as two 128-bit -; shuffles and an insert. - -; This is the (somewhat questionable) LLVM IR that is generated for: -; x8.s0123456 = x8.s1234567; // x8 is a <8 x float> type -; x8.s7 = f; // f is float - - -define <8 x float> @test1(<8 x float> %a, float %b) { -; CHECK-LABEL: test1: -; CHECK: vinsertps -; CHECK-NOT: vinsertps -entry: - %shift = shufflevector <8 x float> %a, <8 x float> undef, <7 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - %extend = shufflevector <7 x float> %shift, <7 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef> - %insert = insertelement <8 x float> %extend, float %b, i32 7 - - ret <8 x float> %insert -} diff --git a/test/CodeGen/X86/vec_shuffle.ll b/test/CodeGen/X86/vec_shuffle.ll deleted file mode 100644 index 6599598..0000000 --- a/test/CodeGen/X86/vec_shuffle.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: llc < %s -mtriple=i686-linux -mcpu=core2 | FileCheck %s - -; CHECK: test_v4sf -; CHECK: movq 8(%esp) -; CHECK: pshufd $80 -define void @test_v4sf(<4 x float>* %P, float %X, float %Y) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1] - %tmp2 = insertelement <4 x float> %tmp, 
float %X, i32 1 ; <<4 x float>> [#uses=1] - %tmp4 = insertelement <4 x float> %tmp2, float %Y, i32 2 ; <<4 x float>> [#uses=1] - %tmp6 = insertelement <4 x float> %tmp4, float %Y, i32 3 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp6, <4 x float>* %P - ret void -} - -; CHECK: test_v2sd -; CHECK: movups 8(%esp) -; CHECK: movaps -define void @test_v2sd(<2 x double>* %P, double %X, double %Y) nounwind { - %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0 ; <<2 x double>> [#uses=1] - %tmp2 = insertelement <2 x double> %tmp, double %Y, i32 1 ; <<2 x double>> [#uses=1] - store <2 x double> %tmp2, <2 x double>* %P - ret void -} - -; CHECK: test_v8i16 -; CHECK: pshufhw $-58 -; CHECK: movdqa -define void @test_v8i16(<2 x i64>* %res, <2 x i64>* %A) nounwind { - %tmp = load <2 x i64>* %A ; <<2 x i64>> [#uses=1] - %tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16> ; <<8 x i16>> [#uses=8] - %tmp.upgrd.2 = extractelement <8 x i16> %tmp.upgrd.1, i32 0 ; <i16> [#uses=1] - %tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1 ; <i16> [#uses=1] - %tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2 ; <i16> [#uses=1] - %tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3 ; <i16> [#uses=1] - %tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 6 ; <i16> [#uses=1] - %tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5 ; <i16> [#uses=1] - %tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 4 ; <i16> [#uses=1] - %tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7 ; <i16> [#uses=1] - %tmp8 = insertelement <8 x i16> undef, i16 %tmp.upgrd.2, i32 0 ; <<8 x i16>> [#uses=1] - %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1 ; <<8 x i16>> [#uses=1] - %tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp2, i32 2 ; <<8 x i16>> [#uses=1] - %tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3 ; <<8 x i16>> [#uses=1] - %tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp4, i32 4 ; <<8 x i16>> [#uses=1] - %tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5 ; <<8 x i16>> 
[#uses=1] - %tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp6, i32 6 ; <<8 x i16>> [#uses=1] - %tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7 ; <<8 x i16>> [#uses=1] - %tmp15.upgrd.3 = bitcast <8 x i16> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1] - store <2 x i64> %tmp15.upgrd.3, <2 x i64>* %res - ret void -} diff --git a/test/CodeGen/X86/vec_splat-2.ll b/test/CodeGen/X86/vec_splat-2.ll deleted file mode 100644 index 9d82f97..0000000 --- a/test/CodeGen/X86/vec_splat-2.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s - -define void @test(<2 x i64>* %P, i8 %x) nounwind { - %tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0 ; <<16 x i8>> [#uses=1] - %tmp36 = insertelement <16 x i8> %tmp, i8 %x, i32 1 ; <<16 x i8>> [#uses=1] - %tmp38 = insertelement <16 x i8> %tmp36, i8 %x, i32 2 ; <<16 x i8>> [#uses=1] - %tmp40 = insertelement <16 x i8> %tmp38, i8 %x, i32 3 ; <<16 x i8>> [#uses=1] - %tmp42 = insertelement <16 x i8> %tmp40, i8 %x, i32 4 ; <<16 x i8>> [#uses=1] - %tmp44 = insertelement <16 x i8> %tmp42, i8 %x, i32 5 ; <<16 x i8>> [#uses=1] - %tmp46 = insertelement <16 x i8> %tmp44, i8 %x, i32 6 ; <<16 x i8>> [#uses=1] - %tmp48 = insertelement <16 x i8> %tmp46, i8 %x, i32 7 ; <<16 x i8>> [#uses=1] - %tmp50 = insertelement <16 x i8> %tmp48, i8 %x, i32 8 ; <<16 x i8>> [#uses=1] - %tmp52 = insertelement <16 x i8> %tmp50, i8 %x, i32 9 ; <<16 x i8>> [#uses=1] - %tmp54 = insertelement <16 x i8> %tmp52, i8 %x, i32 10 ; <<16 x i8>> [#uses=1] - %tmp56 = insertelement <16 x i8> %tmp54, i8 %x, i32 11 ; <<16 x i8>> [#uses=1] - %tmp58 = insertelement <16 x i8> %tmp56, i8 %x, i32 12 ; <<16 x i8>> [#uses=1] - %tmp60 = insertelement <16 x i8> %tmp58, i8 %x, i32 13 ; <<16 x i8>> [#uses=1] - %tmp62 = insertelement <16 x i8> %tmp60, i8 %x, i32 14 ; <<16 x i8>> [#uses=1] - %tmp64 = insertelement <16 x i8> %tmp62, i8 %x, i32 15 ; <<16 x i8>> [#uses=1] - %tmp68 = load <2 x i64>* %P ; <<2 x i64>> [#uses=1] - %tmp71 = 
bitcast <2 x i64> %tmp68 to <16 x i8> ; <<16 x i8>> [#uses=1] - %tmp73 = add <16 x i8> %tmp71, %tmp64 ; <<16 x i8>> [#uses=1] - %tmp73.upgrd.1 = bitcast <16 x i8> %tmp73 to <2 x i64> ; <<2 x i64>> [#uses=1] - store <2 x i64> %tmp73.upgrd.1, <2 x i64>* %P - ret void - -; CHECK-LABEL: test: -; CHECK-NOT: pshufd -; CHECK: punpcklbw -; CHECK: punpcklbw -; CHECK: pshufd $0 -; CHECK-NOT: pshufd -} diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll deleted file mode 100644 index 754cbf4..0000000 --- a/test/CodeGen/X86/vec_splat-3.ll +++ /dev/null @@ -1,230 +0,0 @@ -; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s - -; Splat test for v8i16 -define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_0: -; CHECK: pshuflw $0 -} - -define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_1: -; CHECK: pshuflw $5 -} - -define <8 x i16> @shuf_8i16_2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_2: -; CHECK: punpcklwd -; CHECK-NEXT: pshufd $-86 -} - -define <8 x i16> @shuf_8i16_3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_3: -; CHECK: pshuflw $15 -} - -define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind 
readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_4: -; CHECK: movhlps -} - -define <8 x i16> @shuf_8i16_5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_5: -; CHECK: punpckhwd -; CHECK-NEXT: pshufd $85 -} - -define <8 x i16> @shuf_8i16_6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 6, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_6: -; CHECK: punpckhwd -; CHECK-NEXT: pshufd $-86 -} - -define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_7: -; CHECK: punpckhwd -; CHECK-NEXT: pshufd $-1 -} - -; Splat test for v16i8 -define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_8: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $0 -} - -define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > - ret <16 x i8> %tmp6 - -; 
CHECK-LABEL: shuf_16i8_9: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $85 -} - -define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_10: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-86 -} - -define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 3, i32 undef, i32 undef, i32 3, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_11: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-1 -} - - -define <16 x i8> @shuf_16i8_12(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_12: -; CHECK: pshufd $5 -} - -define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_13: -; CHECK: punpcklbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $85 -} - -define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 6, i32 undef, i32 undef, i32 6, i32 undef, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> - ret <16 x i8> 
%tmp6 - -; CHECK-LABEL: shuf_16i8_14: -; CHECK: punpcklbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-86 -} - -define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_15: -; CHECK: punpcklbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-1 -} - -define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 8, i32 undef, i32 undef, i32 8, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_16: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $0 -} - -define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 9, i32 undef, i32 undef, i32 9, i32 undef, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_17: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $85 -} - -define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 10, i32 undef, i32 undef, i32 10, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_18: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-86 -} - -define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 11, i32 undef, i32 undef, i32 11, i32 undef, i32 11, i32 11, i32 
11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_19: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-1 -} - -define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 12, i32 undef, i32 undef, i32 12, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_20: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $0 -} - -define <16 x i8> @shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 13, i32 undef, i32 undef, i32 13, i32 undef, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_21: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $85 -} - -define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 14, i32 undef, i32 undef, i32 14, i32 undef, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_22: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-86 -} - -define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 15, i32 undef, i32 undef, i32 15, i32 undef, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_23: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-1 -} diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll deleted file mode 100644 index 28f2a90..0000000 --- 
a/test/CodeGen/X86/vec_splat.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s -check-prefix=SSE2 -; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3 -; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=AVX - -define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1] - %tmp2 = insertelement <4 x float> %tmp, float %X, i32 1 ; <<4 x float>> [#uses=1] - %tmp4 = insertelement <4 x float> %tmp2, float %X, i32 2 ; <<4 x float>> [#uses=1] - %tmp6 = insertelement <4 x float> %tmp4, float %X, i32 3 ; <<4 x float>> [#uses=1] - %tmp8 = load <4 x float>* %Q ; <<4 x float>> [#uses=1] - %tmp10 = fmul <4 x float> %tmp8, %tmp6 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp10, <4 x float>* %P - ret void - -; SSE2-LABEL: test_v4sf: -; SSE2: pshufd $0 - -; SSE3-LABEL: test_v4sf: -; SSE3: pshufd $0 -} - -define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind { - %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0 ; <<2 x double>> [#uses=1] - %tmp2 = insertelement <2 x double> %tmp, double %X, i32 1 ; <<2 x double>> [#uses=1] - %tmp4 = load <2 x double>* %Q ; <<2 x double>> [#uses=1] - %tmp6 = fmul <2 x double> %tmp4, %tmp2 ; <<2 x double>> [#uses=1] - store <2 x double> %tmp6, <2 x double>* %P - ret void - -; SSE2-LABEL: test_v2sd: -; SSE2: shufpd $0 - -; SSE3-LABEL: test_v2sd: -; SSE3: movddup -} - -; Fold extract of a load into the load's address computation. This avoids spilling to the stack. 
-define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind { - %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i - %2 = load <4 x float>* %1, align 16 - %3 = trunc i64 %j to i32 - %4 = extractelement <4 x float> %2, i32 %3 - %5 = insertelement <4 x float> undef, float %4, i32 0 - %6 = insertelement <4 x float> %5, float %4, i32 1 - %7 = insertelement <4 x float> %6, float %4, i32 2 - %8 = insertelement <4 x float> %7, float %4, i32 3 - ret <4 x float> %8 - -; AVX-LABEL: load_extract_splat -; AVX-NOT: rsp -; AVX: vbroadcastss -} - -; Fold extract of a load into the load's address computation. This avoids spilling to the stack. -define <4 x float> @load_extract_splat1(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind { - %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i - %2 = load <4 x float>* %1, align 16 - %3 = extractelement <4 x float> %2, i64 %j - %4 = insertelement <4 x float> undef, float %3, i32 0 - %5 = insertelement <4 x float> %4, float %3, i32 1 - %6 = insertelement <4 x float> %5, float %3, i32 2 - %7 = insertelement <4 x float> %6, float %3, i32 3 - ret <4 x float> %7 - -; AVX-LABEL: load_extract_splat1 -; AVX-NOT: movs -; AVX: vbroadcastss -} diff --git a/test/CodeGen/X86/vec_trunc_sext.ll b/test/CodeGen/X86/vec_trunc_sext.ll new file mode 100644 index 0000000..3c446bb --- /dev/null +++ b/test/CodeGen/X86/vec_trunc_sext.ll @@ -0,0 +1,30 @@ +; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='-sse4.1' -o - | FileCheck %s -check-prefix=NO_SSE_41 +; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='+sse4.1' -o - | FileCheck %s -check-prefix=SSE_41 + +; PR20472 ( http://llvm.org/bugs/show_bug.cgi?id=20472 ) +; When sexting a trunc'd vector value, we can't eliminate the zext. +; If we don't have SSE4.1, use punpck. +; If we have SSE4.1, use pmovzx because it combines the load op. 
+; There may be a better way to do this using pshufb + pmovsx, +; but that is beyond our current codegen capabilities. + +define <4 x i32> @trunc_sext(<4 x i16>* %in) { + %load = load <4 x i16>* %in + %trunc = trunc <4 x i16> %load to <4 x i8> + %sext = sext <4 x i8> %trunc to <4 x i32> + ret <4 x i32> %sext + +; NO_SSE_41-LABEL: trunc_sext: +; NO_SSE_41: movq (%rdi), %xmm0 +; NO_SSE_41-NEXT: punpcklwd %xmm0, %xmm0 +; NO_SSE_41-NEXT: pslld $24, %xmm0 +; NO_SSE_41-NEXT: psrad $24, %xmm0 +; NO_SSE_41-NEXT: retq + +; SSE_41-LABEL: trunc_sext: +; SSE_41: pmovzxwd (%rdi), %xmm0 +; SSE_41-NEXT: pslld $24, %xmm0 +; SSE_41-NEXT: psrad $24, %xmm0 +; SSE_41-NEXT: retq +} + diff --git a/test/CodeGen/X86/vec_uint_to_fp.ll b/test/CodeGen/X86/vec_uint_to_fp.ll index ee20f1f..46cfcd9 100644 --- a/test/CodeGen/X86/vec_uint_to_fp.ll +++ b/test/CodeGen/X86/vec_uint_to_fp.ll @@ -1,11 +1,167 @@ -; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck --check-prefix=CHECK --check-prefix=SSE --check-prefix=CST %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck --check-prefix=CHECK --check-prefix=SSE41 --check-prefix=CST %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck --check-prefix=CHECK --check-prefix=AVX --check-prefix=CST %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx2 | FileCheck --check-prefix=CHECK --check-prefix=AVX2 %s + +; Check that the constant used in the vectors are the right ones. 
+; SSE: [[MASKCSTADDR:LCPI0_[0-9]+]]: +; SSE-NEXT: .long 65535 ## 0xffff +; SSE-NEXT: .long 65535 ## 0xffff +; SSE-NEXT: .long 65535 ## 0xffff +; SSE-NEXT: .long 65535 ## 0xffff + +; CST: [[LOWCSTADDR:LCPI0_[0-9]+]]: +; CST-NEXT: .long 1258291200 ## 0x4b000000 +; CST-NEXT: .long 1258291200 ## 0x4b000000 +; CST-NEXT: .long 1258291200 ## 0x4b000000 +; CST-NEXT: .long 1258291200 ## 0x4b000000 + +; CST: [[HIGHCSTADDR:LCPI0_[0-9]+]]: +; CST-NEXT: .long 1392508928 ## 0x53000000 +; CST-NEXT: .long 1392508928 ## 0x53000000 +; CST-NEXT: .long 1392508928 ## 0x53000000 +; CST-NEXT: .long 1392508928 ## 0x53000000 + +; CST: [[MAGICCSTADDR:LCPI0_[0-9]+]]: +; CST-NEXT: .long 3539992704 ## float -5.497642e+11 +; CST-NEXT: .long 3539992704 ## float -5.497642e+11 +; CST-NEXT: .long 3539992704 ## float -5.497642e+11 +; CST-NEXT: .long 3539992704 ## float -5.497642e+11 + +; AVX2: [[LOWCSTADDR:LCPI0_[0-9]+]]: +; AVX2-NEXT: .long 1258291200 ## 0x4b000000 + +; AVX2: [[HIGHCSTADDR:LCPI0_[0-9]+]]: +; AVX2-NEXT: .long 1392508928 ## 0x53000000 + +; AVX2: [[MAGICCSTADDR:LCPI0_[0-9]+]]: +; AVX2-NEXT: .long 3539992704 ## float -5.49764202E+11 -; Test that we are not lowering uinttofp to scalars define <4 x float> @test1(<4 x i32> %A) nounwind { ; CHECK-LABEL: test1: -; CHECK-NOT: cvtsd2ss -; CHECK: ret +; +; SSE: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]] +; SSE-NEXT: pand %xmm0, [[MASK]] +; After this instruction, MASK will have the value of the low parts +; of the vector. +; SSE-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0 +; SSE-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0 +; SSE-NEXT: addps [[MASK]], %xmm0 +; SSE-NEXT: retq +; +; Currently we commute the arguments of the first blend, but this could be +; improved to match the lowering of the second blend. 
+; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]] +; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0 +; SSE41-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0 +; SSE41-NEXT: addps [[LOWVEC]], %xmm0 +; SSE41-NEXT: retq +; +; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]] +; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] +; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] +; AVX-NEXT: vaddps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]] +; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 +; AVX-NEXT: retq +; +; The lowering for AVX2 is a bit messy, because we select broadcast +; instructions, instead of folding the constant loads. +; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]] +; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]] +; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] +; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]] +; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] +; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]] +; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]] +; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 +; AVX2-NEXT: retq %C = uitofp <4 x i32> %A to <4 x float> ret <4 x float> %C } +; Match the AVX2 constants used in the next function +; AVX2: [[LOWCSTADDR:LCPI1_[0-9]+]]: +; AVX2-NEXT: .long 1258291200 ## 0x4b000000 + +; AVX2: [[HIGHCSTADDR:LCPI1_[0-9]+]]: +; AVX2-NEXT: .long 1392508928 ## 0x53000000 + +; AVX2: [[MAGICCSTADDR:LCPI1_[0-9]+]]: +; AVX2-NEXT: .long 3539992704 ## float -5.49764202E+11 + +define <8 x float> @test2(<8 x i32> %A) nounwind { +; CHECK-LABEL: test2: +; Legalization will break the thing is 2 x <4 x i32> on anthing prior AVX. +; The constant used for in the vector instruction are shared between the +; two sequences of instructions. 
+; +; SSE: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] +; SSE-NEXT: pand %[[MASK]], [[VECLOW]] +; SSE-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] +; SSE-NEXT: por %[[LOWCST]], [[VECLOW]] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] +; SSE-NEXT: por %[[HIGHCST]], %xmm0 +; SSE-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11] +; SSE-NEXT: addps %[[MAGICCST]], %xmm0 +; SSE-NEXT: addps [[VECLOW]], %xmm0 +; MASK is the low vector of the second part after this point. +; SSE-NEXT: pand %xmm1, %[[MASK]] +; SSE-NEXT: por %[[LOWCST]], %[[MASK]] +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %[[HIGHCST]], %xmm1 +; SSE-NEXT: addps %[[MAGICCST]], %xmm1 +; SSE-NEXT: addps %[[MASK]], %xmm1 +; SSE-NEXT: retq +; +; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] +; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0 +; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11] +; SSE41-NEXT: addps %[[MAGICCST]], %xmm0 +; SSE41-NEXT: addps [[VECLOW]], %xmm0 +; LOWCST is the low vector of the second part after this point. +; The operands of the blend are inverted because we reuse xmm1 +; in the next shift. 
+; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1 +; SSE41-NEXT: addps %[[MAGICCST]], %xmm1 +; SSE41-NEXT: addps %[[LOWCST]], %xmm1 +; SSE41-NEXT: retq +; +; Test that we are not lowering uinttofp to scalars +; AVX-NOT: cvtsd2ss +; AVX: retq +; +; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]] +; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]] +; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]] +; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]] +; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]] +; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]] +; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]] +; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0 +; AVX2-NEXT: retq + %C = uitofp <8 x i32> %A to <8 x float> + ret <8 x float> %C +} + +define <4 x double> @test3(<4 x i32> %arg) { +; CHECK-LABEL: test3: +; This test used to crash because we were custom lowering it as if it was +; a conversion between <4 x i32> and <4 x float>. +; AVX: vcvtdq2pd +; AVX2: vcvtdq2pd +; CHECK: retq + %tmp = uitofp <4 x i32> %arg to <4 x double> + ret <4 x double> %tmp +} diff --git a/test/CodeGen/X86/vec_unsafe-fp-math.ll b/test/CodeGen/X86/vec_unsafe-fp-math.ll new file mode 100644 index 0000000..827d418 --- /dev/null +++ b/test/CodeGen/X86/vec_unsafe-fp-math.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -enable-unsafe-fp-math -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s + +; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math. + +; Subtracting zero is free. +define <4 x float> @vec_fsub_zero(<4 x float> %x) { +; CHECK-LABEL: vec_fsub_zero: +; CHECK-NOT: subps +; CHECK-NOT: xorps +; CHECK: retq + %sub = fsub <4 x float> %x, zeroinitializer + ret <4 x float> %sub +} + +; Negating doesn't require subtraction. 
+define <4 x float> @vec_fneg(<4 x float> %x) { +; CHECK-LABEL: vec_fneg: +; CHECK: xorps {{.*}}LCP{{.*}}, %xmm0 +; CHECK-NOT: subps +; CHECK-NEXT: retq + %sub = fsub <4 x float> zeroinitializer, %x + ret <4 x float> %sub +} diff --git a/test/CodeGen/X86/vec_zext.ll b/test/CodeGen/X86/vec_zext.ll deleted file mode 100644 index 615a50b..0000000 --- a/test/CodeGen/X86/vec_zext.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc < %s -march=x86-64 -; PR 9267 - -define<4 x i32> @func_16_32() { - %F = load <4 x i16>* undef - %G = zext <4 x i16> %F to <4 x i32> - %H = load <4 x i16>* undef - %Y = zext <4 x i16> %H to <4 x i32> - %T = add <4 x i32> %Y, %G - store <4 x i32>%T , <4 x i32>* undef - ret <4 x i32> %T -} - -define<4 x i64> @func_16_64() { - %F = load <4 x i16>* undef - %G = zext <4 x i16> %F to <4 x i64> - %H = load <4 x i16>* undef - %Y = zext <4 x i16> %H to <4 x i64> - %T = xor <4 x i64> %Y, %G - store <4 x i64>%T , <4 x i64>* undef - ret <4 x i64> %T -} - -define<4 x i64> @func_32_64() { - %F = load <4 x i32>* undef - %G = zext <4 x i32> %F to <4 x i64> - %H = load <4 x i32>* undef - %Y = zext <4 x i32> %H to <4 x i64> - %T = or <4 x i64> %Y, %G - ret <4 x i64> %T -} - -define<4 x i16> @func_8_16() { - %F = load <4 x i8>* undef - %G = zext <4 x i8> %F to <4 x i16> - %H = load <4 x i8>* undef - %Y = zext <4 x i8> %H to <4 x i16> - %T = add <4 x i16> %Y, %G - ret <4 x i16> %T -} - -define<4 x i32> @func_8_32() { - %F = load <4 x i8>* undef - %G = zext <4 x i8> %F to <4 x i32> - %H = load <4 x i8>* undef - %Y = zext <4 x i8> %H to <4 x i32> - %T = sub <4 x i32> %Y, %G - ret <4 x i32> %T -} - -define<4 x i64> @func_8_64() { - %F = load <4 x i8>* undef - %G = zext <4 x i8> %F to <4 x i64> - %H = load <4 x i8>* undef - %Y = zext <4 x i8> %H to <4 x i64> - %T = add <4 x i64> %Y, %G - ret <4 x i64> %T -} - -define<4 x i32> @const_16_32() { - %G = zext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i32> - ret <4 x i32> %G -} - -define<4 x i64> @const_16_64() { - %G = zext 
<4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i64> - ret <4 x i64> %G -} - diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll new file mode 100644 index 0000000..0a3ed7e --- /dev/null +++ b/test/CodeGen/X86/vector-blend.ll @@ -0,0 +1,708 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 + +; AVX128 tests: + +define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { +; SSE2-LABEL: vsel_float: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_float: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_float: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_float: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2 + ret <4 x float> %vsel +} + +define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) { +; SSE-LABEL: vsel_float2: +; SSE: # BB#0: # %entry +; SSE-NEXT: movss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 
+; SSE-NEXT: retq +; +; AVX-LABEL: vsel_float2: +; AVX: # BB#0: # %entry +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2 + ret <4 x float> %vsel +} + +define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { +; SSE2-LABEL: vsel_4xi8: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_4xi8: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_4xi8: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: vsel_4xi8: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_4xi8: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2 + ret <4 x i8> %vsel +} + +define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) { +; SSE2-LABEL: vsel_4xi16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_4xi16: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_4xi16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: vsel_4xi16: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: 
retq +; +; AVX2-LABEL: vsel_4xi16: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2 + ret <4 x i16> %vsel +} + +define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) { +; SSE2-LABEL: vsel_i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_i32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: vsel_i32: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i32: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2 + ret <4 x i32> %vsel +} + +define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) { +; SSE-LABEL: vsel_double: +; SSE: # BB#0: # %entry +; SSE-NEXT: movsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: vsel_double: +; AVX: # BB#0: # %entry +; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +entry: + %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2 + ret <2 x double> %vsel +} + +define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) { +; SSE-LABEL: vsel_i64: +; SSE: # BB#0: # %entry +; SSE-NEXT: movsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: vsel_i64: +; AVX: # BB#0: # %entry +; AVX-NEXT: vmovsd %xmm0, 
%xmm1, %xmm0 +; AVX-NEXT: retq +entry: + %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2 + ret <2 x i64> %vsel +} + +define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) { +; SSE2-LABEL: vsel_8xi16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_8xi16: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_8xi16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_8xi16: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: retq +entry: + %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2 + ret <8 x i16> %vsel +} + +define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { +; SSE2-LABEL: vsel_i8: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_i8: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_i8: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_i8: +; AVX: # BB#0: # %entry +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +entry: + %vsel = select <16 x i1> <i1 true, i1 
false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2 + ret <16 x i8> %vsel +} + + +; AVX256 tests: + +define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) { +; SSE-LABEL: vsel_float8: +; SSE: # BB#0: # %entry +; SSE-NEXT: movss %xmm0, %xmm2 +; SSE-NEXT: movss %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: vsel_float8: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX-NEXT: retq +entry: + %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2 + ret <8 x float> %vsel +} + +define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) { +; SSE-LABEL: vsel_i328: +; SSE: # BB#0: # %entry +; SSE-NEXT: movss %xmm0, %xmm2 +; SSE-NEXT: movss %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: vsel_i328: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i328: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: retq +entry: + %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2 + ret <8 x i32> %vsel +} + +define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) { +; SSE2-LABEL: vsel_double8: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm0, %xmm4 +; SSE2-NEXT: movsd %xmm2, %xmm6 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm5, %xmm1 +; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: movaps %xmm7, %xmm3 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_double8: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm0, %xmm4 +; 
SSSE3-NEXT: movsd %xmm2, %xmm6 +; SSSE3-NEXT: movaps %xmm4, %xmm0 +; SSSE3-NEXT: movaps %xmm5, %xmm1 +; SSSE3-NEXT: movaps %xmm6, %xmm2 +; SSSE3-NEXT: movaps %xmm7, %xmm3 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_double8: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1] +; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1] +; SSE41-NEXT: movaps %xmm5, %xmm1 +; SSE41-NEXT: movaps %xmm7, %xmm3 +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_double8: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3] +; AVX-NEXT: retq +entry: + %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2 + ret <8 x double> %vsel +} + +define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) { +; SSE2-LABEL: vsel_i648: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm0, %xmm4 +; SSE2-NEXT: movsd %xmm2, %xmm6 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm5, %xmm1 +; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: movaps %xmm7, %xmm3 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_i648: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm0, %xmm4 +; SSSE3-NEXT: movsd %xmm2, %xmm6 +; SSSE3-NEXT: movaps %xmm4, %xmm0 +; SSSE3-NEXT: movaps %xmm5, %xmm1 +; SSSE3-NEXT: movaps %xmm6, %xmm2 +; SSSE3-NEXT: movaps %xmm7, %xmm3 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_i648: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: movaps %xmm5, %xmm1 +; SSE41-NEXT: movaps %xmm7, %xmm3 +; SSE41-NEXT: retq +; +; AVX1-LABEL: vsel_i648: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i648: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: retq +entry: + %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2 + ret <8 x i64> %vsel +} + +define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) { +; SSE-LABEL: vsel_double4: +; SSE: # BB#0: # %entry +; SSE-NEXT: movsd %xmm0, %xmm2 +; SSE-NEXT: movsd %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: vsel_double4: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2 + ret <4 x double> %vsel +} + +define <2 x double> @testa(<2 x double> %x, <2 x double> %y) { +; SSE2-LABEL: testa: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: cmplepd %xmm0, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm0 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: testa: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movapd %xmm1, %xmm2 +; SSSE3-NEXT: cmplepd %xmm0, %xmm2 +; SSSE3-NEXT: andpd %xmm2, %xmm0 +; SSSE3-NEXT: andnpd %xmm1, %xmm2 +; SSSE3-NEXT: orpd %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testa: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmplepd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testa: +; AVX: # BB#0: # %entry +; AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +entry: + %max_is_x = fcmp oge <2 x double> %x, %y + %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y + ret <2 x double> %max +} + +define <2 x double> @testb(<2 x double> %x, <2 
x double> %y) { +; SSE2-LABEL: testb: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: cmpnlepd %xmm0, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm0 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: testb: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movapd %xmm1, %xmm2 +; SSSE3-NEXT: cmpnlepd %xmm0, %xmm2 +; SSSE3-NEXT: andpd %xmm2, %xmm0 +; SSSE3-NEXT: andnpd %xmm1, %xmm2 +; SSSE3-NEXT: orpd %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testb: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpnlepd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testb: +; AVX: # BB#0: # %entry +; AVX-NEXT: vcmpnlepd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +entry: + %min_is_x = fcmp ult <2 x double> %x, %y + %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y + ret <2 x double> %min +} + +; If we can figure out a blend has a constant mask, we should emit the +; blend instruction with an immediate mask +define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) { +; SSE-LABEL: constant_blendvpd_avx: +; SSE: # BB#0: # %entry +; SSE-NEXT: movsd %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: constant_blendvpd_avx: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3] +; AVX-NEXT: retq +entry: + %select = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab + ret <4 x double> %select +} + +define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) { +; SSE2-LABEL: constant_blendvps_avx: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0] +; SSE2-NEXT: andps %xmm4, %xmm2 +; SSE2-NEXT: 
movaps {{.*#+}} xmm5 = [0,0,0,4294967295] +; SSE2-NEXT: andps %xmm5, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm4, %xmm3 +; SSE2-NEXT: andps %xmm5, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: constant_blendvps_avx: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0] +; SSSE3-NEXT: andps %xmm4, %xmm2 +; SSSE3-NEXT: movaps {{.*#+}} xmm5 = [0,0,0,4294967295] +; SSSE3-NEXT: andps %xmm5, %xmm0 +; SSSE3-NEXT: orps %xmm2, %xmm0 +; SSSE3-NEXT: andps %xmm4, %xmm3 +; SSSE3-NEXT: andps %xmm5, %xmm1 +; SSSE3-NEXT: orps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: constant_blendvps_avx: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_blendvps_avx: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX-NEXT: retq +entry: + %select = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd + ret <8 x float> %select +} + +define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) { +; SSE2-LABEL: constant_pblendvb_avx2: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; SSE2-NEXT: andps %xmm4, %xmm2 +; SSE2-NEXT: movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; SSE2-NEXT: andps %xmm5, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm4, %xmm3 +; SSE2-NEXT: andps %xmm5, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: constant_pblendvb_avx2: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; SSSE3-NEXT: andps %xmm4, %xmm2 +; SSSE3-NEXT: movaps {{.*#+}} xmm5 = 
[0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; SSSE3-NEXT: andps %xmm5, %xmm0 +; SSSE3-NEXT: orps %xmm2, %xmm0 +; SSSE3-NEXT: andps %xmm4, %xmm3 +; SSSE3-NEXT: andps %xmm5, %xmm1 +; SSSE3-NEXT: orps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: constant_pblendvb_avx2: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: pblendvb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_pblendvb_avx2: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_pblendvb_avx2: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +entry: + %select = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd + ret <32 x i8> %select +} + +declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) +declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) + +;; 4 tests for shufflevectors that optimize to blend + immediate +define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: blend_shufflevector_4xfloat: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: blend_shufflevector_4xfloat: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: blend_shufflevector_4xfloat: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: blend_shufflevector_4xfloat: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX-NEXT: retq +entry: + %select = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x float> %select +} + +define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) { +; SSE2-LABEL: blend_shufflevector_8xfloat: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movss %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: blend_shufflevector_8xfloat: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movss %xmm0, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: movaps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: blend_shufflevector_8xfloat: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: blend_shufflevector_8xfloat: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] +; AVX-NEXT: retq +entry: + %select = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15> + ret <8 x float> %select +} + +define <4 x double> 
@blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) { +; SSE2-LABEL: blend_shufflevector_4xdouble: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: blend_shufflevector_4xdouble: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm0, %xmm2 +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: blend_shufflevector_4xdouble: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: blend_shufflevector_4xdouble: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX-NEXT: retq +entry: + %select = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x double> %select +} + +define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) { +; SSE2-LABEL: blend_shufflevector_4xi64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: blend_shufflevector_4xi64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm2, %xmm0 +; SSSE3-NEXT: movaps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: blend_shufflevector_4xi64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: blend_shufflevector_4xi64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: blend_shufflevector_4xi64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq +entry: + %select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + ret <4 x i64> %select +} diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll index b6d43e9..4b269dc 100644 --- 
a/test/CodeGen/X86/vector-idiv.ll +++ b/test/CodeGen/X86/vector-idiv.ll @@ -1,221 +1,1255 @@ -; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=SSE41 -; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE -; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX +; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=SSE41 +; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s --check-prefix=SSE +; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX -define <4 x i32> @test1(<4 x i32> %a) { - %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7> - ret <4 x i32> %div +target triple = "x86_64-unknown-unknown" +define <4 x i32> @test1(<4 x i32> %a) { ; SSE41-LABEL: test1: -; SSE41: pmuludq -; SSE41: pshufd $49 -; SSE41: pmuludq -; SSE41: shufps $-35 -; SSE41: psubd -; SSE41: psrld $1 -; SSE41: padd -; SSE41: psrld $2 - +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pmuludq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm1, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE41-NEXT: psubd %xmm2, %xmm0 +; SSE41-NEXT: psrld $1, %xmm0 +; SSE41-NEXT: paddd %xmm2, %xmm0 +; SSE41-NEXT: psrld $2, %xmm0 +; SSE41-NEXT: retq +; +; SSE-LABEL: test1: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: psubd %xmm2, %xmm0 +; 
SSE-NEXT: psrld $1, %xmm0 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: psrld $2, %xmm0 +; SSE-NEXT: retq +; ; AVX-LABEL: test1: -; AVX: vpmuludq -; AVX: vpshufd $49 -; AVX: vpmuludq -; AVX: vshufps $-35 -; AVX: vpsubd -; AVX: vpsrld $1 -; AVX: vpadd -; AVX: vpsrld $2 +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $2, %xmm0, %xmm0 +; AVX-NEXT: retq + %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7> + ret <4 x i32> %div } define <8 x i32> @test2(<8 x i32> %a) { +; SSE41-LABEL: test2: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm4, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE41-NEXT: psubd %xmm3, %xmm0 +; SSE41-NEXT: psrld $1, %xmm0 +; SSE41-NEXT: paddd %xmm3, %xmm0 +; SSE41-NEXT: psrld $2, %xmm0 +; SSE41-NEXT: pmuludq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm4, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE41-NEXT: psubd %xmm2, %xmm1 +; SSE41-NEXT: psrld $1, %xmm1 +; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: psrld $2, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test2: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; 
SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: psubd %xmm3, %xmm0 +; SSE-NEXT: psrld $1, %xmm0 +; SSE-NEXT: paddd %xmm3, %xmm0 +; SSE-NEXT: psrld $2, %xmm0 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: psubd %xmm2, %xmm1 +; SSE-NEXT: psrld $1, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: psrld $2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: test2: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrld $1, %ymm0, %ymm0 +; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrld $2, %ymm0, %ymm0 +; AVX-NEXT: retq %div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7> ret <8 x i32> %div - -; AVX-LABEL: test2: -; AVX: vpbroadcastd -; AVX: vpalignr $4 -; AVX: vpmuludq -; AVX: vpmuludq -; AVX: vpblendd $170 -; AVX: vpsubd -; AVX: vpsrld $1 -; AVX: vpadd -; AVX: vpsrld $2 } define <8 x i16> @test3(<8 x i16> %a) { - %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> - ret <8 x i16> %div - ; SSE41-LABEL: test3: -; SSE41: pmulhuw -; SSE41: psubw -; SSE41: psrlw $1 -; SSE41: paddw -; SSE41: psrlw $2 - +; SSE41: # BB#0: +; SSE41-NEXT: movdqa 
{{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363] +; SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; SSE41-NEXT: psubw %xmm1, %xmm0 +; SSE41-NEXT: psrlw $1, %xmm0 +; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: psrlw $2, %xmm0 +; SSE41-NEXT: retq +; +; SSE-LABEL: test3: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363] +; SSE-NEXT: pmulhuw %xmm0, %xmm1 +; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: psrlw $1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: psrlw $2, %xmm0 +; SSE-NEXT: retq +; ; AVX-LABEL: test3: -; AVX: vpmulhuw -; AVX: vpsubw -; AVX: vpsrlw $1 -; AVX: vpaddw -; AVX: vpsrlw $2 +; AVX: # BB#0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX-NEXT: retq + %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> + ret <8 x i16> %div } define <16 x i16> @test4(<16 x i16> %a) { +; SSE41-LABEL: test4: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmulhuw %xmm2, %xmm3 +; SSE41-NEXT: psubw %xmm3, %xmm0 +; SSE41-NEXT: psrlw $1, %xmm0 +; SSE41-NEXT: paddw %xmm3, %xmm0 +; SSE41-NEXT: psrlw $2, %xmm0 +; SSE41-NEXT: pmulhuw %xmm1, %xmm2 +; SSE41-NEXT: psubw %xmm2, %xmm1 +; SSE41-NEXT: psrlw $1, %xmm1 +; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: psrlw $2, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test4: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pmulhuw %xmm2, %xmm3 +; SSE-NEXT: psubw %xmm3, %xmm0 +; SSE-NEXT: psrlw $1, %xmm0 +; SSE-NEXT: paddw %xmm3, %xmm0 +; SSE-NEXT: psrlw $2, %xmm0 +; SSE-NEXT: pmulhuw %xmm1, %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm1 +; SSE-NEXT: psrlw $1, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: psrlw $2, %xmm1 
+; SSE-NEXT: retq +; +; AVX-LABEL: test4: +; AVX: # BB#0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1 +; AVX-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrlw $2, %ymm0, %ymm0 +; AVX-NEXT: retq %div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7> ret <16 x i16> %div - -; AVX-LABEL: test4: -; AVX: vpmulhuw -; AVX: vpsubw -; AVX: vpsrlw $1 -; AVX: vpaddw -; AVX: vpsrlw $2 -; AVX-NOT: vpmulhuw } define <8 x i16> @test5(<8 x i16> %a) { - %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> - ret <8 x i16> %div - ; SSE41-LABEL: test5: -; SSE41: pmulhw -; SSE41: psrlw $15 -; SSE41: psraw $1 -; SSE41: paddw - +; SSE41: # BB#0: +; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $15, %xmm1 +; SSE41-NEXT: psraw $1, %xmm0 +; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; SSE-LABEL: test5: +; SSE: # BB#0: +; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlw $15, %xmm1 +; SSE-NEXT: psraw $1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: retq +; ; AVX-LABEL: test5: -; AVX: vpmulhw -; AVX: vpsrlw $15 -; AVX: vpsraw $1 -; AVX: vpaddw +; AVX: # BB#0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> + ret <8 x i16> %div } define <16 x i16> @test6(<16 x i16> %a) { +; SSE41-LABEL: test6: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725] +; SSE41-NEXT: pmulhw %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $15, %xmm3 +; SSE41-NEXT: psraw $1, %xmm0 +; SSE41-NEXT: paddw %xmm3, %xmm0 +; SSE41-NEXT: pmulhw 
%xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $15, %xmm2 +; SSE41-NEXT: psraw $1, %xmm1 +; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test6: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725] +; SSE-NEXT: pmulhw %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrlw $15, %xmm3 +; SSE-NEXT: psraw $1, %xmm0 +; SSE-NEXT: paddw %xmm3, %xmm0 +; SSE-NEXT: pmulhw %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlw $15, %xmm2 +; SSE-NEXT: psraw $1, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: test6: +; AVX: # BB#0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vpsrlw $15, %ymm0, %ymm1 +; AVX-NEXT: vpsraw $1, %ymm0, %ymm0 +; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq %div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7> ret <16 x i16> %div - -; AVX-LABEL: test6: -; AVX: vpmulhw -; AVX: vpsrlw $15 -; AVX: vpsraw $1 -; AVX: vpaddw -; AVX-NOT: vpmulhw } define <16 x i8> @test7(<16 x i8> %a) { - %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7> - ret <16 x i8> %div - -; FIXME: scalarized ; SSE41-LABEL: test7: -; SSE41: pext +; SSE41: # BB#0: +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pextrb $0, %xmm0, %ecx +; SSE41-NEXT: movsbl %cl, %ecx +; SSE41-NEXT: imull $-109, %ecx, %edx +; SSE41-NEXT: shrl $8, %edx +; SSE41-NEXT: addb %dl, %cl +; SSE41-NEXT: movb %cl, %dl +; SSE41-NEXT: shrb $7, %dl +; SSE41-NEXT: sarb $2, %cl +; SSE41-NEXT: addb %dl, %cl +; SSE41-NEXT: movzbl %cl, 
%ecx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrb $1, %eax, %xmm1 +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm1 +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: pextrb $4, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm1 +; SSE41-NEXT: pextrb $5, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm1 +; SSE41-NEXT: pextrb $6, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm1 +; SSE41-NEXT: pextrb $7, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; 
SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm1 +; SSE41-NEXT: pextrb $8, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm1 +; SSE41-NEXT: pextrb $9, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm1 +; SSE41-NEXT: pextrb $10, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm1 +; SSE41-NEXT: pextrb $11, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm1 +; SSE41-NEXT: pextrb $12, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm1 +; SSE41-NEXT: pextrb $13, %xmm0, %eax +; SSE41-NEXT: 
movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $13, %eax, %xmm1 +; SSE41-NEXT: pextrb $14, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm1 +; SSE41-NEXT: pextrb $15, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; SSE-LABEL: test7: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; 
SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm3 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, 
%ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm3 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: 
sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm4 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw 
{{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: retq +; ; AVX-LABEL: test7: -; AVX: pext +; AVX: # BB#0: +; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: movsbl %al, %eax +; AVX-NEXT: imull $-109, %eax, %ecx +; AVX-NEXT: shrl $8, %ecx +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: movb %al, %cl +; AVX-NEXT: shrb $7, %cl +; AVX-NEXT: sarb $2, %al +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpextrb $0, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %dl +; AVX-NEXT: shrb $7, %dl +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movzbl %cl, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $3, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $4, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; 
AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $5, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $6, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $7, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $8, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $9, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $10, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: 
vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $11, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $12, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $13, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $14, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $15, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq + %div = sdiv 
<16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7> + ret <16 x i8> %div } define <4 x i32> @test8(<4 x i32> %a) { - %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7> - ret <4 x i32> %div - ; SSE41-LABEL: test8: -; SSE41: pmuldq -; SSE41: pshufd $49 -; SSE41-NOT: pshufd $49 -; SSE41: pmuldq -; SSE41: shufps $-35 -; SSE41: pshufd $-40 -; SSE41: padd -; SSE41: psrld $31 -; SSE41: psrad $2 -; SSE41: padd - +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm2, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $31, %xmm0 +; SSE41-NEXT: psrad $2, %xmm1 +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; ; SSE-LABEL: test8: -; SSE: psrad $31 -; SSE: pand -; SSE: paddd -; SSE: pmuludq -; SSE: pshufd $49 -; SSE-NOT: pshufd $49 -; SSE: pmuludq -; SSE: shufps $-35 -; SSE: pshufd $-40 -; SSE: psubd -; SSE: padd -; SSE: psrld $31 -; SSE: psrad $2 -; SSE: padd - +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; 
SSE-NEXT: psubd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $31, %xmm0 +; SSE-NEXT: psrad $2, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; ; AVX-LABEL: test8: -; AVX: vpmuldq -; AVX: vpshufd $49 -; AVX-NOT: vpshufd $49 -; AVX: vpmuldq -; AVX: vshufps $-35 -; AVX: vpshufd $-40 -; AVX: vpadd -; AVX: vpsrld $31 -; AVX: vpsrad $2 -; AVX: vpadd +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vpmuldq %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrld $31, %xmm0, %xmm1 +; AVX-NEXT: vpsrad $2, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7> + ret <4 x i32> %div } define <8 x i32> @test9(<8 x i32> %a) { +; SSE41-LABEL: test9: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; SSE41-NEXT: # kill: XMM0<def> XMM3<kill> +; SSE41-NEXT: pmuldq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm4, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm5[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE41-NEXT: paddd %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrld $31, %xmm3 +; SSE41-NEXT: psrad $2, %xmm0 +; SSE41-NEXT: paddd %xmm3, %xmm0 +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm4, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[0,2,1,3] +; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrld $31, %xmm2 +; SSE41-NEXT: psrad $2, %xmm1 +; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test9: +; SSE: # BB#0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrad $31, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrad $31, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: paddd %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: psubd %xmm5, %xmm0 +; SSE-NEXT: paddd %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrld $31, %xmm3 +; SSE-NEXT: psrad $2, %xmm0 +; SSE-NEXT: paddd %xmm3, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: paddd %xmm4, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: pmuludq %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: psubd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrld $31, %xmm2 +; SSE-NEXT: psrad $2, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: test9: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuldq %ymm1, 
%ymm0, %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vpsrld $31, %ymm0, %ymm1 +; AVX-NEXT: vpsrad $2, %ymm0, %ymm0 +; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq %div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7> ret <8 x i32> %div - -; AVX-LABEL: test9: -; AVX: vpalignr $4 -; AVX: vpbroadcastd -; AVX: vpmuldq -; AVX: vpmuldq -; AVX: vpblendd $170 -; AVX: vpadd -; AVX: vpsrld $31 -; AVX: vpsrad $2 -; AVX: vpadd } define <8 x i32> @test10(<8 x i32> %a) { +; SSE41-LABEL: test10: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm4, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psubd %xmm3, %xmm5 +; SSE41-NEXT: psrld $1, %xmm5 +; SSE41-NEXT: paddd %xmm3, %xmm5 +; SSE41-NEXT: psrld $2, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7] +; SSE41-NEXT: pmulld %xmm3, %xmm5 +; SSE41-NEXT: psubd %xmm5, %xmm0 +; SSE41-NEXT: pmuludq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm4, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psubd %xmm2, %xmm4 +; SSE41-NEXT: psrld $1, %xmm4 +; SSE41-NEXT: paddd %xmm2, %xmm4 +; SSE41-NEXT: psrld $2, %xmm4 +; SSE41-NEXT: pmulld %xmm3, %xmm4 +; SSE41-NEXT: psubd %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test10: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = 
[613566757,613566757,613566757,613566757] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psubd %xmm3, %xmm5 +; SSE-NEXT: psrld $1, %xmm5 +; SSE-NEXT: paddd %xmm3, %xmm5 +; SSE-NEXT: psrld $2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm5 +; SSE-NEXT: pmuludq %xmm3, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: psubd %xmm5, %xmm0 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psubd %xmm2, %xmm4 +; SSE-NEXT: psrld $1, %xmm4 +; SSE-NEXT: paddd %xmm2, %xmm4 +; SSE-NEXT: psrld $2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: psubd %xmm4, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: test10: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm2 +; AVX-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX-NEXT: 
vpaddd %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vpsrld $2, %ymm1, %ymm1 +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq %rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7> ret <8 x i32> %rem - -; AVX-LABEL: test10: -; AVX: vpbroadcastd -; AVX: vpalignr $4 -; AVX: vpmuludq -; AVX: vpmuludq -; AVX: vpblendd $170 -; AVX: vpsubd -; AVX: vpsrld $1 -; AVX: vpadd -; AVX: vpsrld $2 -; AVX: vpmulld } define <8 x i32> @test11(<8 x i32> %a) { +; SSE41-LABEL: test11: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmuldq %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm4, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE41-NEXT: paddd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: psrld $31, %xmm5 +; SSE41-NEXT: psrad $2, %xmm3 +; SSE41-NEXT: paddd %xmm5, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7] +; SSE41-NEXT: pmulld %xmm5, %xmm3 +; SSE41-NEXT: psubd %xmm3, %xmm0 +; SSE41-NEXT: pmuldq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm4, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE41-NEXT: paddd %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrld $31, %xmm3 +; SSE41-NEXT: psrad $2, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: pmulld %xmm5, %xmm2 +; SSE41-NEXT: psubd %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test11: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; 
SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psrad $31, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: paddd %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pmuludq %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: psubd %xmm6, %xmm7 +; SSE-NEXT: paddd %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrld $31, %xmm4 +; SSE-NEXT: psrad $2, %xmm7 +; SSE-NEXT: paddd %xmm4, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm7 +; SSE-NEXT: pmuludq %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: psubd %xmm7, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrad $31, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: paddd %xmm3, %xmm6 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: psubd %xmm6, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrld $31, %xmm3 +; SSE-NEXT: psrad $2, %xmm2 +; SSE-NEXT: paddd %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: psubd %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: test11: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] 
+; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm1 +; AVX-NEXT: vpsrld $31, %ymm1, %ymm2 +; AVX-NEXT: vpsrad $2, %ymm1, %ymm1 +; AVX-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq %rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7> ret <8 x i32> %rem - -; AVX-LABEL: test11: -; AVX: vpalignr $4 -; AVX: vpbroadcastd -; AVX: vpmuldq -; AVX: vpmuldq -; AVX: vpblendd $170 -; AVX: vpadd -; AVX: vpsrld $31 -; AVX: vpsrad $2 -; AVX: vpadd -; AVX: vpmulld } define <2 x i16> @test12() { +; SSE41-LABEL: test12: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; SSE-LABEL: test12: +; SSE: # BB#0: +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test12: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq %I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0 %I9 = insertelement <2 x i16> %I8, i16 -1, i32 1 %B9 = urem <2 x i16> %I9, %I9 ret <2 x i16> %B9 +} -; AVX-LABEL: test12: -; AVX: xorps +define <4 x i32> @PR20355(<4 x i32> %a) { +; SSE41-LABEL: PR20355: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: psrld $31, %xmm1 +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; SSE-LABEL: 
PR20355: +; SSE: # BB#0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: paddd %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: psubd %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $31, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: PR20355: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,3],xmm0[1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vpsrld $31, %xmm0, %xmm1 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %sdiv = sdiv <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3> + ret <4 x i32> %sdiv } diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll new file mode 100644 index 0000000..7a329d7 --- /dev/null +++ b/test/CodeGen/X86/vector-sext.ll @@ -0,0 +1,943 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; +; Just one 32-bit run to make sure we do reasonable things there. +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=i686 -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41 + +define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_8i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: # kill: XMM0<def> XMM1<kill> +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_8i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: # kill: XMM0<def> XMM1<kill> +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $16, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_8i32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pmovzxwd %xmm1, %xmm0 +; SSE41-NEXT: pslld $16, %xmm0 +; SSE41-NEXT: psrad $16, %xmm0 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pslld $16, %xmm1 +; SSE41-NEXT: psrad $16, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_8i16_to_8i32: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_8i16_to_8i32: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovsxwd %xmm0, 
%ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: sext_8i16_to_8i32: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE41-NEXT: pmovzxwd %xmm1, %xmm0 +; X32-SSE41-NEXT: pslld $16, %xmm0 +; X32-SSE41-NEXT: psrad $16, %xmm0 +; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X32-SSE41-NEXT: pslld $16, %xmm1 +; X32-SSE41-NEXT: psrad $16, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %B = sext <8 x i16> %A to <8 x i32> + ret <8 x i32>%B +} + +define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_4i32_to_4i64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i32_to_4i64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm0 +; 
SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i32_to_4i64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i32_to_4i64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i32_to_4i64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: sext_4i32_to_4i64: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; X32-SSE41-NEXT: movd %xmm1, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <4 x i32> %A to <4 x i64> + ret <4 x i64>%B +} + +define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) { +; SSE2-LABEL: 
load_sext_test1: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movq (%rdi), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test1: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movq (%rdi), %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test1: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_test1: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_test1: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i16>* %ptr + %Y = sext <4 x i16> %X to <4 x i32> + ret <4 x i32>%Y +} + +define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) { +; SSE2-LABEL: load_sext_test2: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movd (%rdi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test2: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test2: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_test2: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_test2: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i8>* %ptr + %Y 
= sext <4 x i8> %X to <4 x i32> + ret <4 x i32>%Y +} + +define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) { +; SSE2-LABEL: load_sext_test3: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsbq 1(%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movsbq (%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test3: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsbq 1(%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movsbq (%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test3: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_test3: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_test3: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i8>* %ptr + %Y = sext <2 x i8> %X to <2 x i64> + ret <2 x i64>%Y +} + +define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) { +; SSE2-LABEL: load_sext_test4: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movswq 2(%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movswq (%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test4: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movswq 2(%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movswq (%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test4: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxwq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_test4: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; 
X32-SSE41-LABEL: load_sext_test4: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i16>* %ptr + %Y = sext <2 x i16> %X to <2 x i64> + ret <2 x i64>%Y +} + +define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) { +; SSE2-LABEL: load_sext_test5: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movslq 4(%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movslq (%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test5: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movslq 4(%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movslq (%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test5: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxdq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_test5: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxdq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_test5: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i32>* %ptr + %Y = sext <2 x i32> %X to <2 x i64> + ret <2 x i64>%Y +} + +define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) { +; SSE2-LABEL: load_sext_test6: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movq (%rdi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test6: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movq (%rdi), %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test6: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 +; SSE41-NEXT: 
retq +; +; AVX-LABEL: load_sext_test6: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_test6: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i8>* %ptr + %Y = sext <8 x i8> %X to <8 x i16> + ret <8 x i16>%Y +} + +define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { +; SSE2-LABEL: sext_4i1_to_4i64: +; SSE2: # BB#0: +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i1_to_4i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: 
punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i1_to_4i64: +; SSE41: # BB#0: +; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i1_to_4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i1_to_4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: sext_4i1_to_4i64: +; X32-SSE41: # BB#0: +; X32-SSE41-NEXT: pslld $31, %xmm0 +; X32-SSE41-NEXT: psrad $31, %xmm0 +; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; X32-SSE41-NEXT: movd %xmm1, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %ecx +; 
X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl + %extmask = sext <4 x i1> %mask to <4 x i64> + ret <4 x i64> %extmask +} + +define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) { +; SSE2-LABEL: sext_16i8_to_16i16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_16i16: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psllw $8, %xmm0 +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: psllw $8, %xmm1 +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_16i16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbw %xmm1, %xmm0 +; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: psraw $8, %xmm0 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: psllw $8, %xmm1 +; SSE41-NEXT: psraw $8, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_16i16: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_16i16: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: sext_16i8_to_16i16: +; 
X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movdqa (%eax), %xmm1 +; X32-SSE41-NEXT: pmovzxbw %xmm1, %xmm0 +; X32-SSE41-NEXT: psllw $8, %xmm0 +; X32-SSE41-NEXT: psraw $8, %xmm0 +; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE41-NEXT: psllw $8, %xmm1 +; X32-SSE41-NEXT: psraw $8, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <16 x i8>* %ptr + %Y = sext <16 x i8> %X to <16 x i16> + ret <16 x i16> %Y +} + +define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { +; SSE2-LABEL: sext_4i8_to_4i64: +; SSE2: # BB#0: +; SSE2-NEXT: pslld $24, %xmm0 +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i8_to_4i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: pslld $24, %xmm0 +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i8_to_4i64: +; SSE41: # BB#0: +; SSE41-NEXT: pslld $24, %xmm0 +; SSE41-NEXT: psrad $24, %xmm0 +; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i8_to_4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i8_to_4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: sext_4i8_to_4i64: +; X32-SSE41: # BB#0: +; X32-SSE41-NEXT: pslld $24, %xmm0 +; X32-SSE41-NEXT: psrad $24, %xmm0 +; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; X32-SSE41-NEXT: movd %xmm1, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; 
X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl + %extmask = sext <4 x i8> %mask to <4 x i64> + ret <4 x i64> %extmask +} + +define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { +; SSE2-LABEL: load_sext_4i8_to_4i64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movd (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movsbq %al, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movsbq %al, %rax +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movsbq %al, %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movsbq %al, %rax +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i8_to_4i64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movd (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movsbq %al, %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movsbq %al, %rax +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movsbq %al, %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movsbq %al, %rax +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i8_to_4i64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxdq %xmm1, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movsbq %al, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: movsbq %al, %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: movsbq %al, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm1, %rax +; SSE41-NEXT: movsbq %al, %rax +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i8_to_4i64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i8_to_4i64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_4i8_to_4i64: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movd (%eax), %xmm0 +; X32-SSE41-NEXT: pmovzxbd %xmm0, %xmm1 +; X32-SSE41-NEXT: pmovzxbq %xmm0, %xmm2 +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: movsbl 
%al, %eax +; X32-SSE41-NEXT: movd %eax, %xmm0 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax +; X32-SSE41-NEXT: movsbl %al, %eax +; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: movsbl %al, %eax +; X32-SSE41-NEXT: movd %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax +; X32-SSE41-NEXT: movsbl %al, %eax +; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i8>* %ptr + %Y = sext <4 x i8> %X to <4 x i64> + ret <4 x i64>%Y +} + +define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { +; SSE2-LABEL: load_sext_4i16_to_4i64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movq (%rdi), %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movswq %ax, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movswq %ax, %rax +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movswq %ax, %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movswq %ax, %rax +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i16_to_4i64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movq (%rdi), %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movswq %ax, %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movswq %ax, %rax +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movswq %ax, %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movswq %ax, %rax +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i16_to_4i64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movq (%rdi), %xmm0 +; SSE41-NEXT: pmovzxwd %xmm0, %xmm1 +; SSE41-NEXT: pmovzxwq %xmm0, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movswq %ax, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: movswq %ax, %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: movswq %ax, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm1, %rax +; SSE41-NEXT: movswq %ax, %rax +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i16_to_4i64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i16_to_4i64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; 
X32-SSE41-LABEL: load_sext_4i16_to_4i64: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movsd (%eax), %xmm0 +; X32-SSE41-NEXT: pmovzxwd %xmm0, %xmm1 +; X32-SSE41-NEXT: pmovzxwq %xmm0, %xmm2 +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: cwtl +; X32-SSE41-NEXT: movd %eax, %xmm0 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax +; X32-SSE41-NEXT: cwtl +; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: cwtl +; X32-SSE41-NEXT: movd %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax +; X32-SSE41-NEXT: cwtl +; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i16>* %ptr + %Y = sext <4 x i16> %X to <4 x i64> + ret <4 x i64>%Y +} diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index 4da7e42..30ad366 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1,196 +1,1110 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 
-x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; FIXME: SSE2 should look like the following: +; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00 +; FIXME: # BB#0: +; FIXME-NEXT: punpcklbw %xmm0, %xmm0 +; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] +; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1] +; FIXME-NEXT: retq +; +; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: 
shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: 
shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7] -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,2,4,5,6,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, 
i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: punpcklwd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; SSE: # BB#0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; SSE: # BB#0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX-NEXT: retq %shuffle = 
shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; SSE2: # BB#0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 
8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07: +; SSE: # BB#0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_0101010101010101 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; FIXME: SSE2 should be the following: +; FIXME-LABEL: @shuffle_v16i8_0101010101010101 +; FIXME: # BB#0: +; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] +; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1] +; FIXME-NEXT: retq +; +; SSE2-LABEL: shuffle_v16i8_0101010101010101: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_0101010101010101: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: 
shuffle_v16i8_0101010101010101: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v16i8_0101010101010101: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_0101010101010101: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23 -; CHECK-SSE2: punpcklbw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: +; SSE: # BB#0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> ret <16 x i8> %shuffle } +define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31: +; SSE: # BB#0: +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + ret <16 x i8> %shuffle +} + define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; SSE: # BB#0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; AVX1: # BB#0: +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12 -; CHECK-SSE2: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: punpckhbw %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: packuswb %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; 
SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20 -; CHECK-SSE2: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20 -; CHECK-SSE2: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; 
CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1] -; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1] -; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] +; SSE2-NEXT: movsd %xmm4, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = 
zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20> ret <16 x i8> %shuffle } -define <16 x i8> @zext_to_v8i16_shuffle(<16 x i8> %a) { -; CHECK-SSE2-LABEL: @zext_to_v8i16_shuffle -; CHECK-SSE2: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 - %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31> +define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) { +; SSE2-LABEL: trunc_v4i32_shuffle: +; SSE2: # BB#0: +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_v4i32_shuffle: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_v4i32_shuffle: +; 
SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: retq +; +; AVX-LABEL: trunc_v4i32_shuffle: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %shuffle +} + +define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) { +; We don't have anything useful to check here. This generates 100s of +; instructions. Instead, just make sure we survived codegen. +; ALL-LABEL: stress_test0: +; ALL: retq +entry: + %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6> + %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28> + %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8> + %s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29> + %s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29> + %s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, 
i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17> + %s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23> + %s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17> + %s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef> + %s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10> + ret <16 x i8> %s.16.0 +} + +define <16 x i8> @stress_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind { +; There is nothing interesting to check about these instructions other than +; that they survive codegen. However, we actually do better and delete all of +; them because the result is 'undef'. 
+; +; ALL-LABEL: stress_test1: +; ALL: # BB#0: # %entry +; ALL-NEXT: retq +entry: + %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0> + %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22> + %s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9> + %s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11> + %s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29> + %s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef> + %s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 9, i32 17, i32 10> + %s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef> + %s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef, i32 undef> + %s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5> + %s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef> + + ret <16 x i8> %s.12.4 +} + +define <16 x i8> @PR20540(<8 x i8> %a) { +; SSE2-LABEL: PR20540: +; SSE2: # BB#0: +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR20540: +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: PR20540: +; SSE41: # BB#0: +; 
SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: PR20540: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { +; SSE2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: 
shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 0 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { +; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; 
AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 0 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) { +; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; SSE2: # BB#0: +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; SSE41: # BB#0: +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; AVX-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 0 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, 
i32 12, i32 13, i32 14, i32 16> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { +; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %a = insertelement <16 x i8> 
undef, i8 %i, i32 3 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: +; SSE2: # BB#0: +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: +; SSSE3: # BB#0: +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: +; SSE41: # BB#0: +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: +; AVX: # BB#0: +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = 
xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 
= xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 
11, i32 12, i32 13, i32 14, i32 15, i32 16> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = 
xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> 
%shuffle +} + +define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ret <16 x i8> %shuffle } -define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) { -; CHECK-SSE2-LABEL: @zext_to_v4i32_shuffle -; CHECK-SSE2: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 +define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: 
shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: +; SSSE3: # BB#0: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: 
+; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd %xmm0, %xmm0 +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31> ret <16 x i8> %shuffle } -define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) { -; CHECK-SSE2-LABEL: @trunc_v4i32_shuffle -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pand -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0 -; CHECK-SSE2-NEXT: retq - %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: +; SSSE3: # BB#0: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbw %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbw %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef> + ret <16 x i8> %shuffle +} + +define <16 x i8> 
@shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbw %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbw %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31> ret <16 x i8> %shuffle } + +define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; 
SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,1,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: packuswb %xmm0, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7] +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX-NEXT: retq +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0> + + ret <16 x i8> %shuffle +} + +define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) { +; Nothing interesting to test here. Just make sure we didn't crash. +; ALL-LABEL: stress_test2: +; ALL: retq +entry: + %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5> + %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22> + %s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19> + + ret <16 x i8> %s.2.0 +} + +define void @constant_gets_selected() { +; ALL-LABEL: constant_gets_selected: +; ALL-NOT: movd $0, {{%xmm[0-9]+}} + %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8> + %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27> + %weirder_zero = bitcast
<16 x i8> %shuffle.i to <4 x i32> + store <4 x i32> %weirder_zero, <4 x i32>* undef, align 16 + store <4 x i32> zeroinitializer, <4 x i32>* undef, align 16 + ret void +} diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll index 78b4ee7..9affee9 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -1,219 +1,1138 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" define <2 x i64> @shuffle_v2i64_00(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_00 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: 
shuffle_v2i64_00: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: shuffle_v2i64_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 0> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_10(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_10 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,3,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_10: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_10: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 0> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_11(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_11 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_11: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_11: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_22(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_22 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_22: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: shuffle_v2i64_22: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_22: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm0 +; 
AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 2> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_32(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_32 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_32: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_32: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 2> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_33 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_33: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_33: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 3> ret <2 x i64> %shuffle } define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2f64_00 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0,0] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2f64_00: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_00: +; SSE3: # BB#0: +; SSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2f64_00: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_00: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_00: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x 
double> %b, <2 x i32> <i32 0, i32 0> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2f64_10 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2f64_10: +; SSE: # BB#0: +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_10: +; AVX: # BB#0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 0> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2f64_11 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2f64_11: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_11: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) { -; FIXME: Should these use movapd + shufpd to remove a domain change at the cost -; of a mov? 
+; SSE2-LABEL: shuffle_v2f64_22: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_22: +; SSE3: # BB#0: +; SSE3-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] +; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: retq ; -; CHECK-SSE2-LABEL: @shuffle_v2f64_22 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1] -; CHECK-SSE2-NEXT: retq +; SSSE3-LABEL: shuffle_v2f64_22: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] +; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_22: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_22: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0,0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2f64_32 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2f64_32: +; SSE: # BB#0: +; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_32: +; AVX: # BB#0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 2> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2f64_33 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2f64_33: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_33: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1,1] +; AVX-NEXT: retq %shuffle = 
shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3> ret <2 x double> %shuffle } +define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) { +; SSE2-LABEL: shuffle_v2f64_03: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_03: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm0, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2f64_03: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_03: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_03: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} +define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) { +; SSE2-LABEL: shuffle_v2f64_21: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_21: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2f64_21: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_21: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_21: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1> + ret <2 x double> %shuffle +} define <2 x i64> @shuffle_v2i64_02(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_02 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_02: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_02: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_02_copy -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_02_copy: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_02_copy: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_03 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_03: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_03: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm0, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_03: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_03: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v2i64_03: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_03: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3> 
ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_03_copy -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_03_copy: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_03_copy: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm1, %xmm2 +; SSE3-NEXT: movaps %xmm2, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_03_copy: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm2 +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_03_copy: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v2i64_03_copy: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_03_copy: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_12 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_12: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_12: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_12: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_12: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = 
xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_12: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_12_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_12_copy -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_12_copy: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_12_copy: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] +; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_12_copy: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_12_copy: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_12_copy: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_13(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_13 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_13: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_13: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq 
{{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_13_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_13_copy -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_13_copy: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_13_copy: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_20(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_20 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_20: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_20: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_20_copy -; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[0] -; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_20_copy: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_20_copy: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, 
<2 x i32> <i32 2, i32 0> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_21 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_21: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_21: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_21: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_21: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v2i64_21: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_21: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_21_copy -; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1] -; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_21_copy: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_21_copy: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm2, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_21_copy: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm2, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_21_copy: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: 
shuffle_v2i64_21_copy: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_21_copy: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_30(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_30 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_30: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_30: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_30: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_30: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_30: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_30_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_30_copy -; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[0] -; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_30_copy: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_30_copy: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm2 = 
xmm2[1],xmm1[0] +; SSE3-NEXT: movapd %xmm2, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_30_copy: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_30_copy: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_30_copy: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_31(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_31 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[1] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_31: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_31: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_31_copy -; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_31_copy: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_31_copy: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm1[1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1> ret <2 x i64> %shuffle } + 
+define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) { +; SSE-LABEL: shuffle_v2i64_0z: +; SSE: # BB#0: +; SSE-NEXT: movq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_0z: +; AVX: # BB#0: +; AVX-NEXT: vmovq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) { +; SSE-LABEL: shuffle_v2i64_1z: +; SSE: # BB#0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_1z: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 1, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) { +; SSE-LABEL: shuffle_v2i64_z0: +; SSE: # BB#0: +; SSE-NEXT: movq %xmm0, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_z0: +; AVX: # BB#0: +; AVX-NEXT: vmovq %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 0> + ret <2 x i64> %shuffle +} + +define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) { +; SSE2-LABEL: shuffle_v2i64_z1: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_z1: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_z1: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_z1: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; 
AVX1-LABEL: shuffle_v2i64_z1: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_z1: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq + %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 1> + ret <2 x i64> %shuffle +} + +define <2 x double> @shuffle_v2f64_0z(<2 x double> %a) { +; SSE-LABEL: shuffle_v2f64_0z: +; SSE: # BB#0: +; SSE-NEXT: movq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_0z: +; AVX: # BB#0: +; AVX-NEXT: vmovq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} + +define <2 x double> @shuffle_v2f64_1z(<2 x double> %a) { +; SSE-LABEL: shuffle_v2f64_1z: +; SSE: # BB#0: +; SSE-NEXT: xorpd %xmm1, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_1z: +; AVX: # BB#0: +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 3> + ret <2 x double> %shuffle +} + +define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) { +; SSE-LABEL: shuffle_v2f64_z0: +; SSE: # BB#0: +; SSE-NEXT: xorpd %xmm1, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_z0: +; AVX: # BB#0: +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 0> + ret <2 x double> %shuffle +} + +define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) { +; SSE2-LABEL: shuffle_v2f64_z1: +; SSE2: 
# BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_z1: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2f64_z1: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_z1: +; SSE41: # BB#0: +; SSE41-NEXT: xorpd %xmm1, %xmm1 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_z1: +; AVX: # BB#0: +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1> + ret <2 x double> %shuffle +} + +define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) { +; SSE-LABEL: insert_reg_and_zero_v2i64: +; SSE: # BB#0: +; SSE-NEXT: movd %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_and_zero_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vmovq %rdi, %xmm0 +; AVX-NEXT: retq + %v = insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) { +; SSE-LABEL: insert_mem_and_zero_v2i64: +; SSE: # BB#0: +; SSE-NEXT: movq (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_and_zero_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vmovq (%rdi), %xmm0 +; AVX-NEXT: retq + %a = load i64* %ptr + %v = insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x double> @insert_reg_and_zero_v2f64(double %a) { +; SSE-LABEL: insert_reg_and_zero_v2f64: +; SSE: # BB#0: +; SSE-NEXT: movq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_and_zero_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovq 
%xmm0, %xmm0 +; AVX-NEXT: retq + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} + +define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) { +; SSE-LABEL: insert_mem_and_zero_v2f64: +; SSE: # BB#0: +; SSE-NEXT: movsd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_and_zero_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovsd (%rdi), %xmm0 +; AVX-NEXT: retq + %a = load double* %ptr + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} + +define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) { +; SSE2-LABEL: insert_reg_lo_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movd %rdi, %xmm1 +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_reg_lo_v2i64: +; SSE3: # BB#0: +; SSE3-NEXT: movd %rdi, %xmm1 +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_reg_lo_v2i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %rdi, %xmm1 +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_reg_lo_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: movd %rdi, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: insert_reg_lo_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovq %rdi, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_reg_lo_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq + %v = insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) { +; SSE2-LABEL: insert_mem_lo_v2i64: +; SSE2: # BB#0: 
+; SSE2-NEXT: movlpd (%rdi), %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_mem_lo_v2i64: +; SSE3: # BB#0: +; SSE3-NEXT: movlpd (%rdi), %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_mem_lo_v2i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movlpd (%rdi), %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_mem_lo_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: movq (%rdi), %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: insert_mem_lo_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovq (%rdi), %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_mem_lo_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovq (%rdi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq + %a = load i64* %ptr + %v = insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) { +; SSE-LABEL: insert_reg_hi_v2i64: +; SSE: # BB#0: +; SSE-NEXT: movd %rdi, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_hi_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vmovq %rdi, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %v = insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0> + ret <2 x i64> %shuffle +} + +define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) { +; SSE-LABEL: insert_mem_hi_v2i64: +; SSE: # BB#0: +; SSE-NEXT: movq (%rdi), %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_hi_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vmovq (%rdi), %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %a = load i64* %ptr + %v = insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = 
shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0> + ret <2 x i64> %shuffle +} + +define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) { +; SSE-LABEL: insert_reg_lo_v2f64: +; SSE: # BB#0: +; SSE-NEXT: movsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_lo_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} + +define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) { +; SSE-LABEL: insert_mem_lo_v2f64: +; SSE: # BB#0: +; SSE-NEXT: movlpd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_lo_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovlpd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %a = load double* %ptr + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} + +define <2 x double> @insert_reg_hi_v2f64(double %a, <2 x double> %b) { +; SSE-LABEL: insert_reg_hi_v2f64: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_hi_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0> + ret <2 x double> %shuffle +} + +define <2 x double> @insert_mem_hi_v2f64(double* %ptr, <2 x double> %b) { +; SSE-LABEL: insert_mem_hi_v2f64: +; SSE: # BB#0: +; SSE-NEXT: movhpd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_hi_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovhpd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %a = load double* %ptr + %v = insertelement <2 x double> undef, double %a, i32 0 + 
%shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0> + ret <2 x double> %shuffle +} + +define <2 x double> @insert_dup_reg_v2f64(double %a) { +; FIXME: We should match movddup for SSE3 and higher here. +; +; SSE2-LABEL: insert_dup_reg_v2f64: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_dup_reg_v2f64: +; SSE3: # BB#0: +; SSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_dup_reg_v2f64: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_reg_v2f64: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_reg_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0> + ret <2 x double> %shuffle +} +define <2 x double> @insert_dup_mem_v2f64(double* %ptr) { +; SSE2-LABEL: insert_dup_mem_v2f64: +; SSE2: # BB#0: +; SSE2-NEXT: movsd (%rdi), %xmm0 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_dup_mem_v2f64: +; SSE3: # BB#0: +; SSE3-NEXT: movddup (%rdi), %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_dup_mem_v2f64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movddup (%rdi), %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_mem_v2f64: +; SSE41: # BB#0: +; SSE41-NEXT: movddup (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_mem_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup (%rdi), %xmm0 +; AVX-NEXT: retq + %a = load double* %ptr + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0> + ret <2 x double> %shuffle +} + +define <2 x double> @shuffle_mem_v2f64_10(<2 x double>* %ptr) { +; SSE-LABEL: 
shuffle_mem_v2f64_10: +; SSE: # BB#0: +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_mem_v2f64_10: +; AVX: # BB#0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0] +; AVX-NEXT: retq + %a = load <2 x double>* %ptr + %shuffle = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 0> + ret <2 x double> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index 7d496fa..833b822 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1,170 +1,1386 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target 
triple = "x86_64-unknown-unknown" define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0001 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,0,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0001: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0001: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0020 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,0,2,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0020: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0020: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> ret <4 x i32> %shuffle } +define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: shuffle_v4i32_0112: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0112: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2> + ret <4 x i32> %shuffle +} define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0300 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,3,0,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0300: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0300: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX-NEXT: retq %shuffle = shufflevector 
<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_1000 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[1,0,0,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_1000: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_1000: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_2200 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,2,0,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_2200: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_2200: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_3330 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[3,3,3,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_3330: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_3330: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_3210 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[3,2,1,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_3210: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[3,2,1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_3210: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ret <4 x i32> %shuffle } +define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: shuffle_v4i32_2121: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_2121: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,1] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1> + ret <4 x i32> %shuffle +} + define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_0001 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,0,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_0001: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_0001: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_0020 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,0,2,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_0020: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_0020: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_0300 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,3,0,0] -; 
CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_0300: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_0300: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_1000 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[1,0,0,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_1000: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_1000: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_2200 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[2,2,0,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_2200: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_2200: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_3330 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[3,3,3,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_3330: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_3330: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> 
<i32 3, i32 3, i32 3, i32 0> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_3210 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[3,2,1,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_3210: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_3210: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ret <4 x float> %shuffle } +define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: shuffle_v4f32_0011: +; SSE: # BB#0: +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_0011: +; AVX: # BB#0: +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: shuffle_v4f32_2233: +; SSE: # BB#0: +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_2233: +; AVX: # BB#0: +; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: shuffle_v4f32_0022: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_0022: +; SSE3: # BB#0: +; SSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_0022: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_0022: +; 
SSE41: # BB#0: +; SSE41-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_0022: +; AVX: # BB#0: +; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: shuffle_v4f32_1133: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_1133: +; SSE3: # BB#0: +; SSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_1133: +; SSSE3: # BB#0: +; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_1133: +; SSE41: # BB#0: +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_1133: +; AVX: # BB#0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3> + ret <4 x float> %shuffle +} define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0124 -; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v4i32_0124: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_0124: +; SSE3: # BB#0: +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_0124: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-NEXT: retq 
+; +; SSE41-LABEL: shuffle_v4i32_0124: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0124: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0142 -; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,2] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0142: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0142: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0412 -; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[2,0],xmm0[1,2] -; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0412: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0412: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[1,2] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) { -; 
CHECK-SSE2-LABEL: @shuffle_v4i32_4012 -; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[0,2],xmm0[1,2] -; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_4012: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_4012: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,2] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0145 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0145: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0145: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0451 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,3,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0451: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0451: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1> ret <4 x 
i32> %shuffle } define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_4501 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_4501: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_4501: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_4015 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0,1,3] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_4015: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_4015: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5> ret <4 x i32> %shuffle } + +define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_4zzz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_4zzz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_4zzz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_4zzz: +; SSE41: # 
BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_4zzz: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_z4zz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_z4zz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_z4zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_z4zz: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_z4zz: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_zz4z: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_zz4z: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} 
xmm0 = xmm0[0,0],xmm1[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_zz4z: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_zz4z: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_zz4z: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_zuu4: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_zuu4: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_zuu4: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_zuu4: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_zuu4: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_zzz7: +; SSE2: # 
BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_zzz7: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_zzz7: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_zzz7: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_zzz7: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_z6zz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_z6zz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_z6zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_z6zz: +; SSE41: # BB#0: +; 
SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_z6zz: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3> + ret <4 x float> %shuffle +} + +define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) { +; SSE2-LABEL: shuffle_v4i32_4zzz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_4zzz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_4zzz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_4zzz: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_4zzz: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) { +; SSE2-LABEL: shuffle_v4i32_z4zz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_z4zz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_z4zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} 
xmm0 = xmm1[1,0,1,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_z4zz: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_z4zz: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) { +; SSE2-LABEL: shuffle_v4i32_zz4z: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_zz4z: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_zz4z: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_zz4z: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_zz4z: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) { +; SSE2-LABEL: shuffle_v4i32_zuu4: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss %xmm0, 
%xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_zuu4: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_zuu4: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_zuu4: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_zuu4: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,0] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) { +; SSE2-LABEL: shuffle_v4i32_z6zz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_z6zz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_z6zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_z6zz: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_z6zz: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} 
xmm0 = zero,xmm0[2],zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_7012: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_7012: +; SSE3: # BB#0: +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_7012: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_7012: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_7012: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_6701: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_6701: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_6701: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_6701: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = 
xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_6701: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_5670: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_5670: +; SSE3: # BB#0: +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0] +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_5670: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_5670: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_5670: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_1234: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_1234: +; SSE3: # BB#0: +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_1234: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr 
{{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_1234: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_1234: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_2345: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_2345: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_2345: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_2345: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_2345: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_3456: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_3456: +; SSE3: # BB#0: +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] +; SSE3-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_3456: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_3456: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_3456: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_0u1u: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_0u1u: +; SSE3: # BB#0: +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_0u1u: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_0u1u: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxdq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0u1u: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxdq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) { +; SSE2-LABEL: shuffle_v4i32_0z1z: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_0z1z: +; SSE3: # BB#0: +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_0z1z: +; SSSE3: # 
BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_0z1z: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxdq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0z1z: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxdq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + ret <4 x i32> %shuffle +} + +define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) { +; SSE-LABEL: insert_reg_and_zero_v4i32: +; SSE: # BB#0: +; SSE-NEXT: movd %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_and_zero_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: retq + %v = insertelement <4 x i32> undef, i32 %a, i32 0 + %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x i32> %shuffle +} + +define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) { +; SSE-LABEL: insert_mem_and_zero_v4i32: +; SSE: # BB#0: +; SSE-NEXT: movd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_and_zero_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovd (%rdi), %xmm0 +; AVX-NEXT: retq + %a = load i32* %ptr + %v = insertelement <4 x i32> undef, i32 %a, i32 0 + %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x i32> %shuffle +} + +define <4 x float> @insert_reg_and_zero_v4f32(float %a) { +; SSE2-LABEL: insert_reg_and_zero_v4f32: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_reg_and_zero_v4f32: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_reg_and_zero_v4f32: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: 
movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_reg_and_zero_v4f32: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_reg_and_zero_v4f32: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %v = insertelement <4 x float> undef, float %a, i32 0 + %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) { +; SSE-LABEL: insert_mem_and_zero_v4f32: +; SSE: # BB#0: +; SSE-NEXT: movss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_and_zero_v4f32: +; AVX: # BB#0: +; AVX-NEXT: vmovss (%rdi), %xmm0 +; AVX-NEXT: retq + %a = load float* %ptr + %v = insertelement <4 x float> undef, float %a, i32 0 + %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %shuffle +} + +define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) { +; SSE2-LABEL: insert_reg_lo_v4i32: +; SSE2: # BB#0: +; SSE2-NEXT: movd %rdi, %xmm1 +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_reg_lo_v4i32: +; SSE3: # BB#0: +; SSE3-NEXT: movd %rdi, %xmm1 +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_reg_lo_v4i32: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %rdi, %xmm1 +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_reg_lo_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: movd %rdi, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: insert_reg_lo_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovq %rdi, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_reg_lo_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovq %rdi, %xmm1 
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq + %a.cast = bitcast i64 %a to <2 x i32> + %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> + ret <4 x i32> %shuffle +} + +define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) { +; SSE2-LABEL: insert_mem_lo_v4i32: +; SSE2: # BB#0: +; SSE2-NEXT: movlpd (%rdi), %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_mem_lo_v4i32: +; SSE3: # BB#0: +; SSE3-NEXT: movlpd (%rdi), %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_mem_lo_v4i32: +; SSSE3: # BB#0: +; SSSE3-NEXT: movlpd (%rdi), %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_mem_lo_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: movq (%rdi), %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: insert_mem_lo_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovq (%rdi), %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_mem_lo_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovq (%rdi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq + %a = load <2 x i32>* %ptr + %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> + ret <4 x i32> %shuffle +} + +define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) { +; SSE-LABEL: insert_reg_hi_v4i32: +; SSE: # BB#0: +; SSE-NEXT: movd %rdi, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_hi_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovq %rdi, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %a.cast = bitcast i64 %a to <2 x i32> + %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, 
<4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> + ret <4 x i32> %shuffle +} + +define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) { +; SSE-LABEL: insert_mem_hi_v4i32: +; SSE: # BB#0: +; SSE-NEXT: movq (%rdi), %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_hi_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovq (%rdi), %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %a = load <2 x i32>* %ptr + %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> + ret <4 x i32> %shuffle +} + +define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) { +; SSE-LABEL: insert_reg_lo_v4f32: +; SSE: # BB#0: +; SSE-NEXT: movsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_lo_v4f32: +; AVX: # BB#0: +; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %a.cast = bitcast double %a to <2 x float> + %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) { +; SSE-LABEL: insert_mem_lo_v4f32: +; SSE: # BB#0: +; SSE-NEXT: movlpd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_lo_v4f32: +; AVX: # BB#0: +; AVX-NEXT: vmovlpd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %a = load <2 x float>* %ptr + %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> 
@insert_reg_hi_v4f32(double %a, <4 x float> %b) { +; SSE-LABEL: insert_reg_hi_v4f32: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_hi_v4f32: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq + %a.cast = bitcast double %a to <2 x float> + %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> + ret <4 x float> %shuffle +} + +define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) { +; SSE-LABEL: insert_mem_hi_v4f32: +; SSE: # BB#0: +; SSE-NEXT: movhpd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_hi_v4f32: +; AVX: # BB#0: +; AVX-NEXT: vmovhpd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %a = load <2 x float>* %ptr + %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) { +; SSE-LABEL: shuffle_mem_v4f32_3210: +; SSE: # BB#0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_mem_v4f32_3210: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0] +; AVX-NEXT: retq + %a = load <4 x float>* %ptr + %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x float> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 5d1922a..59af434 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1,493 +1,1941 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 
-x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_01012323 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,0,1,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_01012323: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_01012323: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_67452301(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_67452301 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,2,1,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_67452301: 
+; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_67452301: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_456789AB -; CHECK-SSE2: # BB#0: -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_456789AB: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_456789AB: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_456789AB: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_456789AB: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_00000000 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_00000000: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_00000000: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_00000000: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v8i16_00000000: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i16_00000000: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_00004444 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_00004444: +; SSE: # BB#0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_00004444: +; AVX: # BB#0: +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> ret <8 x i16> %shuffle } +define <8 x i16> @shuffle_v8i16_u0u1u2u3(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: shuffle_v8i16_u0u1u2u3: +; SSE: # BB#0: +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u0u1u2u3: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x 
i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3> + ret <8 x i16> %shuffle +} +define <8 x i16> @shuffle_v8i16_u4u5u6u7(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: shuffle_v8i16_u4u5u6u7: +; SSE: # BB#0: +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u4u5u6u7: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7> + ret <8 x i16> %shuffle +} define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_31206745 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_31206745: +; SSE: # BB#0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_31206745: +; AVX: # BB#0: +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_44440000(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_44440000 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_44440000: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; 
SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_44440000: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_44440000: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_44440000: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0> ret <8 x i16> %shuffle } +define <8 x i16> @shuffle_v8i16_23016745(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: shuffle_v8i16_23016745: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_23016745: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> + ret <8 x i16> %shuffle +} +define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: shuffle_v8i16_23026745: +; SSE: # BB#0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_23026745: +; AVX: # BB#0: +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 5> + ret <8 x i16> %shuffle +} +define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: shuffle_v8i16_23016747: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_23016747: +; AVX: # BB#0: +; 
AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 7> + ret <8 x i16> %shuffle +} define <8 x i16> @shuffle_v8i16_75643120(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_75643120 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,0,1] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_75643120: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_75643120: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_75643120: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_75643120: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 0> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_10545410(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_10545410 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_10545410: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[1,0,3,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_10545410: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_10545410: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_10545410: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 0> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_54105410(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_54105410 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_54105410: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_54105410: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_54105410: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_54105410: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 0> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_54101054(<8 x i16> %a, <8 
x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_54101054 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_54101054: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_54101054: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_54101054: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_54101054: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 4> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_04400440(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_04400440 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,4,4,6] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_04400440: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,4,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_04400440: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_04400440: +; SSE41: # BB#0: +; SSE41-NEXT: 
pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_04400440: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 0> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_40044004(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_40044004 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,0,0,2,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_40044004: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_40044004: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_40044004: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_40044004: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 4> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_26405173 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: 
pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_26405173: +; SSE2: # BB#0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_26405173: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_26405173: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_26405173: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 3> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_20645173 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_20645173: +; SSE2: # BB#0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_20645173: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb 
{{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_20645173: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_20645173: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 3> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_26401375 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_26401375: +; SSE2: # BB#0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_26401375: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_26401375: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_26401375: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5> ret <8 x i16> %shuffle } +define <8 x i16> @shuffle_v8i16_66751643(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: 
shuffle_v8i16_66751643: +; SSE2: # BB#0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,3,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_66751643: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_66751643: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_66751643: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 3> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_60514754(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_60514754: +; SSE2: # BB#0: +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,5,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_60514754: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_60514754: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_60514754: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 6, i32 0, i32 5, i32 1, i32 4, i32 7, i32 5, i32 4> + ret <8 x 
i16> %shuffle +} + define <8 x i16> @shuffle_v8i16_00444444(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_00444444 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_00444444: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_00444444: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_00444444: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_00444444: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_44004444(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_44004444 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,2,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_44004444: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_44004444: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] +; SSSE3-NEXT: retq +; +; 
SSE41-LABEL: shuffle_v8i16_44004444: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_44004444: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_04404444(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_04404444 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_04404444: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_04404444: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_04404444: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_04404444: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_04400000(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_04400000 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; 
CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_04400000: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_04400000: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_04400000: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_04400000: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_04404567 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_04404567: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_04404567: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_0X444444(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_0X444444 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,2,2,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = 
xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_0X444444: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0X444444: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0X444444: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0X444444: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_44X04444(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_44X04444 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,2,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_44X04444: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_44X04444: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_44X04444: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_44X04444: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX-NEXT: retq %shuffle = shufflevector <8 x 
i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 4> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_X4404444(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_X4404444 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_X4404444: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_X4404444: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_X4404444: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_X4404444: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_0127XXXX -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_0127XXXX: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0127XXXX: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb 
{{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0127XXXX: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0127XXXX: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXX4563 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,0] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_XXXX4563: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_XXXX4563: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_XXXX4563: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_XXXX4563: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_4563XXXX -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0] -; 
CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,2,3] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_4563XXXX: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_4563XXXX: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_4563XXXX: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_4563XXXX: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_01274563 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,4,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_01274563: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_01274563: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_01274563: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_01274563: +; AVX: # BB#0: +; 
AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_45630127 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,1,3] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_45630127: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_45630127: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_45630127: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_45630127: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7> ret <8 x i16> %shuffle } +define <8 x i16> @shuffle_v8i16_37102735(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_37102735: +; SSE2: # BB#0: +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,6] +; SSE2-NEXT: 
retq +; +; SSSE3-LABEL: shuffle_v8i16_37102735: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_37102735: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_37102735: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 5> + ret <8 x i16> %shuffle +} + define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_08192a3b -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_08192a3b: +; SSE: # BB#0: +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_08192a3b: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d2e3f -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3] -; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_0c1d2e3f: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0c1d2e3f: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_4c5d6e7f -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3] -; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_4c5d6e7f: +; SSE: # BB#0: +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_4c5d6e7f: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_48596a7b -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3] -; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_48596a7b: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_48596a7b: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) { -; 
CHECK-SSE2-LABEL: @shuffle_v8i16_08196e7f -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3] -; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_08196e7f: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_08196e7f: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 6, i32 14, i32 7, i32 15> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d6879 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,0,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3] -; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_0c1d6879: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,0,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0c1d6879: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 6, i32 8, i32 7, i32 9> ret <8 x i16> %shuffle } define <8 x i16> 
@shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_109832ba -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm0[2,0,3,1,4,5,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,0,3,1,4,5,6,7] -; CHECK-SSE2-NEXT: punpcklqdq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_109832ba: +; SSE: # BB#0: +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_109832ba: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_8091a2b3(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_8091a2b3 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklwd %xmm0, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_8091a2b3: +; SSE: # BB#0: +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_8091a2b3: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 
= xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_c4d5e6f7 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm2 = xmm0[2,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3] -; CHECK-SSE2-NEXT: punpcklwd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_c4d5e6f7: +; SSE: # BB#0: +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_c4d5e6f7: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_0213cedf -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,1,3,4,5,6,7] -; CHECK-SSE2-NEXT: punpcklqdq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_0213cedf: +; SSE: # BB#0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0213cedf: +; AVX: # BB#0: +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm1[0,2,1,3,4,5,6,7] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15> ret <8 x i16> %shuffle } +define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_443aXXXX: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_443aXXXX: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_443aXXXX: +; SSE41: # BB#0: +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_443aXXXX: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 
4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %shuffle +} + define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_032dXXXX -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3] -; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_032dXXXX: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_032dXXXX: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_032dXXXX: +; SSE41: # BB#0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_032dXXXX: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %shuffle } -define <8 x i16> @shuffle_v8i16_XXXcXXXX(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXcXXXX -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,1,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,2,1,4,5,6,7] -; CHECK-SSE2-NEXT: retq +define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: shuffle_v8i16_XXXdXXXX: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_XXXdXXXX: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_012dXXXX -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3] -; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_012dXXXX: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: 
shuffle_v8i16_012dXXXX: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_012dXXXX: +; SSE41: # BB#0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_012dXXXX: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXXcde3 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1] -; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,2] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_XXXXcde3: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_XXXXcde3: +; SSSE3: # BB#0: +; 
SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_XXXXcde3: +; SSE41: # BB#0: +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v8i16_XXXXcde3: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i16_XXXXcde3: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15] +; AVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_cde3XXXX -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1] -; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_cde3XXXX: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: punpckhwd 
{{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_cde3XXXX: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_cde3XXXX: +; SSE41: # BB#0: +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v8i16_cde3XXXX: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i16_cde3XXXX: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; AVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) { -; CHECK-SSE2-LABEL: @shuffle_v8i16_012dcde3 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm2 = xmm0[0,1,2,1] -; CHECK-SSE2-NEXT: pshufd 
{{.*}} # xmm3 = xmm1[2,1,2,3] -; CHECK-SSE2-NEXT: punpckhwd %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,7,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpcklwd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7] -; CHECK-SSE2-NEXT: punpcklqdq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_012dcde3: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_012dcde3: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; 
SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_012dcde3: +; SSE41: # BB#0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v8i16_012dcde3: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i16_012dcde3: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, 
i32 13, i32 14, i32 3> ret <8 x i16> %shuffle } + +define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_XXX1X579: +; SSE2: # BB#0: +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_XXX1X579: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_XXX1X579: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15] +; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_XXX1X579: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) { +; 
SSE2-LABEL: shuffle_v8i16_XX4X8acX: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_XX4X8acX: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_XX4X8acX: +; SSE41: # BB#0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_XX4X8acX: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef> + ret <8 x i16> 
%shuffle +} + +define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) { +; SSE-LABEL: shuffle_v8i16_8zzzzzzz: +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_8zzzzzzz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { +; SSE-LABEL: shuffle_v8i16_z8zzzzzz: +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_z8zzzzzz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { +; SSE-LABEL: shuffle_v8i16_zzzzz8zz: +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zzzzz8zz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0> + ret <8 x i16> %shuffle 
+} + +define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { +; SSE-LABEL: shuffle_v8i16_zuuzuuz8: +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zuuzuuz8: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) { +; SSE-LABEL: shuffle_v8i16_zzBzzzzz: +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zzBzzzzz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 3 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_def01234(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_def01234: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_def01234: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = 
xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_def01234: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_def01234: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_ueuu123u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_ueuu123u: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_ueuu123u: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_ueuu123u: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_ueuu123u: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_56701234(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_56701234: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por 
%xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_56701234: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_56701234: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_56701234: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_u6uu123u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_u6uu123u: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_u6uu123u: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_u6uu123u: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u6uu123u: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_uuuu123u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_uuuu123u: +; SSE2: # BB#0: +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: 
shuffle_v8i16_uuuu123u: +; SSSE3: # BB#0: +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_uuuu123u: +; SSE41: # BB#0: +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_uuuu123u: +; AVX: # BB#0: +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_bcdef012(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_bcdef012: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_bcdef012: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_bcdef012: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_bcdef012: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_ucdeuu1u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_ucdeuu1u: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_ucdeuu1u: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_ucdeuu1u: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_ucdeuu1u: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 1, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_34567012(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_34567012: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_34567012: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_34567012: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_34567012: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_u456uu1u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_u456uu1u: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa 
%xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_u456uu1u: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_u456uu1u: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u456uu1u: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 1, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_u456uuuu(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_u456uuuu: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_u456uuuu: +; SSSE3: # BB#0: +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_u456uuuu: +; SSE41: # BB#0: +; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u456uuuu: +; AVX: # BB#0: +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_3456789a(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_3456789a: +; SSE2: # 
BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_3456789a: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_3456789a: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_3456789a: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_u456uu9u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_u456uu9u: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_u456uu9u: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_u456uu9u: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u456uu9u: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 
x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 9, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_56789abc(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_56789abc: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_56789abc: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_56789abc: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_56789abc: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_u6uu9abu(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_u6uu9abu: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_u6uu9abu: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_u6uu9abu: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; 
SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u6uu9abu: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_0uuu1uuu(<8 x i16> %a) { +; SSE2-LABEL: shuffle_v8i16_0uuu1uuu: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0uuu1uuu: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0uuu1uuu: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxwq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0uuu1uuu: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxwq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) { +; SSE2-LABEL: shuffle_v8i16_0zzz1zzz: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0zzz1zzz: +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0zzz1zzz: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxwq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0zzz1zzz: 
+; AVX: # BB#0: +; AVX-NEXT: vpmovzxwq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_0u1u2u3u(<8 x i16> %a) { +; SSE2-LABEL: shuffle_v8i16_0u1u2u3u: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0u1u2u3u: +; SSSE3: # BB#0: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0u1u2u3u: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxwd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0u1u2u3u: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxwd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) { +; SSE2-LABEL: shuffle_v8i16_0z1z2z3z: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0z1z2z3z: +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0z1z2z3z: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxwd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0z1z2z3z: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxwd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15> + ret <8 x i16> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll new file mode 100644 index 
0000000..4db0280 --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -0,0 +1,1267 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> 
%b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> + ret <16 x 
i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; 
AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm1 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 
= ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,4,5,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, 
i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,6,7,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshuflw 
{{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,10,11,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 
0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: 
vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; AVX1: # BB#0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 
= xmm0[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: +; AVX1: # BB#0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX1: # BB#0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} 
xmm0 = xmm0[0,0,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: +; AVX1: # BB#0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,7,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; 
AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 
0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31: +; AVX1: # BB#0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31: 
+; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15: +; AVX1: # BB#0: +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, 
i32 11, i32 12, i32 13, i32 14, i32 31> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, 
i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31: +; AVX1: # BB#0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31> + ret <16 x i16> %shuffle +} + +define <16 x i16> 
@shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: +; AVX2: # BB#0: +; AVX2-NEXT: 
vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,0,1,u,u,0,1,u,u,0,1,u,u,16,17,u,u,16,17,u,u,16,17,u,u,16,17] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 24, i32 24, i32 24, i32 24, i32 12, i32 13, i32 14, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: 
shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,14,15,12,13,10,11,8,9,u,u,u,u,u,u,u,u,30,31,28,29,26,27,24,25] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 27, i32 26, i32 25, i32 24, i32 15, i32 14, i32 13, i32 12> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} 
xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 3, i32 2, i32 1, i32 0, i32 27, i32 26, i32 25, i32 24, i32 11, i32 10, i32 9, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; 
AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 13, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 14, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27: +; AVX2: # BB#0: +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; 
AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 9, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 10, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x 
i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 13, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 
8, i32 8, i32 14, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 14, i32 14, i32 12, i32 12, i32 10, i32 10, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x 
i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 14, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15(<16 x 
i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 2, i32 4, i32 4, i32 undef, i32 6, i32 14, i32 14, i32 undef, i32 12, i32 10, i32 10, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: 
shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 12, i32 12> + ret <16 x i16> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll new file mode 100644 index 0000000..79c906b --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -0,0 +1,1562 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX1: # BB#0: +; 
AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, 
i32 3, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; 
AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> 
@shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, 
%ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] +; 
AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> 
@shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: movl $15, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb 
%xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + 
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: 
vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2],zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,2,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, 
%ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,3,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,4,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; 
AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,5,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, 
i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[6],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,6,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) 
{ +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[7],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,7,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,zero,zero,zero,zero,zero,xmm2[8],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,8,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[9],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,9,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,10,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: 
vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,11,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, 
i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,12,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: 
shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,13,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,xmm2[14],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: movl $128, %eax +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vinserti128 $0, %xmm2, %ymm3, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, 
i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15,23,23,23,23,23,23,23,23,31,31,31,31,31,31,31,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28(<32 x i8> %a, <32 x i8> %b) { +; 
AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15,19,19,19,19,23,23,23,23,27,27,27,27,31,31,31,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, 
i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15, i32 19, i32 19, i32 19, i32 19, i32 23, i32 23, i32 23, i32 23, i32 27, i32 27, i32 27, i32 27, i32 31, i32 31, i32 31, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; 
AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15,17,17,19,19,21,21,23,23,25,25,27,27,29,29,31,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15, i32 17, i32 17, i32 19, i32 19, i32 21, i32 21, i32 23, i32 23, i32 25, i32 25, i32 27, i32 27, i32 29, i32 29, i32 31, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> 
@shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, 
i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: movl $15, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: 
vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32: +; AVX1: # BB#0: +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb 
%xmm0, %xmm0 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,8,9,10,11,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,15,14,13,12,11,10,9,8] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u,31,30,29,28,27,26,25,24] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, 
i32 9, i32 8, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,7,6,5,4,3,2,1,0] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> 
@shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,18,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 18, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: 
vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,30,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 30, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, 
i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: movl $15, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 31, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, 
%ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX2: # BB#0: +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX2: # BB#0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: 
vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, 
i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,17,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 17, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,18,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 18, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,30,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 
0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31: +; AVX1: # BB#0: +; AVX1-NEXT: movl $15, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; 
AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,28,28,28,28,24,24,24,24,20,20,20,20,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 28, i32 28, i32 28, i32 28, i32 24, i32 24, i32 24, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) 
{ +; AVX1-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,u,u,u,u,u,0,0,0,0,0,14,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,u,u,u,u,u,16,16,16,16,16,30,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,14,1,1,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,u,0,u,u,u,u,0,0,0,0,0,0,14,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,14,u,u,0,0,0,0,0,0,0,0,0,0,0,0,16,16,u,16,u,u,u,u,16,16,16,16,16,16,30,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 
0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 undef, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,u,u,u,4,u,8,8,8,8,u,u,12,u,28,28,28,28,u,u,u,24,20,20,20,20,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 undef, i32 28, i32 28, i32 28, i32 28, i32 undef, i32 undef, i32 undef, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24: +; AVX1: # BB#0: +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; 
AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,8,8,9,9,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,u,u,u,u,u,u,u,u,16,16,16,u,u,u,u,u,u,u,24,24,24,24,24,24] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,3,u,3,u,u,u,u,u,u,u],zero,xmm3[u,u,u,u] +; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1],zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7] +; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2],zero,xmm5[4,5,6,7,8,9,10],zero,xmm5[12,13,14,15] +; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[u,u],zero,zero,xmm3[u,u,u,u,1,6,13,u,u],zero,xmm3[u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u] +; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,3],zero,zero,zero,zero,xmm0[8,9,10],zero,zero,xmm0[13],zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero +; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[4,5,6,7],zero,zero,zero,xmm1[11,12],zero,xmm1[14,15] +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,255,255,0,255,u,u,u,255,255,u,0,0,u,u,255,u,255,255,0,0,255,0,255,u,0,0,0,0> +; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = 
shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39> + ret <32 x i8> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll new file mode 100644 index 0000000..0bd1bd9 --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -0,0 +1,748 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_0000: +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0000: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_0001: +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0001: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) { 
+; AVX1-LABEL: shuffle_v4f64_0020: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0020: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_0300: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0300: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_1000: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_1000: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_2200: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_2200: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd 
{{.*#+}} ymm0 = ymm0[2,2,0,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_3330: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_3330: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_3210: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_3210: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_0023: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_0022: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + ret <4 x double> %shuffle +} + 
+define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_1032: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_1133: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_1023: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_1022: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_0423: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0423: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: 
shuffle_v4f64_0462: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2] +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_0426: +; ALL: # BB#0: +; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_1537(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_1537: +; ALL: # BB#0: +; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_4062(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_4062: +; ALL: # BB#0: +; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_5173(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_5173: +; ALL: # BB#0: +; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 7, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_5163: +; ALL: # BB#0: +; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x 
double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_0527: +; ALL: # BB#0: +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_4163: +; ALL: # BB#0: +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_0145: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_4501: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_0167: +; ALL: # BB#0: +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> + ret <4 x double> %shuffle +} + +define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_0000: +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, 
%ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_0000: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_0001: +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_0001: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_0020: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_0020: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_0112: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_0112: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_0300: +; AVX1: # 
BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_0300: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_1000: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_1000: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_2200: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_2200: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_3330: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_3330: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 
x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_3210: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_3210: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_0124: +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_0124: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_0142: +; AVX1: # BB#0: +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_0142: +; AVX2: # BB#0: +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_0412: +; AVX1: # BB#0: +; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_0412: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_4012: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_4012: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,2] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) { +; ALL-LABEL: shuffle_v4i64_0145: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_0451: +; AVX1: # BB#0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v4i64_0451: +; AVX2: # BB#0: +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) { +; ALL-LABEL: shuffle_v4i64_4501: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_4015: +; AVX1: # BB#0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_4015: +; AVX2: # BB#0: +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5> + ret <4 x i64> %shuffle +} + +define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_2u35: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_2u35: +; AVX2: # BB#0: +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5> + ret <4 x i64> %shuffle +} + 
+define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: shuffle_v4i64_1251: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[2],ymm2[3] +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_1251: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1> + ret <4 x i64> %shuffle +} + +define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: stress_test1: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm0[1,0,3,2] +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: stress_test1: +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm1[3,1,1,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,1,3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-NEXT: retq + %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0> + %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> <i32 3, i32 undef, i32 2, i32 undef> + %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 undef> + %f = shufflevector <4 x i64> %d, <4 x i64> %e, <4 x i32> <i32 5, i32 1, i32 1, i32 0> + + ret <4 x i64> %f +} + +define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) { +; AVX1-LABEL: insert_reg_and_zero_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: 
vmovq %rdi, %xmm0 +; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_reg_and_zero_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovq %rdi, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: retq + %v = insertelement <4 x i64> undef, i64 %a, i64 0 + %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x i64> %shuffle +} + +define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) { +; AVX1-LABEL: insert_mem_and_zero_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovq (%rdi), %xmm0 +; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_mem_and_zero_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovq (%rdi), %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: retq + %a = load i64* %ptr + %v = insertelement <4 x i64> undef, i64 %a, i64 0 + %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x i64> %shuffle +} + +define <4 x double> @insert_reg_and_zero_v4f64(double %a) { +; ALL-LABEL: insert_reg_and_zero_v4f64: +; ALL: # BB#0: +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm0 +; ALL-NEXT: retq + %v = insertelement <4 x double> undef, double %a, i32 0 + %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x double> %shuffle +} + +define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) { +; ALL-LABEL: insert_mem_and_zero_v4f64: +; ALL: # BB#0: +; ALL-NEXT: vmovsd (%rdi), %xmm0 +; ALL-NEXT: retq + %a = load double* %ptr + %v = insertelement <4 x double> undef, double %a, i32 0 + %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, 
<4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x double> %shuffle +} + +define <4 x double> @splat_mem_v4f64(double* %ptr) { +; ALL-LABEL: splat_mem_v4f64: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 +; ALL-NEXT: retq + %a = load double* %ptr + %v = insertelement <4 x double> undef, double %a, i32 0 + %shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0> + ret <4 x double> %shuffle +} + +define <4 x i64> @splat_mem_v4i64(i64* %ptr) { +; AVX1-LABEL: splat_mem_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovddup (%rdi), %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat_mem_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX2-NEXT: retq + %a = load i64* %ptr + %v = insertelement <4 x i64> undef, i64 %a, i64 0 + %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0> + ret <4 x i64> %shuffle +} + +define <4 x double> @splat_mem_v4f64_2(double* %p) { +; ALL-LABEL: splat_mem_v4f64_2: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 +; ALL-NEXT: retq + %1 = load double* %p + %2 = insertelement <2 x double> undef, double %1, i32 0 + %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer + ret <4 x double> %3 +} + +define <4 x double> @splat_v4f64(<2 x double> %r) { +; AVX1-LABEL: splat_v4f64: +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat_v4f64: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: retq + %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer + ret <4 x double> %1 +} diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll new file mode 100644 index 0000000..ded8232 --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -0,0 +1,1931 @@ +; RUN: llc < %s 
-mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_00000000: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_00000000: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_00000010: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_00000010: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_00000200: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_00000200: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector 
<8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_00003000: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_00003000: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_00040000: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_00040000: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_00500000: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_00500000: +; AVX2: # BB#0: +; AVX2-NEXT: 
vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_06000000: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_06000000: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_70000000: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_70000000: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: movl $7, %eax +; AVX2-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vinserti128 $0, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: 
shuffle_v8f32_01014545: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_00112233: +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_00112233: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_00001111: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_00001111: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_81a3c5e7: +; ALL: # BB#0: +; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7> + ret <8 x float> %shuffle +} + +define <8 x float> 
@shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_08080808: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_08080808: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastss %xmm1, %ymm1 +; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_08084c4c: +; ALL: # BB#0: +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_8823cc67: +; ALL: # BB#0: +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_9832dc76: +; ALL: # BB#0: +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 
8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_9810dc54: +; ALL: # BB#0: +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_08194c5d(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_08194c5d: +; ALL: # BB#0: +; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_2a3b6e7f: +; ALL: # BB#0: +; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_08192a3b: +; AVX1: # BB#0: +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_08192a3b: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3> +; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u> +; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] 
+; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_08991abb: +; AVX1: # BB#0: +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_08991abb: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3> +; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_091b2d3f: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_091b2d3f: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u> +; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15> + ret <8 x float> %shuffle +} + +define <8 x 
float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_09ab1def: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_09ab1def: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_00014445: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_00204464: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_03004744: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_10005444: +; ALL: # BB#0: +; ALL-NEXT: 
vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_22006644: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_33307774: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_32107654: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_00234467: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_00224466: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 
2, i32 2, i32 4, i32 4, i32 6, i32 6> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_10325476: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_11335577: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_10235467: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_10225466: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_00015444: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) { +; 
ALL-LABEL: shuffle_v8f32_00204644: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_03004474: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_10004444: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_22006446: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_33307474: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_32104567: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7] +; ALL-NEXT: retq + %shuffle = 
shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_00236744: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_00226644: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_10324567: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_11334567: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_01235467: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> + ret <8 x float> %shuffle +} + +define 
<8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_01235466: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_002u6u44: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_00uu66uu: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_103245uu: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_1133uu67: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_0uu354uu: +; 
ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) { +; ALL-LABEL: shuffle_v8f32_uuu3uu66: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_c348cda0: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[0,0],ymm0[4,7],ymm2[4,4] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_c348cda0: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,3,4,u,u,u,u,0> +; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> +; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_f511235a: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm0[0,1,1,1,4,5,5,5] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_f511235a: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2> +; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,5,1,1,2,3,5,u> +; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_32103210: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_32103210: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_76547654: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_76547654: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 
4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_76543210: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_76543210: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_3210ba98: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_3210ba98: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0> +; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_3210fedc: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_3210fedc: +; AVX2: # BB#0: +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2,3] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_7654fedc: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_7654fedc: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u> +; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_fedc7654: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_fedc7654: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u> +; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) { +; 
AVX1-LABEL: shuffle_v8f32_ba987654: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_ba987654: +; AVX2: # BB#0: +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4> + ret <8 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) { +; AVX1-LABEL: shuffle_v8f32_ba983210: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_ba983210: +; AVX2: # BB#0: +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4> + ret <8 x float> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00000000: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00000000: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i32> %shuffle +} + +define 
<8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00000010: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00000010: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00000200: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00000200: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00003000: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00003000: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00040000: 
+; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00040000: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00500000: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00500000: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_06000000: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_06000000: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 
6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_70000000: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_70000000: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: movl $7, %eax +; AVX2-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vinserti128 $0, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_01014545: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_01014545: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00112233: +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00112233: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, 
i32 1, i32 1, i32 2, i32 2, i32 3, i32 3> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00001111: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00001111: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_81a3c5e7(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_81a3c5e7: +; AVX1: # BB#0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_81a3c5e7: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_08080808: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_08080808: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: 
retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_08084c4c: +; AVX1: # BB#0: +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_08084c4c: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_8823cc67(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_8823cc67: +; AVX1: # BB#0: +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_8823cc67: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_9832dc76(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_9832dc76: +; AVX1: # BB#0: +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_9832dc76: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + 
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_9810dc54(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_9810dc54: +; AVX1: # BB#0: +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_9810dc54: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,0,4,5,5,4] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_08194c5d(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_08194c5d: +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_08194c5d: +; AVX2: # BB#0: +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_2a3b6e7f(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_2a3b6e7f: +; AVX1: # BB#0: +; AVX1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_2a3b6e7f: +; AVX2: # BB#0: +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> + ret <8 x i32> %shuffle +} + +define <8 x 
i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_08192a3b: +; AVX1: # BB#0: +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_08192a3b: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3> +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_08991abb: +; AVX1: # BB#0: +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_08991abb: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3> +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_091b2d3f: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] +; AVX1-NEXT: vpermilps 
{{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_091b2d3f: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_09ab1def: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_09ab1def: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00014445(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00014445: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00014445: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00204464(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00204464: +; AVX1: # 
BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00204464: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_03004744(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_03004744: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_03004744: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_10005444(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_10005444: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_10005444: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_22006644(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_22006644: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_22006644: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_33307774(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_33307774: +; AVX1: # 
BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_33307774: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_32107654(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_32107654: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_32107654: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00234467: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00234467: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00224466(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00224466: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00224466: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_10325476: +; AVX1: # 
BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_10325476: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_11335577(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_11335577: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_11335577: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_10235467(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_10235467: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_10235467: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_10225466: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_10225466: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00015444: +; AVX1: # 
BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00015444: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00204644: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00204644: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_03004474: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_03004474: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_10004444: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_10004444: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> + ret 
<8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_22006446: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_22006446: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_33307474: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_33307474: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_32104567: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_32104567: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00236744: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00236744: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; 
AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00226644: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00226644: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_10324567: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_10324567: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_11334567: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_11334567: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_01235467: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7] +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: shuffle_v8i32_01235467: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_01235466: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_01235466: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_002u6u44: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_002u6u44: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_00uu66uu: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_00uu66uu: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_103245uu(<8 x 
i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_103245uu: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_103245uu: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_1133uu67: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_1133uu67: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_0uu354uu: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_0uu354uu: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_uuu3uu66: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_uuu3uu66: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6> +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = 
shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_6caa87e5: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,2],ymm2[4,4],ymm1[6,6] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_6caa87e5: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,4,2,2,0,u,6,u> +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,3,2] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_32103210: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_32103210: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_76547654: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; 
AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_76547654: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_76543210: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_76543210: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_3210ba98: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_3210ba98: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0> +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_3210fedc: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_3210fedc: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_7654fedc: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_7654fedc: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_fedc7654: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_fedc7654: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u> +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_ba987654: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_ba987654: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4> + ret <8 x i32> %shuffle +} + +; Mask "ba983210": elements 11,10,9,8 (low lane of %b, reversed) followed by 3,2,1,0 (low lane of %a, reversed). +; NOTE(review): the expected instructions below were derived by mirroring shuffle_v8i32_3210ba98 and were not produced by running llc; regenerate them to confirm. +define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_ba983210: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_ba983210: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0> + ret <8 x i32> %shuffle +} + +define <8 x float> @splat_mem_v8f32_2(float* %p) { +; ALL-LABEL: splat_mem_v8f32_2: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastss (%rdi), %ymm0 +; ALL-NEXT: retq + %1 = load float* %p + %2 = insertelement <4 x float> undef, float %1, i32 0 + %3 = shufflevector <4
x float> %2, <4 x float> undef, <8 x i32> zeroinitializer + ret <8 x float> %3 +} + +define <8 x float> @splat_v8f32(<4 x float> %r) { +; AVX1-LABEL: splat_v8f32: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat_v8f32: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2-NEXT: retq + %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer + ret <8 x float> %1 +} diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll new file mode 100644 index 0000000..8f87c7c --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -0,0 +1,1429 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW + +target triple = "x86_64-unknown-unknown" + +define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00000000: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00000010: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x 
double> %b) { +; ALL-LABEL: shuffle_v8f64_00000200: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00003000: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00040000: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00500000: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_06000000(<8 x 
double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_06000000: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_70000000: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3] +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_01014545: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00112233: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, 
i32 3> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00001111: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_81a3c5e7: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_08080808: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_08084c4c: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vbroadcastsd %xmm3, %ymm3 +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3] +; 
ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_8823cc67: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vbroadcastsd %xmm3, %ymm3 +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_9832dc76: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2] +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_9810dc54: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; 
ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,0] +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0] +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_08194c5d: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_2a3b6e7f: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, 
%zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_08192a3b: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,2,2,3] +; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_08991abb: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm0[1,0,2,2] +; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,2,3,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3] +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_091b2d3f: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vinsertf64x4 $1, 
%ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_09ab1def: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00014445: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00204464: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_03004744: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0] +; 
ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_10005444: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_22006644: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_33307774: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_32107654: +; ALL: # BB#0: +; ALL-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00234467: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00224466: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_10325476: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: 
shuffle_v8f64_11335577: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_10235467: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_10225466: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00015444: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> 
@shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00204644: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_03004474: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_10004444: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_22006446: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6> + ret <8 x double> 
%shuffle +} + +define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_33307474: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,3,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_32104567: +; ALL: # BB#0: +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00236744: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00226644: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4> + ret <8 x double> 
%shuffle +} + +define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_10324567: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_11334567: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_01235467: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_01235466: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_002u6u44: +; ALL: # BB#0: +; 
ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_00uu66uu: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_103245uu: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_1133uu67: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_0uu354uu: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, 
%zmm0, %ymm1 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_uuu3uu66: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_c348cda0: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[0,1],ymm2[0,1] +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vbroadcastsd %xmm1, %ymm4 +; ALL-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3] +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3] +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_f511235a: +; ALL: # BB#0: +; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[0,1,1,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3] +; ALL-NEXT: 
vpermilpd {{.*#+}} ymm4 = ymm1[0,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10> + ret <8 x double> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00000000: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00000010: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00000200: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00003000: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} 
ymm0 = ymm0[3,0,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00040000: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00500000: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_06000000: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_70000000: +; ALL: # 
BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3] +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_01014545: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00112233: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00001111: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_81a3c5e7: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vextracti64x4 $1, 
%zmm1, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_08080808: +; ALL: # BB#0: +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_08084c4c: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpbroadcastq %xmm3, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_8823cc67: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpbroadcastq %xmm3, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; 
ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_9832dc76: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_9810dc54: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,0] +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,0] +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: 
shuffle_v8i64_08194c5d: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_2a3b6e7f: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_08192a3b: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,2,3] +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: 
vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_08991abb: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_091b2d3f: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_09ab1def: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, 
i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00014445: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00204464: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_03004744: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10005444: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, 
i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_22006644: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_33307774: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_32107654: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00234467: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7> + ret <8 x i64> %shuffle 
+} + +define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00224466: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10325476: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_11335577: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10235467: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> + ret <8 x 
i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10225466: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,2] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,2] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00015444: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00204644: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_03004474: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,3,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> 
@shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10004444: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_22006446: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,0,2] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_33307474: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,3,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_32104567: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00236744: +; ALL: 
# BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00226644: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10324567: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_11334567: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_01235467: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,3] +; 
ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_01235466: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,2] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_002u6u44: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00uu66uu: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_103245uu: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 
x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_1133uu67: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_0uu354uu: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_uuu3uu66: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_6caa87e5: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1,0,1] +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; ALL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; ALL-NEXT: vpshufd 
{{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] +; ALL-NEXT: vpbroadcastq %xmm3, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5> + ret <8 x i64> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll index e60ecb7..22a6749 100644 --- a/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1,6 +1,14 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2 +; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; +; Verify that the DAG combiner correctly folds bitwise operations across +; shuffles, nested shuffles with undef, pairs of nested shuffles, and other +; basic and always-safe patterns. Also test that the DAG combiner will combine +; target-specific shuffle instructions where reasonable. 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) @@ -8,57 +16,72 @@ declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) define <4 x i32> @combine_pshufd1(<4 x i32> %a) { -; CHECK-SSE2-LABEL: @combine_pshufd1 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: retq - %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) - %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) +; ALL-LABEL: combine_pshufd1: +; ALL: # BB#0: # %entry +; ALL-NEXT: retq +entry: + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) + %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) ret <4 x i32> %c } define <4 x i32> @combine_pshufd2(<4 x i32> %a) { -; CHECK-SSE2-LABEL: @combine_pshufd2 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: retq - %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) +; ALL-LABEL: combine_pshufd2: +; ALL: # BB#0: # %entry +; ALL-NEXT: retq +entry: + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) %b.cast = bitcast <4 x i32> %b to <8 x i16> %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28) %c.cast = bitcast <8 x i16> %c to <4 x i32> - %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) + %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) ret <4 x i32> %d } define <4 x i32> @combine_pshufd3(<4 x i32> %a) { -; CHECK-SSE2-LABEL: @combine_pshufd3 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: retq - %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) +; ALL-LABEL: combine_pshufd3: +; ALL: # BB#0: # %entry +; ALL-NEXT: retq +entry: + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) %b.cast = bitcast <4 x i32> %b to <8 x i16> %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28) %c.cast = bitcast <8 x i16> %c to <4 x i32> - %d = call <4 x i32> 
@llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) + %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) ret <4 x i32> %d } define <4 x i32> @combine_pshufd4(<4 x i32> %a) { -; CHECK-SSE2-LABEL: @combine_pshufd4 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: retq - %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) +; SSE-LABEL: combine_pshufd4: +; SSE: # BB#0: # %entry +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_pshufd4: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; AVX-NEXT: retq +entry: + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) %b.cast = bitcast <4 x i32> %b to <8 x i16> %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27) %c.cast = bitcast <8 x i16> %c to <4 x i32> - %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) + %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) ret <4 x i32> %d } define <4 x i32> @combine_pshufd5(<4 x i32> %a) { -; CHECK-SSE2-LABEL: @combine_pshufd5 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: retq - %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) +; SSE-LABEL: combine_pshufd5: +; SSE: # BB#0: # %entry +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_pshufd5: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX-NEXT: retq +entry: + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) %b.cast = bitcast <4 x i32> %b to <8 x i16> %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27) %c.cast = bitcast <8 x i16> %c to <4 x i32> @@ -67,53 +90,2458 @@ define <4 x i32> @combine_pshufd5(<4 x i32> %a) { } define <4 x i32> @combine_pshufd6(<4 x i32> %a) { -; CHECK-SSE2-LABEL: 
@combine_pshufd6 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd $0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: combine_pshufd6: +; SSE: # BB#0: # %entry +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_pshufd6: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: retq +entry: %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0) %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8) ret <4 x i32> %c } define <8 x i16> @combine_pshuflw1(<8 x i16> %a) { -; CHECK-SSE2-LABEL: @combine_pshuflw1 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: retq - %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) - %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) +; ALL-LABEL: combine_pshuflw1: +; ALL: # BB#0: # %entry +; ALL-NEXT: retq +entry: + %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) + %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) ret <8 x i16> %c } define <8 x i16> @combine_pshuflw2(<8 x i16> %a) { -; CHECK-SSE2-LABEL: @combine_pshuflw2 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: retq +; ALL-LABEL: combine_pshuflw2: +; ALL: # BB#0: # %entry +; ALL-NEXT: retq +entry: %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) - %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) - %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) + %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) + %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) ret <8 x i16> %d } define <8 x i16> @combine_pshuflw3(<8 x i16> %a) { -; CHECK-SSE2-LABEL: @combine_pshuflw3 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: combine_pshuflw3: +; SSE: # BB#0: # %entry +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_pshuflw3: +; AVX: # BB#0: # %entry +; AVX-NEXT: 
vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; AVX-NEXT: retq +entry: %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) - %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) - %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) + %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) + %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) ret <8 x i16> %d } define <8 x i16> @combine_pshufhw1(<8 x i16> %a) { -; CHECK-SSE2-LABEL: @combine_pshufhw1 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: combine_pshufhw1: +; SSE: # BB#0: # %entry +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_pshufhw1: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX-NEXT: retq +entry: %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) - %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) - %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) + %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) + %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) ret <8 x i16> %d } +define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test1: +; SSE: # BB#0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test1: +; AVX: # BB#0: +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x 
i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test2: +; SSE: # BB#0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test2: +; AVX: # BB#0: +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test3: +; SSE: # BB#0: +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test3: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + +define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test4: +; SSE: # BB#0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test4: +; AVX: # BB#0: +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: 
combine_bitwise_ops_test5: +; SSE: # BB#0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test5: +; AVX: # BB#0: +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test6: +; SSE: # BB#0: +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test6: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + + +; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles +; are not performing a swizzle operations. 
+ +define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test1b: +; SSE2: # BB#0: +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test1b: +; SSSE3: # BB#0: +; SSSE3-NEXT: andps %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test1b: +; SSE41: # BB#0: +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test1b: +; AVX1: # BB#0: +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test1b: +; AVX2: # BB#0: +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test2b: +; SSE2: # BB#0: +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test2b: +; SSSE3: # BB#0: +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test2b: +; SSE41: # BB#0: +; SSE41-NEXT: por 
%xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test2b: +; AVX1: # BB#0: +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test2b: +; AVX2: # BB#0: +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test3b: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test3b: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm0 +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test3b: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test3b: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test3b: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} 
xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + +define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test4b: +; SSE2: # BB#0: +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test4b: +; SSSE3: # BB#0: +; SSSE3-NEXT: andps %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test4b: +; SSE41: # BB#0: +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test4b: +; AVX1: # BB#0: +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test4b: +; AVX2: # BB#0: +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test5b: +; SSE2: # BB#0: +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; 
SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test5b: +; SSSE3: # BB#0: +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test5b: +; SSE41: # BB#0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test5b: +; AVX1: # BB#0: +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test5b: +; AVX2: # BB#0: +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test6b: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test6b: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm0 +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test6b: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, 
%xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test6b: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test6b: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + +define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test1c: +; SSE: # BB#0: +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test1c: +; AVX: # BB#0: +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test2c: +; SSE: # BB#0: +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test2c: +; AVX: # BB#0: +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, 
i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test3c: +; SSE: # BB#0: +; SSE-NEXT: xorps %xmm1, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test3c: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + +define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test4c: +; SSE: # BB#0: +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test4c: +; AVX: # BB#0: +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test5c: +; SSE: # BB#0: +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test5c: +; AVX: # BB#0: +; AVX-NEXT: vorps 
%xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test6c: +; SSE: # BB#0: +; SSE-NEXT: xorps %xmm1, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test6c: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + +define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test1: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test1: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test2: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test2: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: retq + %1 = 
shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test3: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test3: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test4: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_nested_undef_test4: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test4: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test5: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test5: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> 
%B) { +; SSE-LABEL: combine_nested_undef_test6: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test6: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test7: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test7: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test8: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test8: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test9: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test9: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> 
undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test10: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test10: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test11: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test11: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test12: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_nested_undef_test12: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test12: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4> + ret <4 x i32> %2 +} + +; The following pair of shuffles is folded into vector %A. 
+define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) { +; ALL-LABEL: combine_nested_undef_test13: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4> + ret <4 x i32> %2 +} + +; The following pair of shuffles is folded into vector %B. +define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test14: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test14: +; AVX: # BB#0: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4> + ret <4 x i32> %2 +} + + +; Verify that we don't optimize the following cases. We expect more than one shuffle. +; +; FIXME: Many of these already don't make sense, and the rest should stop +; making sense with the new vector shuffle lowering. Revisit at least testing for +; it. 
+ +define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test15: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test15: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[3,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) { +; SSE2-LABEL: combine_nested_undef_test16: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_nested_undef_test16: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_nested_undef_test16: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_nested_undef_test16: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test16: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> 
%A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test17: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test17: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test18: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test18: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test19: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test19: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] +; AVX-NEXT: vpshufd {{.*#+}} 
xmm0 = xmm0[2,0,0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test20: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test20: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test21: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test21: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + + +; Test that we correctly combine shuffles according to rule +; shuffle(shuffle(x, y), undef) -> shuffle(y, undef) + +define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test22: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: 
combine_nested_undef_test22: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test23: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test23: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test24: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test24: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test25: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_nested_undef_test25: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test25: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x 
i32> <i32 3, i32 1, i32 3, i32 1> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test26: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test26: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test27: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_nested_undef_test27: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test27: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test28: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test28: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2> + ret <4 x i32> %2 +} + +define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test1: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: 
combine_test1: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test1: +; SSE41: # BB#0: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test1: +; AVX: # BB#0: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test2: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test3: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test3: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test4: +; SSE: # BB#0: +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 
+; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test4: +; AVX: # BB#0: +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test5: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test5: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test5: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test5: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x float> %2 +} + +define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test6: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test6: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test6: +; SSE41: # BB#0: +; SSE41-NEXT: movaps 
%xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test6: +; AVX: # BB#0: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test7: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test7: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test7: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test7: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test7: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test8: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test8: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test9: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; 
AVX-LABEL: combine_test9: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test10: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test10: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test10: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test10: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test10: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x i32> %2 +} + +define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) { +; ALL-LABEL: combine_test11: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x 
float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test12: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test12: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test12: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test12: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test13: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test13: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test14: +; SSE: # BB#0: +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test14: +; AVX: # BB#0: +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { 
+; SSE2-LABEL: combine_test15: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test15: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm0, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test15: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test15: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x float> %2 +} + +define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) { +; ALL-LABEL: combine_test16: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test17: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test17: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test17: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test17: +; AVX1: 
# BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test17: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test18: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test18: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test19: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test19: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test20: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test20: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm0, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSSE3-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test20: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test20: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test20: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x i32> %2 +} + + +; Check some negative cases. +; FIXME: Do any of these really make sense? Are they redundant with the above tests? + +define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test1b: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test1b: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test1b: +; SSE41: # BB#0: +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test1b: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0> + ret <4 x float> %2 +} + +define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test2b: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; 
SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2b: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] +; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2b: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2b: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test3b: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test3b: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm0[3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test4b: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test4b: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test4b: +; SSE41: # BB#0: +; SSE41-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test4b: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7> + ret <4 x float> %2 +} + + +; Verify that we correctly fold shuffles even when we use illegal vector types. + +define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test1c: +; SSE2: # BB#0: +; SSE2-NEXT: movd (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd (%rsi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movss %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test1c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd (%rsi), %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movss %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test1c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test1c: +; AVX1: # BB#0: 
+; AVX1-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test1c: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX2-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test2c: +; SSE2: # BB#0: +; SSE2-NEXT: movd (%rdi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movd (%rsi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movd (%rsi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm0 +; SSE41-NEXT: 
pmovzxbd (%rsi), %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2c: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test3c: +; SSE2: # BB#0: +; SSE2-NEXT: movd (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd (%rsi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test3c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd (%rsi), %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test3c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm0 +; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = 
xmm0[1],xmm1[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test3c: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test4c: +; SSE2: # BB#0: +; SSE2-NEXT: movd (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd (%rsi), %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test4c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd (%rsi), %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test4c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test4c: +; AVX1: # BB#0: +; AVX1-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test4c: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX2-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x i8> %2 +} + + +; The following test cases are generated from this C++ code +; +;__m128 blend_01(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<1 ); +; return s; +;} +; +;__m128 blend_02(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; return s; +;} +; +;__m128 blend_123(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<1 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; s = _mm_blend_ps( s, b, 1<<3 ); +; return s; +;} + +; Ideally, we should collapse the 
following shuffles into a single one. + +define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_blend_01: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_01: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_01: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_01: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x float> %shuffle6 +} + +define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_blend_02: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_02: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_02: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_02: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x float> %shuffle6 +} + +define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_blend_123: +; SSE2: # BB#0: +; SSE2-NEXT: movss 
%xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_123: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_123: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_123: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> + %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x float> %shuffle12 +} + +define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test_movhl_1: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_1: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test_movhl_2: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_2: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test_movhl_3(<4 x i32> 
%a, <4 x i32> %b) { +; SSE-LABEL: combine_test_movhl_3: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_3: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2> + ret <4 x i32> %2 +} + + +; Verify that we fold shuffles according to rule: +; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2) + +define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_undef_input_test1: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test1: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test1: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test1: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test2: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test2: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5> + ret <4 x float> %2 +} + +define <4 x float> 
@combine_undef_input_test3(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test3: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test3: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test4: +; SSE: # BB#0: +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test4: +; AVX: # BB#0: +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_undef_input_test5: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test5: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test5: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test5: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7> + ret <4 x float> %2 +} + + +; Verify 
that we fold shuffles according to rule: +; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) + +define <4 x float> @combine_undef_input_test6(<4 x float> %a) { +; ALL-LABEL: combine_undef_input_test6: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test7(<4 x float> %a) { +; SSE2-LABEL: combine_undef_input_test7: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test7: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test7: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test7: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test8(<4 x float> %a) { +; SSE2-LABEL: combine_undef_input_test8: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test8: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test8: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test8: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 
x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test9(<4 x float> %a) { +; SSE-LABEL: combine_undef_input_test9: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test9: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test10(<4 x float> %a) { +; ALL-LABEL: combine_undef_input_test10: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_undef_input_test11: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test11: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test11: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test11: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test12: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test12: 
+; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test13: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test13: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test14: +; SSE: # BB#0: +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test14: +; AVX: # BB#0: +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_undef_input_test15: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test15: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test15: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: 
combine_undef_input_test15: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> + ret <4 x float> %2 +} + + +; Verify that shuffles are canonicalized according to rules: +; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) +; +; This allows triggering the following combine rule: +; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) +; +; As a result, all the shuffle pairs in each function below should be +; combined into a single legal shuffle operation. + +define <4 x float> @combine_undef_input_test16(<4 x float> %a) { +; ALL-LABEL: combine_undef_input_test16: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test17(<4 x float> %a) { +; SSE2-LABEL: combine_undef_input_test17: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test17: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test17: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test17: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test18(<4 x float> %a) { +; SSE2-LABEL: combine_undef_input_test18: +; SSE2: # BB#0: +; 
SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test18: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test18: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test18: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test19(<4 x float> %a) { +; SSE-LABEL: combine_undef_input_test19: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test19: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test20(<4 x float> %a) { +; ALL-LABEL: combine_undef_input_test20: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> + ret <4 x float> %2 +} + +; These tests are designed to test the ability to combine away unnecessary +; operations feeding into a shuffle. The AVX cases are the important ones as +; they leverage operations which cannot be done naturally on the entire vector +; and thus are decomposed into multiple smaller operations. 
+ +define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) { +; SSE-LABEL: combine_unneeded_subvector1: +; SSE: # BB#0: +; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_unneeded_subvector1: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_unneeded_subvector1: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> + %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4> + ret <8 x i32> %c +} + +define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) { +; SSE-LABEL: combine_unneeded_subvector2: +; SSE: # BB#0: +; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_unneeded_subvector2: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_unneeded_subvector2: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u> +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq + %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> + %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12> + ret <8 x i32> %d +} + +define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) { +; SSE41-LABEL: combine_insertps1: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_insertps1: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] +; AVX-NEXT: retq + + %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4> + %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3> + ret <4 x float> %d +} + +define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) { +; SSE41-LABEL: combine_insertps2: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_insertps2: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] +; AVX-NEXT: retq + + %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7> + %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3> + ret <4 x float> %d +} + +define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) { +; SSE41-LABEL: combine_insertps3: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_insertps3: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX-NEXT: retq + + %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> + %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3> + ret <4 x float> %d +} + +define <4 x float> @combine_insertps4(<4 x float> %a, <4 x 
float> %b) { +; SSE41-LABEL: combine_insertps4: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_insertps4: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX-NEXT: retq + + %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> + %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5> + ret <4 x float> %d +} diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll new file mode 100644 index 0000000..226deb0 --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-sse1.ll @@ -0,0 +1,235 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=-sse2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=SSE1 + +target triple = "x86_64-unknown-unknown" + +define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_0001: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_0020: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_0300: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_1000: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 
= xmm0[1,0,0,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_2200: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_3330: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_3210: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_0011: +; SSE1: # BB#0: +; SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_2233: +; SSE1: # BB#0: +; SSE1-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_0022: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE1-NEXT: retq + %shuffle = 
shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_1133: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_4zzz: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: movss %xmm0, %xmm1 +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_z4zz: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_zz4z: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_zuu4: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x 
float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_zzz7: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_z6zz: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3> + ret <4 x float> %shuffle +} + +define <4 x float> @insert_reg_and_zero_v4f32(float %a) { +; SSE1-LABEL: insert_reg_and_zero_v4f32: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: movss %xmm0, %xmm1 +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %v = insertelement <4 x float> undef, float %a, i32 0 + %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) { +; SSE1-LABEL: insert_mem_and_zero_v4f32: +; SSE1: # BB#0: +; SSE1-NEXT: movss (%rdi), %xmm0 +; SSE1-NEXT: retq + %a = load float* %ptr + %v = insertelement <4 x float> undef, float %a, i32 0 + %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) { +; SSE1-LABEL: insert_mem_lo_v4f32: +; SSE1: # BB#0: +; SSE1-NEXT: movq (%rdi), %rax +; SSE1-NEXT: movl %eax, 
{{[-0-9]+}}(%rsp) +; SSE1-NEXT: shrq $32, %rax +; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; SSE1-NEXT: movss {{[-0-9]+}}(%rsp), %xmm1 +; SSE1-NEXT: movss {{[-0-9]+}}(%rsp), %xmm2 +; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE1-NEXT: xorps %xmm2, %xmm2 +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1] +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %a = load <2 x float>* %ptr + %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) { +; SSE1-LABEL: insert_mem_hi_v4f32: +; SSE1: # BB#0: +; SSE1-NEXT: movq (%rdi), %rax +; SSE1-NEXT: movl %eax, {{[-0-9]+}}(%rsp) +; SSE1-NEXT: shrq $32, %rax +; SSE1-NEXT: movl %eax, {{[-0-9]+}}(%rsp) +; SSE1-NEXT: movss {{[-0-9]+}}(%rsp), %xmm1 +; SSE1-NEXT: movss {{[-0-9]+}}(%rsp), %xmm2 +; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE1-NEXT: xorps %xmm2, %xmm2 +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1] +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; SSE1-NEXT: retq + %a = load <2 x float>* %ptr + %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) { +; SSE1-LABEL: shuffle_mem_v4f32_3210: +; SSE1: # BB#0: +; SSE1-NEXT: movaps (%rdi), %xmm0 +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE1-NEXT: retq + %a = load <4 x float>* %ptr + %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x float> %shuffle +} diff --git 
a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll new file mode 100644 index 0000000..afd7a24 --- /dev/null +++ b/test/CodeGen/X86/vector-zext.ll @@ -0,0 +1,206 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 + +define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_8i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_8i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_8i32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxwd %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE41-NEXT: pand 
%xmm1, %xmm2 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i16_to_8i32: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i16_to_8i32: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxwd %xmm0, %ymm0 +; AVX2-NEXT: retq +entry: + %B = zext <8 x i16> %A to <8 x i32> + ret <8 x i32>%B +} + +define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_4i32_to_4i64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_4i32_to_4i64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_4i32_to_4i64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxdq %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_4i32_to_4i64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 +; 
AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_4i32_to_4i64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxdq %xmm0, %ymm0 +; AVX2-NEXT: retq +entry: + %B = zext <4 x i32> %A to <4 x i64> + ret <4 x i64>%B +} + +define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) { +; SSE2-LABEL: zext_8i8_to_8i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i8_to_8i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i8_to_8i32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxwd %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i8_to_8i32: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpmovzxwd %xmm0, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i8_to_8i32: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxwd %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: 
retq +entry: + %t = zext <8 x i8> %z to <8 x i32> + ret <8 x i32> %t +} + +; PR17654 +define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) { +; SSE2-LABEL: zext_16i8_to_16i16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_16i16: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_16i16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxbw %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i8_to_16i16: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; 
AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i8_to_16i16: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxbw %xmm0, %ymm0 +; AVX2-NEXT: retq +entry: + %t = zext <16 x i8> %z to <16 x i16> + ret <16 x i16> %t +} diff --git a/test/CodeGen/X86/vectorcall.ll b/test/CodeGen/X86/vectorcall.ll new file mode 100644 index 0000000..1e52654 --- /dev/null +++ b/test/CodeGen/X86/vectorcall.ll @@ -0,0 +1,93 @@ +; RUN: llc -mtriple=i686-pc-win32 -mattr=+sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X86 +; RUN: llc -mtriple=x86_64-pc-win32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X64 + +; Test integer arguments. + +define x86_vectorcallcc i32 @test_int_1() { + ret i32 0 +} + +; CHECK-LABEL: {{^}}test_int_1@@0: +; CHECK: xorl %eax, %eax + +define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) { + ret i32 %a +} + +; X86-LABEL: {{^}}test_int_2@@4: +; X64-LABEL: {{^}}test_int_2@@8: +; CHECK: movl %ecx, %eax + +define x86_vectorcallcc i32 @test_int_3(i64 inreg %a) { + %at = trunc i64 %a to i32 + ret i32 %at +} + +; X86-LABEL: {{^}}test_int_3@@8: +; X64-LABEL: {{^}}test_int_3@@8: +; CHECK: movl %ecx, %eax + +define x86_vectorcallcc i32 @test_int_4(i32 inreg %a, i32 inreg %b) { + %s = add i32 %a, %b + ret i32 %s +} + +; X86-LABEL: {{^}}test_int_4@@8: +; X86: leal (%ecx,%edx), %eax + +; X64-LABEL: {{^}}test_int_4@@16: +; X64: leal (%rcx,%rdx), %eax + +define x86_vectorcallcc i32 @"\01test_int_5"(i32, i32) { + ret i32 0 +} +; CHECK-LABEL: {{^}}test_int_5: + +define x86_vectorcallcc double @test_fp_1(double %a, double %b) { + ret double %b +} +; CHECK-LABEL: {{^}}test_fp_1@@16: +; CHECK: movaps %xmm1, %xmm0 + +define x86_vectorcallcc double @test_fp_2( + double, double, double, double, double, double, double %r) { + ret double %r +} +; CHECK-LABEL: {{^}}test_fp_2@@56: +; CHECK: movsd {{[0-9]+\(%[re]sp\)}}, %xmm0 + +define x86_vectorcallcc {double, double, double, double} @test_fp_3() { + ret {double, double, double, double} + { double 0.0, double 0.0, double 0.0, 
double 0.0 } +} +; CHECK-LABEL: {{^}}test_fp_3@@0: +; CHECK: xorps %xmm0 +; CHECK: xorps %xmm1 +; CHECK: xorps %xmm2 +; CHECK: xorps %xmm3 + +; FIXME: Returning via x87 isn't compatible, but its hard to structure the +; tablegen any other way. +define x86_vectorcallcc {double, double, double, double, double} @test_fp_4() { + ret {double, double, double, double, double} + { double 0.0, double 0.0, double 0.0, double 0.0, double 0.0 } +} +; CHECK-LABEL: {{^}}test_fp_4@@0: +; CHECK: fldz +; CHECK: xorps %xmm0 +; CHECK: xorps %xmm1 +; CHECK: xorps %xmm2 +; CHECK: xorps %xmm3 + +define x86_vectorcallcc <16 x i8> @test_vec_1(<16 x i8> %a, <16 x i8> %b) { + ret <16 x i8> %b +} +; CHECK-LABEL: {{^}}test_vec_1@@32: +; CHECK: movaps %xmm1, %xmm0 + +define x86_vectorcallcc <16 x i8> @test_vec_2( + double, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> %r) { + ret <16 x i8> %r +} +; CHECK-LABEL: {{^}}test_vec_2@@104: +; CHECK: movaps (%{{[re]}}cx), %xmm0 diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll new file mode 100644 index 0000000..0c0f4bb --- /dev/null +++ b/test/CodeGen/X86/vselect-avx.ll @@ -0,0 +1,85 @@ +; RUN: llc %s -o - -mattr=+avx | FileCheck %s +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx" + +; For this test we used to optimize the <i1 true, i1 false, i1 false, i1 true> +; mask into <i32 2147483648, i32 0, i32 0, i32 2147483648> because we thought +; we would lower that into a blend where only the high bit is relevant. +; However, since the whole mask is constant, this is simplified incorrectly +; by the generic code, because it was expecting -1 in place of 2147483648. +; +; The problem does not occur without AVX, because vselect of v4i32 is not legal +; nor custom. 
+; +; <rdar://problem/18675020> + +; CHECK-LABEL: test: +; CHECK: vmovdqa {{.*#+}} xmm0 = [65535,0,0,65535] +; CHECK: vmovdqa {{.*#+}} xmm2 = [65533,124,125,14807] +; CHECK: ret +define void @test(<4 x i16>* %a, <4 x i16>* %b) { +body: + %predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127> + %predphi42 = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer + store <4 x i16> %predphi, <4 x i16>* %a, align 8 + store <4 x i16> %predphi42, <4 x i16>* %b, align 8 + ret void +} + +; Improve code coverage. +; +; When shrinking the condition used into the select to match a blend, this +; test case exercises the path where the modified node is not the root +; of the condition. +; +; CHECK-LABEL: test2: +; CHECK: vpslld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 +; CHECK-NEXT: vpshufd $78, %xmm0, %xmm0 ## xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]] +; CHECK: vblendvpd [[MASK]] +; CHECK: retq +define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { +bb: + %arrayidx1928 = getelementptr inbounds double** %call1559, i64 %indvars.iv4198 + %tmp1888 = load double** %arrayidx1928, align 8 + %predphi.v.v = select <4 x i1> %tmp1895, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> + %tmp1900 = bitcast double* %tmp1888 to <4 x double>* + store <4 x double> %predphi.v.v, <4 x double>* %tmp1900, align 8 + ret void +} + +; For this test, we used to optimized the conditional mask for the blend, i.e., +; we shrunk some of its bits. +; However, this same mask was used in another select (%predphi31) that turned out +; to be optimized into a and. 
In that case, the conditional mask was wrong. +; +; Make sure that the and is fed by the original mask. +; +; <rdar://problem/18819506> + +; Note: For now, hard code ORIG_MASK and SHRUNK_MASK registers, because we +; cannot express that ORIG_MASK must not be equal to ORIG_MASK. Otherwise, +; even a faulty pattern would pass! +; +; CHECK-LABEL: test3: +; Compute the original mask. +; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[ORIG_MASK:%xmm0]] +; Shrink the bit of the mask. +; CHECK-NEXT: vpslld $31, [[ORIG_MASK]], [[SHRUNK_MASK:%xmm3]] +; Use the shrunk mask in the blend. +; CHECK-NEXT: vblendvps [[SHRUNK_MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}} +; Use the original mask in the and. +; CHECK-NEXT: vpand LCPI2_2(%rip), [[ORIG_MASK]], {{%xmm[0-9]+}} +; CHECK: retq +define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) { + %tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3> + %tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer + %predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12 + %predphi31 = select <4 x i1> %tmp7, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer + + store <4 x i16> %predphi31, <4 x i16>* %tmp16, align 8 + store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8 + ret void +} diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll index 42cf06a..3bd1dc4 100644 --- a/test/CodeGen/X86/vselect.ll +++ b/test/CodeGen/X86/vselect.ll @@ -3,270 +3,253 @@ ; Verify that we don't emit packed vector shifts instructions if the ; condition used by the vector select is a vector of constants. 
- define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test1: +; CHECK: # BB#0: +; CHECK-NEXT: andps {{.*}}(%rip), %xmm1 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: orps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b ret <4 x float> %1 } -; CHECK-LABEL: test1 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: ret - define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test2: +; CHECK: # BB#0: +; CHECK-NEXT: movsd %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b ret <4 x float> %1 } -; CHECK-LABEL: test2 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: ret - define <4 x float> @test3(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test3: +; CHECK: # BB#0: +; CHECK-NEXT: movsd %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b ret <4 x float> %1 } -; CHECK-LABEL: test3 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: ret - define <4 x float> @test4(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test4: +; CHECK: # BB#0: +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b ret <4 x float> %1 } -; CHECK-LABEL: test4 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: movaps %xmm1, %xmm0 -; CHECK: ret - define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test5: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b ret <4 x float> %1 } -; CHECK-LABEL: test5 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: ret - define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test6: +; CHECK: # BB#0: +; CHECK-NEXT: movaps {{.*#+}} xmm1 = 
[0,65535,0,65535,0,65535,0,65535] +; CHECK-NEXT: andps %xmm0, %xmm1 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: orps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %a ret <8 x i16> %1 } -; CHECK-LABEL: test6 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: ret - define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test7: +; CHECK: # BB#0: +; CHECK-NEXT: andps {{.*}}(%rip), %xmm1 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: orps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } -; CHECK-LABEL: test7 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: ret - define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test8: +; CHECK: # BB#0: +; CHECK-NEXT: andps {{.*}}(%rip), %xmm1 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: orps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } -; CHECK-LABEL: test8 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: ret define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test9: +; CHECK: # BB#0: +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } -; CHECK-LABEL: test9 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: movaps %xmm1, %xmm0 -; CHECK-NEXT: ret define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test10: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } -; CHECK-LABEL: test10 -; CHECK-NOT: psllw 
-; CHECK-NOT: psraw -; CHECK: ret define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test11: +; CHECK: # BB#0: +; CHECK-NEXT: movaps {{.*#+}} xmm2 = <0,65535,65535,0,u,65535,65535,u> +; CHECK-NEXT: andps %xmm2, %xmm0 +; CHECK-NEXT: andnps %xmm1, %xmm2 +; CHECK-NEXT: orps %xmm2, %xmm0 +; CHECK-NEXT: retq %1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } -; CHECK-LABEL: test11 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: ret define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test12: +; CHECK: # BB#0: +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <8 x i1> <i1 false, i1 false, i1 undef, i1 false, i1 false, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } -; CHECK-LABEL: test12 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: ret define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test13: +; CHECK: # BB#0: +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <8 x i1> <i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } -; CHECK-LABEL: test13 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK: ret ; Fold (vselect (build_vector AllOnes), N1, N2) -> N1 - define <4 x float> @test14(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test14: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = select <4 x i1> <i1 true, i1 undef, i1 true, i1 undef>, <4 x float> %a, <4 x float> %b ret <4 x float> %1 } -; CHECK-LABEL: test14 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK-NOT: pcmpeq -; CHECK: ret define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test15: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 undef, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } -; CHECK-LABEL: test15 -; CHECK-NOT: psllw -; 
CHECK-NOT: psraw -; CHECK-NOT: pcmpeq -; CHECK: ret ; Fold (vselect (build_vector AllZeros), N1, N2) -> N2 - define <4 x float> @test16(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test16: +; CHECK: # BB#0: +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <4 x i1> <i1 false, i1 undef, i1 false, i1 undef>, <4 x float> %a, <4 x float> %b ret <4 x float> %1 -} -; CHECK-LABEL: test16 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK-NOT: xorps -; CHECK: ret +} define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test17: +; CHECK: # BB#0: +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } -; CHECK-LABEL: test17 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK-NOT: xorps -; CHECK: ret define <4 x float> @test18(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test18: +; CHECK: # BB#0: +; CHECK-NEXT: movss %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b ret <4 x float> %1 } -; CHECK-LABEL: test18 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK-NOT: xorps -; CHECK: movss -; CHECK: ret define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test19: +; CHECK: # BB#0: +; CHECK-NEXT: movss %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b ret <4 x i32> %1 } -; CHECK-LABEL: test19 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK-NOT: xorps -; CHECK: movss -; CHECK: ret define <2 x double> @test20(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test20: +; CHECK: # BB#0: +; CHECK-NEXT: movsd %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b ret <2 x double> %1 } -; CHECK-LABEL: test20 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK: 
ret define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test21: +; CHECK: # BB#0: +; CHECK-NEXT: movsd %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <2 x i1> <i1 false, i1 true>, <2 x i64> %a, <2 x i64> %b ret <2 x i64> %1 } -; CHECK-LABEL: test21 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK: ret define <4 x float> @test22(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test22: +; CHECK: # BB#0: +; CHECK-NEXT: movss %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b ret <4 x float> %1 } -; CHECK-LABEL: test22 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK-NOT: xorps -; CHECK: movss -; CHECK: ret define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test23: +; CHECK: # BB#0: +; CHECK-NEXT: movss %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b ret <4 x i32> %1 } -; CHECK-LABEL: test23 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK-NOT: xorps -; CHECK: movss -; CHECK: ret define <2 x double> @test24(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test24: +; CHECK: # BB#0: +; CHECK-NEXT: movsd %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b ret <2 x double> %1 } -; CHECK-LABEL: test24 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK: ret define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test25: +; CHECK: # BB#0: +; CHECK-NEXT: movsd %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b ret <2 x i64> %1 } -; CHECK-LABEL: test25 -; CHECK-NOT: psllw -; CHECK-NOT: psraw -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK: ret define <4 x float> 
@select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) { -; CHECK-LABEL: select_of_shuffles_0 -; CHECK-DAG: movlhps %xmm2, [[REGA:%xmm[0-9]+]] -; CHECK-DAG: movlhps %xmm3, [[REGB:%xmm[0-9]+]] -; CHECK: subps [[REGB]], [[REGA]] +; CHECK-LABEL: select_of_shuffles_0: +; CHECK: # BB#0: +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: subps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = shufflevector <2 x float> %a0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> %2 = shufflevector <2 x float> %a1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1> %3 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %2, <4 x float> %1 @@ -276,3 +259,24 @@ define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x %7 = fsub <4 x float> %3, %6 ret <4 x float> %7 } + +; PR20677 +define <16 x double> @select_illegal(<16 x double> %a, <16 x double> %b) { +; CHECK-LABEL: select_illegal: +; CHECK: # BB#0: +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; CHECK-NEXT: movaps %xmm7, 112(%rdi) +; CHECK-NEXT: movaps %xmm6, 96(%rdi) +; CHECK-NEXT: movaps %xmm5, 80(%rdi) +; CHECK-NEXT: movaps %xmm4, 64(%rdi) +; CHECK-NEXT: movaps %xmm3, 48(%rdi) +; CHECK-NEXT: movaps %xmm2, 32(%rdi) +; CHECK-NEXT: movaps %xmm1, 16(%rdi) +; CHECK-NEXT: movaps %xmm0, (%rdi) +; CHECK-NEXT: retq + %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b + ret <16 x double> %sel +} diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll index d115929..e0b861f 100644 --- a/test/CodeGen/X86/widen_cast-1.ll +++ 
b/test/CodeGen/X86/widen_cast-1.ll @@ -2,12 +2,12 @@ ; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s ; CHECK: movl -; CHECK: paddd +; CHECK: paddw ; CHECK: movlpd ; Scheduler causes produce a different instruction order ; ATOM: movl -; ATOM: paddd +; ATOM: paddw ; ATOM: movlpd ; bitcast a v4i16 to v2i32 diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll index 9f6778c..3f54ab6 100644 --- a/test/CodeGen/X86/widen_conv-1.ll +++ b/test/CodeGen/X86/widen_conv-1.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s -; CHECK: paddq +; CHECK: paddd ; truncate v2i64 to v2i32 diff --git a/test/CodeGen/X86/widen_conversions.ll b/test/CodeGen/X86/widen_conversions.ll index 522ab47..8e5174f 100644 --- a/test/CodeGen/X86/widen_conversions.ll +++ b/test/CodeGen/X86/widen_conversions.ll @@ -9,7 +9,7 @@ define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) { ; CHECK: movd (%{{.*}}), %[[X:xmm[0-9]+]] ; CHECK-NEXT: pxor %[[Z:xmm[0-9]+]], %[[Z]] ; CHECK-NEXT: punpcklbw %[[Z]], %[[X]] -; CHECK-NEXT: punpcklbw %[[Z]], %[[X]] +; CHECK-NEXT: punpcklwd %[[Z]], %[[X]] ; CHECK-NEXT: ret %val = load <4 x i8>* %ptr diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll index 41bea85..0ec3574 100644 --- a/test/CodeGen/X86/widen_load-2.ll +++ b/test/CodeGen/X86/widen_load-2.ll @@ -4,12 +4,12 @@ ; %i32vec3 = type <3 x i32> -; CHECK: add3i32 define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) { -; CHECK: movdqa -; CHECK: paddd -; CHECK: pextrd -; CHECK: movq +; CHECK-LABEL: add3i32: +; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]] +; CHECK-NEXT: paddd (%{{.*}}), %[[R0]] +; CHECK-NEXT: pextrd $2, %[[R0]], 8(%{{.*}}) +; CHECK-NEXT: movq %[[R0]], (%{{.*}}) %a = load %i32vec3* %ap, align 16 %b = load %i32vec3* %bp, align 16 %x = add %i32vec3 %a, %b @@ -17,15 +17,15 @@ define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) { ret void } -; CHECK: add3i32_2 define void 
@add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) { -; CHECK: movq -; CHECK: pinsrd -; CHECK: movq -; CHECK: pinsrd -; CHECK: paddd -; CHECK: pextrd -; CHECK: movq +; CHECK-LABEL: add3i32_2: +; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]] +; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R0]] +; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]] +; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R1]] +; CHECK-NEXT: paddd %[[R0]], %[[R1]] +; CHECK-NEXT: pextrd $2, %[[R1]], 8(%{{.*}}) +; CHECK-NEXT: movq %[[R1]], (%{{.*}}) %a = load %i32vec3* %ap, align 8 %b = load %i32vec3* %bp, align 8 %x = add %i32vec3 %a, %b @@ -34,15 +34,15 @@ define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) { } %i32vec7 = type <7 x i32> -; CHECK: add7i32 define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) { -; CHECK: movdqa -; CHECK: movdqa -; CHECK: paddd -; CHECK: paddd -; CHECK: pextrd -; CHECK: movq -; CHECK: movdqa +; CHECK-LABEL: add7i32: +; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]] +; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]] +; CHECK-NEXT: paddd (%{{.*}}), %[[R0]] +; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]] +; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}}) +; CHECK-NEXT: movq %[[R1]], 16(%{{.*}}) +; CHECK-NEXT: movdqa %[[R0]], (%{{.*}}) %a = load %i32vec7* %ap, align 16 %b = load %i32vec7* %bp, align 16 %x = add %i32vec7 %a, %b @@ -50,18 +50,18 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) { ret void } -; CHECK: add12i32 %i32vec12 = type <12 x i32> define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) { -; CHECK: movdqa -; CHECK: movdqa -; CHECK: movdqa -; CHECK: paddd -; CHECK: paddd -; CHECK: paddd -; CHECK: movdqa -; CHECK: movdqa -; CHECK: movdqa +; CHECK-LABEL: add12i32: +; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]] +; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]] +; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]] +; CHECK-NEXT: paddd (%{{.*}}), %[[R0]] +; CHECK-NEXT: paddd 16(%{{.*}}), 
%[[R1]] +; CHECK-NEXT: paddd 32(%{{.*}}), %[[R2]] +; CHECK-NEXT: movdqa %[[R2]], 32(%{{.*}}) +; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}}) +; CHECK-NEXT: movdqa %[[R0]], (%{{.*}}) %a = load %i32vec12* %ap, align 16 %b = load %i32vec12* %bp, align 16 %x = add %i32vec12 %a, %b @@ -70,11 +70,17 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) { } -; CHECK: add3i16 %i16vec3 = type <3 x i16> define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind { -; CHECK: paddd -; CHECK: ret +; CHECK-LABEL: add3i16: +; CHECK: pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]] +; CHECK-NEXT: pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]] +; CHECK-NEXT: paddd %[[R0]], %[[R1]] +; CHECK-NEXT: movdqa %[[R1]], %[[R0]] +; CHECK-NEXT: pshufb {{.*}}, %[[R0]] +; CHECK-NEXT: pmovzxdq %[[R0]], %[[R0]] +; CHECK-NEXT: pextrw $4, %[[R1]], 4(%{{.*}}) +; CHECK-NEXT: movd %[[R0]], (%{{.*}}) %a = load %i16vec3* %ap, align 16 %b = load %i16vec3* %bp, align 16 %x = add %i16vec3 %a, %b @@ -82,11 +88,13 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp ret void } -; CHECK: add4i16 %i16vec4 = type <4 x i16> define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind { -; CHECK: paddd -; CHECK: movq +; CHECK-LABEL: add4i16: +; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]] +; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]] +; CHECK-NEXT: paddw %[[R0]], %[[R1]] +; CHECK-NEXT: movq %[[R1]], (%{{.*}}) %a = load %i16vec4* %ap, align 16 %b = load %i16vec4* %bp, align 16 %x = add %i16vec4 %a, %b @@ -94,15 +102,15 @@ define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp ret void } -; CHECK: add12i16 %i16vec12 = type <12 x i16> define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind { -; CHECK: movdqa -; CHECK: movdqa -; CHECK: paddw -; CHECK: paddw -; CHECK: movq -; CHECK: movdqa +; CHECK-LABEL: add12i16: +; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]] +; 
CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]] +; CHECK-NEXT: paddw (%{{.*}}), %[[R0]] +; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]] +; CHECK-NEXT: movq %[[R1]], 16(%{{.*}}) +; CHECK-NEXT: movdqa %[[R0]], (%{{.*}}) %a = load %i16vec12* %ap, align 16 %b = load %i16vec12* %bp, align 16 %x = add %i16vec12 %a, %b @@ -110,18 +118,18 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* ret void } -; CHECK: add18i16 %i16vec18 = type <18 x i16> define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind { -; CHECK: movdqa -; CHECK: movdqa -; CHECK: movdqa -; CHECK: paddw -; CHECK: paddw -; CHECK: paddw -; CHECK: movd -; CHECK: movdqa -; CHECK: movdqa +; CHECK-LABEL: add18i16: +; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]] +; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]] +; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]] +; CHECK-NEXT: paddw (%{{.*}}), %[[R0]] +; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]] +; CHECK-NEXT: paddw 32(%{{.*}}), %[[R2]] +; CHECK-NEXT: movd %[[R2]], 32(%{{.*}}) +; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}}) +; CHECK-NEXT: movdqa %[[R0]], (%{{.*}}) %a = load %i16vec18* %ap, align 16 %b = load %i16vec18* %bp, align 16 %x = add %i16vec18 %a, %b @@ -130,11 +138,18 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* } -; CHECK: add3i8 %i8vec3 = type <3 x i8> define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind { -; CHECK: paddd -; CHECK: ret +; CHECK-LABEL: add3i8: +; CHECK: pmovzxbd (%{{.*}}), %[[R0:xmm[0-9]+]] +; CHECK-NEXT: pmovzxbd (%{{.*}}), %[[R1:xmm[0-9]+]] +; CHECK-NEXT: paddd %[[R0]], %[[R1]] +; CHECK-NEXT: movdqa %[[R1]], %[[R0]] +; CHECK-NEXT: pshufb {{.*}}, %[[R0]] +; CHECK-NEXT: pmovzxwq %[[R0]], %[[R0]] +; CHECK-NEXT: pextrb $8, %[[R1]], 2(%{{.*}}) +; CHECK-NEXT: movd %[[R0]], %e[[R2:[abcd]]]x +; CHECK-NEXT: movw %[[R2]]x, (%{{.*}}) %a = load %i8vec3* %ap, align 16 %b = load %i8vec3* %bp, align 16 %x = add %i8vec3 
%a, %b @@ -142,17 +157,18 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no ret void } -; CHECK-LABEL: add31i8: %i8vec31 = type <31 x i8> define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind { -; CHECK: movdqa -; CHECK: movdqa -; CHECK: paddb -; CHECK: paddb -; CHECK: pextrb -; CHECK: pextrw -; CHECK: movq -; CHECK: ret +; CHECK-LABEL: add31i8: +; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]] +; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]] +; CHECK-NEXT: paddb (%{{.*}}), %[[R0]] +; CHECK-NEXT: paddb 16(%{{.*}}), %[[R1]] +; CHECK-NEXT: pextrb $14, %[[R1]], 30(%{{.*}}) +; CHECK-NEXT: pextrw $6, %[[R1]], 28(%{{.*}}) +; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}}) +; CHECK-NEXT: movq %[[R1]], 16(%{{.*}}) +; CHECK-NEXT: movdqa %[[R0]], (%{{.*}}) %a = load %i8vec31* %ap, align 16 %b = load %i8vec31* %bp, align 16 %x = add %i8vec31 %a, %b @@ -161,14 +177,43 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp } -; CHECK: rot %i8vec3pack = type { <3 x i8>, i8 } -define %i8vec3pack @rot() nounwind { -; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}} +define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind { +; CHECK-LABEL: rot: +; CHECK: movdqa {{.*}}, %[[CONSTANT0:xmm[0-9]+]] +; CHECK-NEXT: movdqa {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]] +; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT0]] +; CHECK-NEXT: pmovzxwq %[[CONSTANT0]], %[[CONSTANT0]] +; CHECK-NEXT: movd %[[CONSTANT0]], %e[[R0:[abcd]]]x +; CHECK-NEXT: movw %[[R0]]x, (%[[PTR0:.*]]) +; CHECK-NEXT: movb $-98, 2(%[[PTR0]]) +; CHECK-NEXT: movdqa {{.*}}, %[[CONSTANT1:xmm[0-9]+]] +; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT1]] +; CHECK-NEXT: pmovzxwq %[[CONSTANT1]], %[[CONSTANT1]] +; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x +; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]]) +; CHECK-NEXT: movb $1, 2(%[[PTR1]]) +; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]] 
+; CHECK-NEXT: pand {{.*}}, %[[X0]] +; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x +; CHECK-NEXT: shrl %e[[R0]]x +; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x +; CHECK-NEXT: shrl %e[[R1]]x +; CHECK-NEXT: movd %e[[R1]]x, %[[X1:xmm[0-9]+]] +; CHECK-NEXT: pinsrd $1, %e[[R0]]x, %[[X1]] +; CHECK-NEXT: pextrd $2, %[[X0]], %e[[R0:[abcd]]]x +; CHECK-NEXT: shrl %e[[R0]]x +; CHECK-NEXT: pinsrd $2, %e[[R0]]x, %[[X1]] +; CHECK-NEXT: pextrd $3, %[[X0]], %e[[R0:[abcd]]]x +; CHECK-NEXT: pinsrd $3, %e[[R0]]x, %[[X1]] +; CHECK-NEXT: movdqa %[[X1]], %[[X2:xmm[0-9]+]] +; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[X2]] +; CHECK-NEXT: pmovzxwq %[[X2]], %[[X3:xmm[0-9]+]] +; CHECK-NEXT: pextrb $8, %[[X1]], 2(%{{.*}}) +; CHECK-NEXT: movd %[[X3]], %e[[R0:[abcd]]]x +; CHECK-NEXT: movw %[[R0]]x, (%{{.*}}) + entry: - %X = alloca %i8vec3pack, align 4 - %rot = alloca %i8vec3pack, align 4 - %result = alloca %i8vec3pack, align 4 %storetmp = bitcast %i8vec3pack* %X to <3 x i8>* store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp %storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>* @@ -180,7 +225,6 @@ entry: %shr = lshr <3 x i8> %extractVec, %extractVec3 %storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>* store <3 x i8> %shr, <3 x i8>* %storetmp4 - %tmp5 = load %i8vec3pack* %result - ret %i8vec3pack %tmp5 + ret void } diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll index a355b75..70fdbb7 100644 --- a/test/CodeGen/X86/widen_shuffle-1.ll +++ b/test/CodeGen/X86/widen_shuffle-1.ll @@ -1,43 +1,56 @@ ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s +target triple = "x86_64-unknown-unknown" + ; widening shuffle v3float and then a add define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind { -entry: ; CHECK-LABEL: shuf: -; CHECK: extractps -; CHECK: extractps +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addps %xmm1, %xmm0 +; CHECK-NEXT: extractps $2, %xmm0, 8(%eax) +; 
CHECK-NEXT: extractps $1, %xmm0, 4(%eax) +; CHECK-NEXT: movss %xmm0, (%eax) +; CHECK-NEXT: retl +entry: %x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 1, i32 2> %val = fadd <3 x float> %x, %src2 store <3 x float> %val, <3 x float>* %dst.addr ret void -; CHECK: ret } ; widening shuffle v3float with a different mask and then a add define void @shuf2(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind { -entry: ; CHECK-LABEL: shuf2: -; CHECK: extractps -; CHECK: extractps +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; CHECK-NEXT: addps %xmm1, %xmm0 +; CHECK-NEXT: extractps $2, %xmm0, 8(%eax) +; CHECK-NEXT: extractps $1, %xmm0, 4(%eax) +; CHECK-NEXT: movss %xmm0, (%eax) +; CHECK-NEXT: retl +entry: %x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 4, i32 2> %val = fadd <3 x float> %x, %src2 store <3 x float> %val, <3 x float>* %dst.addr ret void -; CHECK: ret } ; Example of when widening a v3float operation causes the DAG to replace a node ; with the operation that we are currently widening, i.e. when replacing ; opA with opB, the DAG will produce new operations with opA. 
define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind { -entry: ; CHECK-LABEL: shuf3: -; CHECK-NOT: movlhps -; CHECK-NOT: shufps -; CHECK: pshufd +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; CHECK-NEXT: movaps %xmm1, (%eax) +; CHECK-NEXT: retl +entry: %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> + %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %tmp3.i13 = shufflevector <4 x float> %tmp1.i.i, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> ; <<3 x float>> %tmp6.i14 = shufflevector <3 x float> %tmp3.i13, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -45,27 +58,35 @@ entry: %tmp2.i18 = shufflevector <3 x float> %tmp97.i, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2> %t5 = bitcast <4 x float> %tmp2.i18 to <4 x i32> %shr.i.i19 = lshr <4 x i32> %t5, <i32 19, i32 19, i32 19, i32 19> - %and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080> + %and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080> %shuffle.i.i.i21 = shufflevector <4 x float> %tmp2.i18, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3> store <4 x float> %shuffle.i.i.i21, <4 x float>* %dst ret void -; CHECK: ret } ; PR10421: make sure we correctly handle extreme widening with CONCAT_VECTORS define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone { ; CHECK-LABEL: shuf4: -; CHECK-NOT: punpckldq +; CHECK: # BB#0: +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; CHECK-NEXT: pshufb %xmm2, %xmm1 +; 
CHECK-NEXT: pshufb %xmm2, %xmm0 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retl %vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x i8> %vshuf -; CHECK: ret } ; PR11389: another CONCAT_VECTORS case define void @shuf5(<8 x i8>* %p) nounwind { ; CHECK-LABEL: shuf5: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <4,33,u,u,u,u,u,u> +; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; CHECK-NEXT: movlpd %xmm0, (%eax) +; CHECK-NEXT: retl %v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> store <8 x i8> %v, <8 x i8>* %p, align 8 ret void -; CHECK: ret } diff --git a/test/CodeGen/X86/win32-pic-jumptable.ll b/test/CodeGen/X86/win32-pic-jumptable.ll new file mode 100644 index 0000000..cabd36a --- /dev/null +++ b/test/CodeGen/X86/win32-pic-jumptable.ll @@ -0,0 +1,36 @@ +; RUN: llc < %s -relocation-model=pic | FileCheck %s + +; CHECK: calll L0$pb +; CHECK-NEXT: L0$pb: +; CHECK-NEXT: popl %eax +; CHECK-NEXT: addl LJTI0_0(,%ecx,4), %eax +; CHECK-NEXT: jmpl *%eax + +; CHECK: LJTI0_0: +; CHECK-NEXT: .long LBB0_4-L0$pb +; CHECK-NEXT: .long LBB0_5-L0$pb +; CHECK-NEXT: .long LBB0_6-L0$pb +; CHECK-NEXT: .long LBB0_7-L0$pb + + +target triple = "i686--windows-itanium" +define i32 @f(i64 %x) { +bb0: + switch i64 %x, label %bb5 [ + i64 1, label %bb1 + i64 2, label %bb2 + i64 3, label %bb3 + i64 4, label %bb4 + ] +bb1: + br label %bb5 +bb2: + br label %bb5 +bb3: + br label %bb5 +bb4: + br label %bb5 +bb5: + %y = phi i32 [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ] + ret i32 %y +} diff --git a/test/CodeGen/X86/win64_call_epi.ll b/test/CodeGen/X86/win64_call_epi.ll new file mode 100644 index 0000000..bc73ad4 --- /dev/null +++ b/test/CodeGen/X86/win64_call_epi.ll @@ -0,0 +1,65 @@ +; RUN: llc < %s 
-mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=WIN64 + +declare void @bar() +declare void @baz() +declare i32 @personality(...) + +; Check for 'nop' between the last call and the epilogue. +define void @foo1() { + + invoke void @bar() + to label %normal + unwind label %catch + +normal: + ret void + +catch: + %1 = landingpad { i8*, i32 } personality i32 (...)* @personality cleanup + resume { i8*, i32 } %1 +} +; WIN64-LABEL: foo1: +; WIN64: .seh_proc foo1 +; WIN64: callq bar +; WIN64: nop +; WIN64: addq ${{[0-9]+}}, %rsp +; WIN64: retq +; Check for 'ud2' after noreturn call +; WIN64: callq _Unwind_Resume +; WIN64-NEXT: ud2 +; WIN64: .seh_endproc + + +; Check it still works when blocks are reordered. +@something = global i32 0 +define void @foo2(i1 zeroext %cond ) { + br i1 %cond, label %a, label %b, !prof !0 +a: + call void @bar() + br label %done +b: + call void @baz() + store i32 0, i32* @something + br label %done +done: + ret void +} +!0 = metadata !{metadata !"branch_weights", i32 100, i32 0} +; WIN64-LABEL: foo2: +; WIN64: callq bar +; WIN64: nop +; WIN64: addq ${{[0-9]+}}, %rsp +; WIN64: retq + + +; Check nop is not emitted when call is not adjacent to epilogue. +define i32 @foo3() { + call void @bar() + ret i32 0 +} +; WIN64-LABEL: foo3: +; WIN64: callq bar +; WIN64: xorl +; WIN64-NOT: nop +; WIN64: addq ${{[0-9]+}}, %rsp +; WIN64: retq diff --git a/test/CodeGen/X86/win64_vararg.ll b/test/CodeGen/X86/win64_vararg.ll index 1a51b2a..8d7f201 100644 --- a/test/CodeGen/X86/win64_vararg.ll +++ b/test/CodeGen/X86/win64_vararg.ll @@ -111,3 +111,22 @@ entry: %tmp = va_arg i8** %ap, i32 ret i32 %tmp } + +define void @sret_arg(i32* sret %agg.result, i8* nocapture readnone %format, ...) 
{ +entry: + %ap = alloca i8* + %ap_i8 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap_i8) + %tmp = va_arg i8** %ap, i32 + store i32 %tmp, i32* %agg.result + ret void +} +; CHECK-LABEL: sret_arg: +; CHECK: pushq +; CHECK-DAG: movq %r9, 40(%rsp) +; CHECK-DAG: movq %r8, 32(%rsp) +; CHECK: movl 32(%rsp), %[[tmp:[^ ]*]] +; CHECK: movl %[[tmp]], (%[[sret:[^ ]*]]) +; CHECK: movq %[[sret]], %rax +; CHECK: popq +; CHECK: retq diff --git a/test/CodeGen/X86/win_cst_pool.ll b/test/CodeGen/X86/win_cst_pool.ll new file mode 100644 index 0000000..e8b853a --- /dev/null +++ b/test/CodeGen/X86/win_cst_pool.ll @@ -0,0 +1,66 @@ +; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc" + +define double @double() { + ret double 0x0000000000800000 +} +; CHECK: .globl __real@0000000000800000 +; CHECK-NEXT: .section .rdata,"rd",discard,__real@0000000000800000 +; CHECK-NEXT: .align 8 +; CHECK-NEXT: __real@0000000000800000: +; CHECK-NEXT: .quad 8388608 +; CHECK: double: +; CHECK: movsd __real@0000000000800000(%rip), %xmm0 +; CHECK-NEXT: ret + +define <4 x i32> @vec1() { + ret <4 x i32> <i32 3, i32 2, i32 1, i32 0> +} +; CHECK: .globl __xmm@00000000000000010000000200000003 +; CHECK-NEXT: .section .rdata,"rd",discard,__xmm@00000000000000010000000200000003 +; CHECK-NEXT: .align 16 +; CHECK-NEXT: __xmm@00000000000000010000000200000003: +; CHECK-NEXT: .long 3 +; CHECK-NEXT: .long 2 +; CHECK-NEXT: .long 1 +; CHECK-NEXT: .long 0 +; CHECK: vec1: +; CHECK: movaps __xmm@00000000000000010000000200000003(%rip), %xmm0 +; CHECK-NEXT: ret + +define <8 x i16> @vec2() { + ret <8 x i16> <i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0> +} +; CHECK: .globl __xmm@00000001000200030004000500060007 +; CHECK-NEXT: .section .rdata,"rd",discard,__xmm@00000001000200030004000500060007 +; CHECK-NEXT: .align 16 +; CHECK-NEXT: __xmm@00000001000200030004000500060007: +; CHECK-NEXT: .short 7 +; 
CHECK-NEXT: .short 6 +; CHECK-NEXT: .short 5 +; CHECK-NEXT: .short 4 +; CHECK-NEXT: .short 3 +; CHECK-NEXT: .short 2 +; CHECK-NEXT: .short 1 +; CHECK-NEXT: .short 0 +; CHECK: vec2: +; CHECK: movaps __xmm@00000001000200030004000500060007(%rip), %xmm0 +; CHECK-NEXT: ret + + +define <4 x float> @undef1() { + ret <4 x float> <float 1.0, float 1.0, float undef, float undef> + +; CHECK: .globl __xmm@00000000000000003f8000003f800000 +; CHECK-NEXT: .section .rdata,"rd",discard,__xmm@00000000000000003f8000003f800000 +; CHECK-NEXT: .align 16 +; CHECK-NEXT: __xmm@00000000000000003f8000003f800000: +; CHECK-NEXT: .long 1065353216 # float 1 +; CHECK-NEXT: .long 1065353216 # float 1 +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .zero 4 +; CHECK: undef1: +; CHECK: movaps __xmm@00000000000000003f8000003f800000(%rip), %xmm0 +; CHECK-NEXT: ret +} diff --git a/test/CodeGen/X86/windows-itanium-alloca.ll b/test/CodeGen/X86/windows-itanium-alloca.ll new file mode 100644 index 0000000..0a06cde --- /dev/null +++ b/test/CodeGen/X86/windows-itanium-alloca.ll @@ -0,0 +1,16 @@ +; RUN: llc -mtriple i686-windows-itanium -filetype asm -o - %s | FileCheck %s + +target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" +target triple = "i686--windows-itanium" + +declare void @external(i8*) + +define dllexport void @alloca(i32 %sz) { +entry: + %vla = alloca i8, i32 %sz, align 1 + call void @external(i8* %vla) + ret void +} + +; CHECK: __chkstk + diff --git a/test/CodeGen/X86/x32-function_pointer-1.ll b/test/CodeGen/X86/x32-function_pointer-1.ll new file mode 100644 index 0000000..2baf92a --- /dev/null +++ b/test/CodeGen/X86/x32-function_pointer-1.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s + +; Test for x32 function pointer tail call + +@foo1 = external global void (i8*)* +@foo2 = external global void (i8*)* + +define void @bar(i8* %h) nounwind uwtable { +entry: + %0 = load void (i8*)** @foo1, 
align 4 +; CHECK: movl foo1(%rip), %e{{[^,]*}} + tail call void %0(i8* %h) nounwind +; CHECK: callq *%r{{[^,]*}} + %1 = load void (i8*)** @foo2, align 4 +; CHECK: movl foo2(%rip), %e{{[^,]*}} + tail call void %1(i8* %h) nounwind +; CHECK: jmpq *%r{{[^,]*}} + ret void +} diff --git a/test/CodeGen/X86/x32-function_pointer-2.ll b/test/CodeGen/X86/x32-function_pointer-2.ll new file mode 100644 index 0000000..f727d41 --- /dev/null +++ b/test/CodeGen/X86/x32-function_pointer-2.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s + +; Test call function pointer with function argument +; +; void bar (void * h, void (*foo) (void *)) +; { +; foo (h); +; foo (h); +; } + + +define void @bar(i8* %h, void (i8*)* nocapture %foo) nounwind { +entry: + tail call void %foo(i8* %h) nounwind +; CHECK: mov{{l|q}} %{{e|r}}si, %{{e|r}}[[REG:.*]]{{d?}} +; CHECK: callq *%r[[REG]] + tail call void %foo(i8* %h) nounwind +; CHECK: jmpq *%r{{[^,]*}} + ret void +} diff --git a/test/CodeGen/X86/x32-function_pointer-3.ll b/test/CodeGen/X86/x32-function_pointer-3.ll new file mode 100644 index 0000000..5eaf85d --- /dev/null +++ b/test/CodeGen/X86/x32-function_pointer-3.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s + +; Test calling function pointer passed in struct + +; The fuction argument `h' in + +; struct foo { +; void (*f) (void); +; int i; +; }; +; void +; bar (struct foo h) +; { +; h.f (); +; } + +; is passed in the 64-bit %rdi register. The `f' field is in the lower 32 +; bits of %rdi register and the `i' field is in the upper 32 bits of %rdi +; register. We need to zero-extend %edi to %rdi before branching via %rdi. 
+ +define void @bar(i64 %h.coerce) nounwind { +entry: + %h.sroa.0.0.extract.trunc = trunc i64 %h.coerce to i32 + %0 = inttoptr i32 %h.sroa.0.0.extract.trunc to void ()* +; CHECK: movl %edi, %e[[REG:.*]] + tail call void %0() nounwind +; CHECK: jmpq *%r[[REG]] + ret void +} diff --git a/test/CodeGen/X86/x86-64-call.ll b/test/CodeGen/X86/x86-64-call.ll new file mode 100644 index 0000000..300f8d1 --- /dev/null +++ b/test/CodeGen/X86/x86-64-call.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux-gnux32 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mcpu=generic -mtriple=i686-pc-linux -verify-machineinstrs | FileCheck %s -check-prefix=IA32 + +; trivial test for correct call suffix + +define i32 @far() nounwind uwtable { +entry: +; CHECK: callq foo +; IA32: calll foo + tail call void @foo() nounwind + ret i32 0 +} + +declare void @foo() diff --git a/test/CodeGen/X86/x86-64-pic-10.ll b/test/CodeGen/X86/x86-64-pic-10.ll index da8082b..8790fa6 100644 --- a/test/CodeGen/X86/x86-64-pic-10.ll +++ b/test/CodeGen/X86/x86-64-pic-10.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1 ; RUN: grep "callq g@PLT" %t1 -@g = alias weak i32 ()* @f +@g = weak alias i32 ()* @f define void @h() { entry: diff --git a/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll new file mode 100644 index 0000000..c476ffd --- /dev/null +++ b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll @@ -0,0 +1,34 @@ +; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s +; RUN: llc -mtriple=x86_64-pc-nacl < %s | FileCheck -check-prefix=NACL %s + +; x32 uses %esp, %ebp as stack and frame pointers + +; CHECK-LABEL: foo +; CHECK: pushq %rbp +; CHECK: movq %rsp, %rbp +; CHECK: movq %rdi, -8(%rbp) +; CHECK: popq %rbp +; 
X32ABI-LABEL: foo +; X32ABI: pushq %rbp +; X32ABI: movl %esp, %ebp +; X32ABI: movl %edi, -4(%ebp) +; X32ABI: popq %rbp +; NACL-LABEL: foo +; NACL: pushq %rbp +; NACL: movq %rsp, %rbp +; NACL: movl %edi, -4(%rbp) +; NACL: popq %rbp + + +define void @foo(i32* %a) #0 { +entry: + %a.addr = alloca i32*, align 4 + %b = alloca i32*, align 4 + store i32* %a, i32** %a.addr, align 4 + ret void +} + +attributes #0 = { nounwind uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"} + + diff --git a/test/CodeGen/X86/x86-64-tls-1.ll b/test/CodeGen/X86/x86-64-tls-1.ll index 641786f..2879fb4 100644 --- a/test/CodeGen/X86/x86-64-tls-1.ll +++ b/test/CodeGen/X86/x86-64-tls-1.ll @@ -1,10 +1,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s @tm_nest_level = internal thread_local global i32 0 define i64 @z() nounwind { -; FIXME: The codegen here is primitive at best and could be much better. -; The add and the moves can be folded together. -; CHECK-DAG: movq $tm_nest_level@TPOFF, %rcx -; CHECK-DAG: movq %fs:0, %rax -; CHECK: addl %ecx, %eax +; CHECK: movq $tm_nest_level@TPOFF, %r[[R0:[abcd]]]x +; CHECK-NEXT: addl %fs:0, %e[[R0]]x +; CHECK-NEXT: andq $100, %r[[R0]]x + ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100) } diff --git a/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll b/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll new file mode 100644 index 0000000..fcf7eae --- /dev/null +++ b/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll @@ -0,0 +1,35 @@ +; RUN: llc -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 -mattr=+64bit,+sse2 < %s | FileCheck %s + +; DAGCombine may choose to rewrite 2 loads feeding a select as a select of +; addresses feeding a load. This test ensures that when it does that it creates +; a load with alignment equivalent to the most restrictive source load. 
+ +declare void @sink(<2 x double>) + +define void @test1(i1 %cmp) align 2 { + %1 = alloca <2 x double>, align 16 + %2 = alloca <2 x double>, align 8 + + %val = load <2 x double>* %1, align 16 + %val2 = load <2 x double>* %2, align 8 + %val3 = select i1 %cmp, <2 x double> %val, <2 x double> %val2 + call void @sink(<2 x double> %val3) + ret void + ; CHECK: test1 + ; CHECK: movups + ; CHECK: ret +} + +define void @test2(i1 %cmp) align 2 { + %1 = alloca <2 x double>, align 16 + %2 = alloca <2 x double>, align 8 + + %val = load <2 x double>* %1, align 16 + %val2 = load <2 x double>* %2, align 16 + %val3 = select i1 %cmp, <2 x double> %val, <2 x double> %val2 + call void @sink(<2 x double> %val3) + ret void + ; CHECK: test2 + ; CHECK: movaps + ; CHECK: ret +} diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll new file mode 100644 index 0000000..4317d8a --- /dev/null +++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll @@ -0,0 +1,74 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s + +define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind { +; CHECK-LABEL: LCPI0_0: +; CHECK-NEXT: .long 1065353216 ## 0x3f800000 +; CHECK-NEXT: .long 1065353216 ## 0x3f800000 +; CHECK-NEXT: .long 1065353216 ## 0x3f800000 +; CHECK-NEXT: .long 1065353216 ## 0x3f800000 +; CHECK-LABEL: foo: +; CHECK: cmpeqps %xmm1, %xmm0 +; CHECK-NEXT: andps LCPI0_0(%rip), %xmm0 +; CHECK-NEXT: retq + + %cmp = fcmp oeq <4 x float> %val, %test + %ext = zext <4 x i1> %cmp to <4 x i32> + %result = sitofp <4 x i32> %ext to <4 x float> + ret <4 x float> %result +} + +; Make sure the operation doesn't try to get folded when the sizes don't match, +; as that ends up crashing later when trying to form a bitcast operation for +; the folded nodes. 
+define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwind { +; CHECK-LABEL: LCPI1_0: +; CHECK-NEXT: .long 1 ## 0x1 +; CHECK-NEXT: .long 1 ## 0x1 +; CHECK-NEXT: .long 1 ## 0x1 +; CHECK-NEXT: .long 1 ## 0x1 +; CHECK-LABEL: foo1: +; FIXME: The operation gets scalarized. If/when the compiler learns to better +; use [V]CVTDQ2PD, this will need updated. +; CHECK: cvtsi2sdq +; CHECK: cvtsi2sdq +; CHECK: cvtsi2sdq +; CHECK: cvtsi2sdq + %cmp = fcmp oeq <4 x float> %val, %test + %ext = zext <4 x i1> %cmp to <4 x i32> + %result = sitofp <4 x i32> %ext to <4 x double> + store <4 x double> %result, <4 x double>* %p + ret void +} + +; Also test the general purpose constant folding of int->fp. +define void @foo2(<4 x float>* noalias %result) nounwind { +; CHECK-LABEL: LCPI2_0: +; CHECK-NEXT: .long 1082130432 ## float 4.000000e+00 +; CHECK-NEXT: .long 1084227584 ## float 5.000000e+00 +; CHECK-NEXT: .long 1086324736 ## float 6.000000e+00 +; CHECK-NEXT: .long 1088421888 ## float 7.000000e+00 +; CHECK-LABEL: foo2: +; CHECK: movaps LCPI2_0(%rip), %xmm0 + + %val = uitofp <4 x i32> <i32 4, i32 5, i32 6, i32 7> to <4 x float> + store <4 x float> %val, <4 x float>* %result + ret void +} + +; Fold explicit AND operations when the constant isn't a splat of a single +; scalar value like what the zext creates. 
+define <4 x float> @foo3(<4 x float> %val, <4 x float> %test) nounwind { +; CHECK-LABEL: LCPI3_0: +; CHECK-NEXT: .long 1065353216 ## 0x3f800000 +; CHECK-NEXT: .long 0 ## 0x0 +; CHECK-NEXT: .long 1065353216 ## 0x3f800000 +; CHECK-NEXT: .long 0 ## 0x0 +; CHECK-LABEL: foo3: +; CHECK: cmpeqps %xmm1, %xmm0 +; CHECK-NEXT: andps LCPI3_0(%rip), %xmm0 + %cmp = fcmp oeq <4 x float> %val, %test + %ext = zext <4 x i1> %cmp to <4 x i32> + %and = and <4 x i32> %ext, <i32 255, i32 256, i32 257, i32 258> + %result = sitofp <4 x i32> %and to <4 x float> + ret <4 x float> %result +} diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll index f078631..54a4d6aa 100644 --- a/test/CodeGen/X86/xaluo.ll +++ b/test/CodeGen/X86/xaluo.ll @@ -1,7 +1,5 @@ -; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=DAG -; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST -; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s -; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG +; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST ; ; Get the actual value of the overflow bit. 
@@ -9,12 +7,9 @@ ; SADDO reg, reg define zeroext i1 @saddo.i8(i8 signext %v1, i8 signext %v2, i8* %res) { entry: -; DAG-LABEL: saddo.i8 -; DAG: addb %sil, %dil -; DAG-NEXT: seto %al -; FAST-LABEL: saddo.i8 -; FAST: addb %sil, %dil -; FAST-NEXT: seto %al +; CHECK-LABEL: saddo.i8 +; CHECK: addb %sil, %dil +; CHECK-NEXT: seto %al %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 %v2) %val = extractvalue {i8, i1} %t, 0 %obit = extractvalue {i8, i1} %t, 1 @@ -24,12 +19,9 @@ entry: define zeroext i1 @saddo.i16(i16 %v1, i16 %v2, i16* %res) { entry: -; DAG-LABEL: saddo.i16 -; DAG: addw %si, %di -; DAG-NEXT: seto %al -; FAST-LABEL: saddo.i16 -; FAST: addw %si, %di -; FAST-NEXT: seto %al +; CHECK-LABEL: saddo.i16 +; CHECK: addw %si, %di +; CHECK-NEXT: seto %al %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 %v2) %val = extractvalue {i16, i1} %t, 0 %obit = extractvalue {i16, i1} %t, 1 @@ -39,12 +31,9 @@ entry: define zeroext i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) { entry: -; DAG-LABEL: saddo.i32 -; DAG: addl %esi, %edi -; DAG-NEXT: seto %al -; FAST-LABEL: saddo.i32 -; FAST: addl %esi, %edi -; FAST-NEXT: seto %al +; CHECK-LABEL: saddo.i32 +; CHECK: addl %esi, %edi +; CHECK-NEXT: seto %al %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -54,12 +43,9 @@ entry: define zeroext i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) { entry: -; DAG-LABEL: saddo.i64 -; DAG: addq %rsi, %rdi -; DAG-NEXT: seto %al -; FAST-LABEL: saddo.i64 -; FAST: addq %rsi, %rdi -; FAST-NEXT: seto %al +; CHECK-LABEL: saddo.i64 +; CHECK: addq %rsi, %rdi +; CHECK-NEXT: seto %al %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -67,16 +53,48 @@ entry: ret i1 %obit } -; SADDO reg, imm | imm, reg -; FIXME: INC isn't supported in FastISel yet -define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) { +; SADDO reg, 
1 | INC +define zeroext i1 @saddo.inc.i8(i8 %v1, i8* %res) { +entry: +; CHECK-LABEL: saddo.inc.i8 +; CHECK: incb %dil +; CHECK-NEXT: seto %al + %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 1) + %val = extractvalue {i8, i1} %t, 0 + %obit = extractvalue {i8, i1} %t, 1 + store i8 %val, i8* %res + ret i1 %obit +} + +define zeroext i1 @saddo.inc.i16(i16 %v1, i16* %res) { +entry: +; CHECK-LABEL: saddo.inc.i16 +; CHECK: incw %di +; CHECK-NEXT: seto %al + %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 1) + %val = extractvalue {i16, i1} %t, 0 + %obit = extractvalue {i16, i1} %t, 1 + store i16 %val, i16* %res + ret i1 %obit +} + +define zeroext i1 @saddo.inc.i32(i32 %v1, i32* %res) { entry: -; DAG-LABEL: saddo.i64imm1 -; DAG: incq %rdi -; DAG-NEXT: seto %al -; FAST-LABEL: saddo.i64imm1 -; FAST: addq $1, %rdi -; FAST-NEXT: seto %al +; CHECK-LABEL: saddo.inc.i32 +; CHECK: incl %edi +; CHECK-NEXT: seto %al + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 1) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + store i32 %val, i32* %res + ret i1 %obit +} + +define zeroext i1 @saddo.inc.i64(i64 %v1, i64* %res) { +entry: +; CHECK-LABEL: saddo.inc.i64 +; CHECK: incq %rdi +; CHECK-NEXT: seto %al %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 1) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -84,17 +102,18 @@ entry: ret i1 %obit } +; SADDO reg, imm | imm, reg ; FIXME: DAG doesn't optimize immediates on the LHS. 
-define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) { +define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) { entry: -; DAG-LABEL: saddo.i64imm2 -; DAG: mov -; DAG-NEXT: addq -; DAG-NEXT: seto -; FAST-LABEL: saddo.i64imm2 -; FAST: addq $1, %rdi -; FAST-NEXT: seto %al - %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 1, i64 %v1) +; SDAG-LABEL: saddo.i64imm1 +; SDAG: mov +; SDAG-NEXT: addq +; SDAG-NEXT: seto +; FAST-LABEL: saddo.i64imm1 +; FAST: addq $2, %rdi +; FAST-NEXT: seto %al + %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 2, i64 %v1) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 store i64 %val, i64* %res @@ -102,14 +121,11 @@ entry: } ; Check boundary conditions for large immediates. -define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) { +define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) { entry: -; DAG-LABEL: saddo.i64imm3 -; DAG: addq $-2147483648, %rdi -; DAG-NEXT: seto %al -; FAST-LABEL: saddo.i64imm3 -; FAST: addq $-2147483648, %rdi -; FAST-NEXT: seto %al +; CHECK-LABEL: saddo.i64imm2 +; CHECK: addq $-2147483648, %rdi +; CHECK-NEXT: seto %al %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -117,16 +133,12 @@ entry: ret i1 %obit } -define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) { +define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) { entry: -; DAG-LABEL: saddo.i64imm4 -; DAG: movabsq $-21474836489, %[[REG:[a-z]+]] -; DAG-NEXT: addq %rdi, %[[REG]] -; DAG-NEXT: seto -; FAST-LABEL: saddo.i64imm4 -; FAST: movabsq $-21474836489, %[[REG:[a-z]+]] -; FAST-NEXT: addq %rdi, %[[REG]] -; FAST-NEXT: seto +; CHECK-LABEL: saddo.i64imm3 +; CHECK: movabsq $-21474836489, %[[REG:[a-z]+]] +; CHECK-NEXT: addq %rdi, %[[REG]] +; CHECK-NEXT: seto %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -134,14 +146,11 @@ entry: ret i1 
%obit } -define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) { +define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) { entry: -; DAG-LABEL: saddo.i64imm5 -; DAG: addq $2147483647, %rdi -; DAG-NEXT: seto -; FAST-LABEL: saddo.i64imm5 -; FAST: addq $2147483647, %rdi -; FAST-NEXT: seto +; CHECK-LABEL: saddo.i64imm4 +; CHECK: addq $2147483647, %rdi +; CHECK-NEXT: seto %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -149,17 +158,12 @@ entry: ret i1 %obit } -; TODO: FastISel shouldn't use movabsq. -define zeroext i1 @saddo.i64imm6(i64 %v1, i64* %res) { +define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) { entry: -; DAG-LABEL: saddo.i64imm6 -; DAG: movl $2147483648, %ecx -; DAG: addq %rdi, %rcx -; DAG-NEXT: seto -; FAST-LABEL: saddo.i64imm6 -; FAST: movabsq $2147483648, %[[REG:[a-z]+]] -; FAST: addq %rdi, %[[REG]] -; FAST-NEXT: seto +; CHECK-LABEL: saddo.i64imm5 +; CHECK: movl $2147483648 +; CHECK: addq %rdi +; CHECK-NEXT: seto %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483648) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -170,12 +174,9 @@ entry: ; UADDO define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) { entry: -; DAG-LABEL: uaddo.i32 -; DAG: addl %esi, %edi -; DAG-NEXT: setb %al -; FAST-LABEL: uaddo.i32 -; FAST: addl %esi, %edi -; FAST-NEXT: setb %al +; CHECK-LABEL: uaddo.i32 +; CHECK: addl %esi, %edi +; CHECK-NEXT: setb %al %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -185,12 +186,9 @@ entry: define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) { entry: -; DAG-LABEL: uaddo.i64 -; DAG: addq %rsi, %rdi -; DAG-NEXT: setb %al -; FAST-LABEL: uaddo.i64 -; FAST: addq %rsi, %rdi -; FAST-NEXT: setb %al +; CHECK-LABEL: uaddo.i64 +; CHECK: addq %rsi, %rdi +; CHECK-NEXT: setb %al %t = call {i64, i1} 
@llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -198,15 +196,57 @@ entry: ret i1 %obit } +; UADDO reg, 1 | NOT INC +define zeroext i1 @uaddo.inc.i8(i8 %v1, i8* %res) { +entry: +; CHECK-LABEL: uaddo.inc.i8 +; CHECK-NOT: incb %dil + %t = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %v1, i8 1) + %val = extractvalue {i8, i1} %t, 0 + %obit = extractvalue {i8, i1} %t, 1 + store i8 %val, i8* %res + ret i1 %obit +} + +define zeroext i1 @uaddo.inc.i16(i16 %v1, i16* %res) { +entry: +; CHECK-LABEL: uaddo.inc.i16 +; CHECK-NOT: incw %di + %t = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %v1, i16 1) + %val = extractvalue {i16, i1} %t, 0 + %obit = extractvalue {i16, i1} %t, 1 + store i16 %val, i16* %res + ret i1 %obit +} + +define zeroext i1 @uaddo.inc.i32(i32 %v1, i32* %res) { +entry: +; CHECK-LABEL: uaddo.inc.i32 +; CHECK-NOT: incl %edi + %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 1) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + store i32 %val, i32* %res + ret i1 %obit +} + +define zeroext i1 @uaddo.inc.i64(i64 %v1, i64* %res) { +entry: +; CHECK-LABEL: uaddo.inc.i64 +; CHECK-NOT: incq %rdi + %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 1) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + store i64 %val, i64* %res + ret i1 %obit +} + ; SSUBO define zeroext i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) { entry: -; DAG-LABEL: ssubo.i32 -; DAG: subl %esi, %edi -; DAG-NEXT: seto %al -; FAST-LABEL: ssubo.i32 -; FAST: subl %esi, %edi -; FAST-NEXT: seto %al +; CHECK-LABEL: ssubo.i32 +; CHECK: subl %esi, %edi +; CHECK-NEXT: seto %al %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -216,12 +256,9 @@ entry: define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) { entry: -; DAG-LABEL: ssubo.i64 -; DAG: subq %rsi, %rdi -; 
DAG-NEXT: seto %al -; FAST-LABEL: ssubo.i64 -; FAST: subq %rsi, %rdi -; FAST-NEXT: seto %al +; CHECK-LABEL: ssubo.i64 +; CHECK: subq %rsi, %rdi +; CHECK-NEXT: seto %al %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -232,12 +269,9 @@ entry: ; USUBO define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) { entry: -; DAG-LABEL: usubo.i32 -; DAG: subl %esi, %edi -; DAG-NEXT: setb %al -; FAST-LABEL: usubo.i32 -; FAST: subl %esi, %edi -; FAST-NEXT: setb %al +; CHECK-LABEL: usubo.i32 +; CHECK: subl %esi, %edi +; CHECK-NEXT: setb %al %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -247,12 +281,9 @@ entry: define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) { entry: -; DAG-LABEL: usubo.i64 -; DAG: subq %rsi, %rdi -; DAG-NEXT: setb %al -; FAST-LABEL: usubo.i64 -; FAST: subq %rsi, %rdi -; FAST-NEXT: setb %al +; CHECK-LABEL: usubo.i64 +; CHECK: subq %rsi, %rdi +; CHECK-NEXT: setb %al %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -263,10 +294,10 @@ entry: ; SMULO define zeroext i1 @smulo.i8(i8 %v1, i8 %v2, i8* %res) { entry: -; FAST-LABEL: smulo.i8 -; FAST: movb %dil, %al -; FAST-NEXT: imulb %sil -; FAST-NEXT: seto %cl +; CHECK-LABEL: smulo.i8 +; CHECK: movb %dil, %al +; CHECK-NEXT: imulb %sil +; CHECK-NEXT: seto %cl %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2) %val = extractvalue {i8, i1} %t, 0 %obit = extractvalue {i8, i1} %t, 1 @@ -276,12 +307,9 @@ entry: define zeroext i1 @smulo.i16(i16 %v1, i16 %v2, i16* %res) { entry: -; DAG-LABEL: smulo.i16 -; DAG: imulw %si, %di -; DAG-NEXT: seto %al -; FAST-LABEL: smulo.i16 -; FAST: imulw %si, %di -; FAST-NEXT: seto %al +; CHECK-LABEL: smulo.i16 +; CHECK: imulw %si, %di +; CHECK-NEXT: seto %al %t = call {i16, i1} 
@llvm.smul.with.overflow.i16(i16 %v1, i16 %v2) %val = extractvalue {i16, i1} %t, 0 %obit = extractvalue {i16, i1} %t, 1 @@ -291,12 +319,9 @@ entry: define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) { entry: -; DAG-LABEL: smulo.i32 -; DAG: imull %esi, %edi -; DAG-NEXT: seto %al -; FAST-LABEL: smulo.i32 -; FAST: imull %esi, %edi -; FAST-NEXT: seto %al +; CHECK-LABEL: smulo.i32 +; CHECK: imull %esi, %edi +; CHECK-NEXT: seto %al %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -306,12 +331,9 @@ entry: define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) { entry: -; DAG-LABEL: smulo.i64 -; DAG: imulq %rsi, %rdi -; DAG-NEXT: seto %al -; FAST-LABEL: smulo.i64 -; FAST: imulq %rsi, %rdi -; FAST-NEXT: seto %al +; CHECK-LABEL: smulo.i64 +; CHECK: imulq %rsi, %rdi +; CHECK-NEXT: seto %al %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -322,10 +344,10 @@ entry: ; UMULO define zeroext i1 @umulo.i8(i8 %v1, i8 %v2, i8* %res) { entry: -; FAST-LABEL: umulo.i8 -; FAST: movb %dil, %al -; FAST-NEXT: mulb %sil -; FAST-NEXT: seto %cl +; CHECK-LABEL: umulo.i8 +; CHECK: movb %dil, %al +; CHECK-NEXT: mulb %sil +; CHECK-NEXT: seto %cl %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2) %val = extractvalue {i8, i1} %t, 0 %obit = extractvalue {i8, i1} %t, 1 @@ -335,12 +357,9 @@ entry: define zeroext i1 @umulo.i16(i16 %v1, i16 %v2, i16* %res) { entry: -; DAG-LABEL: umulo.i16 -; DAG: mulw %si -; DAG-NEXT: seto -; FAST-LABEL: umulo.i16 -; FAST: mulw %si -; FAST-NEXT: seto +; CHECK-LABEL: umulo.i16 +; CHECK: mulw %si +; CHECK-NEXT: seto %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2) %val = extractvalue {i16, i1} %t, 0 %obit = extractvalue {i16, i1} %t, 1 @@ -350,12 +369,9 @@ entry: define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) { entry: -; DAG-LABEL: 
umulo.i32 -; DAG: mull %esi -; DAG-NEXT: seto -; FAST-LABEL: umulo.i32 -; FAST: mull %esi -; FAST-NEXT: seto +; CHECK-LABEL: umulo.i32 +; CHECK: mull %esi +; CHECK-NEXT: seto %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -365,12 +381,9 @@ entry: define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) { entry: -; DAG-LABEL: umulo.i64 -; DAG: mulq %rsi -; DAG-NEXT: seto -; FAST-LABEL: umulo.i64 -; FAST: mulq %rsi -; FAST-NEXT: seto +; CHECK-LABEL: umulo.i64 +; CHECK: mulq %rsi +; CHECK-NEXT: seto %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -383,9 +396,9 @@ entry: ; define i32 @saddo.select.i32(i32 %v1, i32 %v2) { entry: -; CHECK-LABEL: saddo.select.i32 -; CHECK: addl %esi, %eax -; CHECK-NEXT: cmovol %edi, %esi +; CHECK-LABEL: saddo.select.i32 +; CHECK: addl %esi, %eax +; CHECK-NEXT: cmovol %edi, %esi %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 %ret = select i1 %obit, i32 %v1, i32 %v2 @@ -394,9 +407,9 @@ entry: define i64 @saddo.select.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: saddo.select.i64 -; CHECK: addq %rsi, %rax -; CHECK-NEXT: cmovoq %rdi, %rsi +; CHECK-LABEL: saddo.select.i64 +; CHECK: addq %rsi, %rax +; CHECK-NEXT: cmovoq %rdi, %rsi %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -405,9 +418,9 @@ entry: define i32 @uaddo.select.i32(i32 %v1, i32 %v2) { entry: -; CHECK-LABEL: uaddo.select.i32 -; CHECK: addl %esi, %eax -; CHECK-NEXT: cmovbl %edi, %esi +; CHECK-LABEL: uaddo.select.i32 +; CHECK: addl %esi, %eax +; CHECK-NEXT: cmovbl %edi, %esi %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 %ret = select i1 %obit, i32 %v1, i32 %v2 @@ -416,9 +429,9 @@ 
entry: define i64 @uaddo.select.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: uaddo.select.i64 -; CHECK: addq %rsi, %rax -; CHECK-NEXT: cmovbq %rdi, %rsi +; CHECK-LABEL: uaddo.select.i64 +; CHECK: addq %rsi, %rax +; CHECK-NEXT: cmovbq %rdi, %rsi %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -427,9 +440,9 @@ entry: define i32 @ssubo.select.i32(i32 %v1, i32 %v2) { entry: -; CHECK-LABEL: ssubo.select.i32 -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: cmovol %edi, %esi +; CHECK-LABEL: ssubo.select.i32 +; CHECK: cmpl %esi, %edi +; CHECK-NEXT: cmovol %edi, %esi %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 %ret = select i1 %obit, i32 %v1, i32 %v2 @@ -438,9 +451,9 @@ entry: define i64 @ssubo.select.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: ssubo.select.i64 -; CHECK: cmpq %rsi, %rdi -; CHECK-NEXT: cmovoq %rdi, %rsi +; CHECK-LABEL: ssubo.select.i64 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovoq %rdi, %rsi %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -449,9 +462,9 @@ entry: define i32 @usubo.select.i32(i32 %v1, i32 %v2) { entry: -; CHECK-LABEL: usubo.select.i32 -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: cmovbl %edi, %esi +; CHECK-LABEL: usubo.select.i32 +; CHECK: cmpl %esi, %edi +; CHECK-NEXT: cmovbl %edi, %esi %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 %ret = select i1 %obit, i32 %v1, i32 %v2 @@ -460,9 +473,9 @@ entry: define i64 @usubo.select.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: usubo.select.i64 -; CHECK: cmpq %rsi, %rdi -; CHECK-NEXT: cmovbq %rdi, %rsi +; CHECK-LABEL: usubo.select.i64 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovbq %rdi, %rsi %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 
%ret = select i1 %obit, i64 %v1, i64 %v2 @@ -471,9 +484,9 @@ entry: define i32 @smulo.select.i32(i32 %v1, i32 %v2) { entry: -; CHECK-LABEL: smulo.select.i32 -; CHECK: imull %esi, %eax -; CHECK-NEXT: cmovol %edi, %esi +; CHECK-LABEL: smulo.select.i32 +; CHECK: imull %esi, %eax +; CHECK-NEXT: cmovol %edi, %esi %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 %ret = select i1 %obit, i32 %v1, i32 %v2 @@ -482,9 +495,9 @@ entry: define i64 @smulo.select.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: smulo.select.i64 -; CHECK: imulq %rsi, %rax -; CHECK-NEXT: cmovoq %rdi, %rsi +; CHECK-LABEL: smulo.select.i64 +; CHECK: imulq %rsi, %rax +; CHECK-NEXT: cmovoq %rdi, %rsi %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -493,9 +506,9 @@ entry: define i32 @umulo.select.i32(i32 %v1, i32 %v2) { entry: -; CHECK-LABEL: umulo.select.i32 -; CHECK: mull %esi -; CHECK-NEXT: cmovol %edi, %esi +; CHECK-LABEL: umulo.select.i32 +; CHECK: mull %esi +; CHECK-NEXT: cmovol %edi, %esi %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 %ret = select i1 %obit, i32 %v1, i32 %v2 @@ -504,9 +517,9 @@ entry: define i64 @umulo.select.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: umulo.select.i64 -; CHECK: mulq %rsi -; CHECK-NEXT: cmovoq %rdi, %rsi +; CHECK-LABEL: umulo.select.i64 +; CHECK: mulq %rsi +; CHECK-NEXT: cmovoq %rdi, %rsi %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -519,9 +532,9 @@ entry: ; define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) { entry: -; CHECK-LABEL: saddo.br.i32 -; CHECK: addl %esi, %edi -; CHECK-NEXT: jo +; CHECK-LABEL: saddo.br.i32 +; CHECK: addl %esi, %edi +; CHECK-NEXT: jo %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} 
%t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -536,9 +549,9 @@ continue: define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: saddo.br.i64 -; CHECK: addq %rsi, %rdi -; CHECK-NEXT: jo +; CHECK-LABEL: saddo.br.i64 +; CHECK: addq %rsi, %rdi +; CHECK-NEXT: jo %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -553,9 +566,9 @@ continue: define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) { entry: -; CHECK-LABEL: uaddo.br.i32 -; CHECK: addl %esi, %edi -; CHECK-NEXT: jb +; CHECK-LABEL: uaddo.br.i32 +; CHECK: addl %esi, %edi +; CHECK-NEXT: jb %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -570,9 +583,9 @@ continue: define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: uaddo.br.i64 -; CHECK: addq %rsi, %rdi -; CHECK-NEXT: jb +; CHECK-LABEL: uaddo.br.i64 +; CHECK: addq %rsi, %rdi +; CHECK-NEXT: jb %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -587,9 +600,9 @@ continue: define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) { entry: -; CHECK-LABEL: ssubo.br.i32 -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jo +; CHECK-LABEL: ssubo.br.i32 +; CHECK: cmpl %esi, %edi +; CHECK-NEXT: jo %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -604,9 +617,9 @@ continue: define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: ssubo.br.i64 -; CHECK: cmpq %rsi, %rdi -; CHECK-NEXT: jo +; CHECK-LABEL: ssubo.br.i64 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: jo %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -621,9 +634,9 @@ continue: define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) { 
entry: -; CHECK-LABEL: usubo.br.i32 -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jb +; CHECK-LABEL: usubo.br.i32 +; CHECK: cmpl %esi, %edi +; CHECK-NEXT: jb %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -638,9 +651,9 @@ continue: define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: usubo.br.i64 -; CHECK: cmpq %rsi, %rdi -; CHECK-NEXT: jb +; CHECK-LABEL: usubo.br.i64 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: jb %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -655,9 +668,9 @@ continue: define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) { entry: -; CHECK-LABEL: smulo.br.i32 -; CHECK: imull %esi, %edi -; CHECK-NEXT: jo +; CHECK-LABEL: smulo.br.i32 +; CHECK: imull %esi, %edi +; CHECK-NEXT: jo %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -672,9 +685,9 @@ continue: define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: smulo.br.i64 -; CHECK: imulq %rsi, %rdi -; CHECK-NEXT: jo +; CHECK-LABEL: smulo.br.i64 +; CHECK: imulq %rsi, %rdi +; CHECK-NEXT: jo %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -689,9 +702,9 @@ continue: define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) { entry: -; CHECK-LABEL: umulo.br.i32 -; CHECK: mull %esi -; CHECK-NEXT: jo +; CHECK-LABEL: umulo.br.i32 +; CHECK: mull %esi +; CHECK-NEXT: jo %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -706,9 +719,9 @@ continue: define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { entry: -; CHECK-LABEL: umulo.br.i64 -; CHECK: mulq %rsi -; CHECK-NEXT: jo +; CHECK-LABEL: umulo.br.i64 +; CHECK: mulq %rsi +; 
CHECK-NEXT: jo %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -725,6 +738,8 @@ declare {i8, i1} @llvm.sadd.with.overflow.i8 (i8, i8 ) nounwind readnone declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone +declare {i8, i1} @llvm.uadd.with.overflow.i8 (i8, i8 ) nounwind readnone +declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) nounwind readnone declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone |