diff options
-rw-r--r-- | lib/Target/X86/Utils/X86ShuffleDecode.cpp | 25 | ||||
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 67 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrFragmentsSIMD.td | 2 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 8 | ||||
-rw-r--r-- | test/CodeGen/X86/SIMD/dg.exp | 5 | ||||
-rw-r--r-- | test/CodeGen/X86/SIMD/notvunpcklpd.ll | 20 | ||||
-rw-r--r-- | test/CodeGen/X86/SIMD/notvunpcklps.ll | 20 | ||||
-rw-r--r-- | test/CodeGen/X86/SIMD/vunpcklpd.ll | 20 | ||||
-rw-r--r-- | test/CodeGen/X86/SIMD/vunpcklps.ll | 20 |
9 files changed, 163 insertions, 24 deletions
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index eeb83c1..cd06060 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -165,12 +165,25 @@ void DecodeUNPCKLPDMask(unsigned NElts, /// datatypes and vector widths. void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { - - unsigned NElts = VT.getVectorNumElements(); - - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i); // Reads from dest - ShuffleMask.push_back(i+NElts); // Reads from src + unsigned NumElts = VT.getVectorNumElements(); + + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. + unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElts / NumSections; + + unsigned Start = 0; + unsigned End = NumSectionElts / 2; + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = Start; i != End; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumSectionElts); // Reads from src/src2 + } + // Process the next 128 bits. 
+ Start += NumSectionElts; + End += NumSectionElts; } } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 09ec69d..722202d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -3173,7 +3173,8 @@ bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { unsigned NumElems = N->getValueType(0).getVectorNumElements(); - if (NumElems != 2 && NumElems != 4) + if ((NumElems != 2 && NumElems != 4) + || N->getValueType(0).getSizeInBits() > 128) return false; for (unsigned i = 0; i < NumElems/2; ++i) @@ -3195,19 +3196,36 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) return false; - for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (V2IsSplat) { - if (!isUndefOrEqual(BitI1, NumElts)) - return false; - } else { - if (!isUndefOrEqual(BitI1, j + NumElts)) + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. + unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElts / NumSections; + + unsigned Start = 0; + unsigned End = NumSectionElts; + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = Start, j = s * NumSectionElts; + i != End; + i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) return false; + if (V2IsSplat) { + if (!isUndefOrEqual(BitI1, NumElts)) + return false; + } else { + if (!isUndefOrEqual(BitI1, j + NumElts)) + return false; + } } + // Process the next 128 bits. 
+ Start += NumSectionElts; + End += NumSectionElts; } + return true; } @@ -3255,14 +3273,27 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) return false; - for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. + unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElems / NumSections; + + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = s * NumSectionElts, j = s * NumSectionElts; + i != NumSectionElts * (s + 1); + i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + + if (!isUndefOrEqual(BitI, j)) + return false; + if (!isUndefOrEqual(BitI1, j)) + return false; + } } + return true; } diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 5016c0f..3cbfac1 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -132,6 +132,8 @@ def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>; def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>; +def X86Unpcklpsy : SDNode<"X86ISD::VUNPCKLPSY", SDTShuff2Op>; +def X86Unpcklpdy : SDNode<"X86ISD::VUNPCKLPDY", SDTShuff2Op>; def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>; def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index b912949..45e9051 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -5622,11 +5622,15 @@ def : Pat<(X86Movddup (bc_v2f64 // 
Shuffle with UNPCKLPS def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), (VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), (UNPCKLPSrm VR128:$src1, addr:$src2)>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), (VUNPCKLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), (UNPCKLPSrr VR128:$src1, VR128:$src2)>; @@ -5644,11 +5648,15 @@ def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), // Shuffle with UNPCKLPD def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), (VUNPCKLPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), (UNPCKLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), (VUNPCKLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), (UNPCKLPDrr VR128:$src1, VR128:$src2)>; diff --git a/test/CodeGen/X86/SIMD/dg.exp b/test/CodeGen/X86/SIMD/dg.exp new file mode 100644 index 0000000..629a147 --- /dev/null +++ b/test/CodeGen/X86/SIMD/dg.exp @@ -0,0 +1,5 @@ +load_lib llvm.exp + +if { [llvm_supports_target X86] } { + RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]] +} diff --git a/test/CodeGen/X86/SIMD/notvunpcklpd.ll b/test/CodeGen/X86/SIMD/notvunpcklpd.ll new file mode 100644 index 
0000000..3afc2f2 --- /dev/null +++ b/test/CodeGen/X86/SIMD/notvunpcklpd.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -mattr=+avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +target triple = "x86_64-unknown-linux-gnu" + +define void @try_([2 x <4 x double>]* noalias %incarray, [2 x <4 x double>]* noalias %incarrayb ) { +entry: + %incarray1 = alloca [2 x <4 x double>]*, align 8 + %incarrayb1 = alloca [2 x <4 x double>]*, align 8 + %carray = alloca [2 x <4 x double>], align 16 + %r = getelementptr [2 x <4 x double>]* %incarray, i32 0, i32 0 + %rb = getelementptr [2 x <4 x double>]* %incarrayb, i32 0, i32 0 + %r3 = load <4 x double>* %r, align 8 + %r4 = load <4 x double>* %rb, align 8 + %r11 = shufflevector <4 x double> %r3, <4 x double> %r4, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x double>> [#uses=1] +; CHECK-NOT: vunpcklpd + %r12 = getelementptr [2 x <4 x double>]* %carray, i32 0, i32 1 + store <4 x double> %r11, <4 x double>* %r12, align 4 + ret void +} diff --git a/test/CodeGen/X86/SIMD/notvunpcklps.ll b/test/CodeGen/X86/SIMD/notvunpcklps.ll new file mode 100644 index 0000000..19daa3e --- /dev/null +++ b/test/CodeGen/X86/SIMD/notvunpcklps.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -mattr=+avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +target triple = "x86_64-unknown-linux-gnu" + +define void @try_([2 x <8 x float>]* noalias %incarray, [2 x <8 x float>]* noalias %incarrayb ) { +entry: + %incarray1 = alloca [2 x <8 x float>]*, align 8 + %incarrayb1 = alloca [2 x <8 x float>]*, align 8 + %carray = alloca [2 x <8 x float>], align 16 + %r = getelementptr [2 x <8 x float>]* %incarray, i32 0, i32 0 + %rb = getelementptr [2 x <8 x float>]* %incarrayb, i32 0, i32 0 + %r3 = load <8 x float>* %r, align 8 + %r4 = load <8 x float>* 
%rb, align 8 + %r8 = shufflevector <8 x float> %r3, <8 x float> %r4, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x float>> [#uses=1] +; CHECK-NOT: vunpcklps + %r9 = getelementptr [2 x <8 x float>]* %carray, i32 0, i32 0 + store <8 x float> %r8, <8 x float>* %r9, align 4 + ret void +} diff --git a/test/CodeGen/X86/SIMD/vunpcklpd.ll b/test/CodeGen/X86/SIMD/vunpcklpd.ll new file mode 100644 index 0000000..60d23a4 --- /dev/null +++ b/test/CodeGen/X86/SIMD/vunpcklpd.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -mattr=+avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +target triple = "x86_64-unknown-linux-gnu" + +define void @try_([2 x <4 x double>]* noalias %incarray, [2 x <4 x double>]* noalias %incarrayb ) { +entry: + %incarray1 = alloca [2 x <4 x double>]*, align 8 + %incarrayb1 = alloca [2 x <4 x double>]*, align 8 + %carray = alloca [2 x <4 x double>], align 16 + %r = getelementptr [2 x <4 x double>]* %incarray, i32 0, i32 0 + %rb = getelementptr [2 x <4 x double>]* %incarrayb, i32 0, i32 0 + %r3 = load <4 x double>* %r, align 8 + %r4 = load <4 x double>* %rb, align 8 + %r11 = shufflevector <4 x double> %r3, <4 x double> %r4, <4 x i32> < i32 0, i32 4, i32 2, i32 6 > ; <<4 x double>> [#uses=1] +; CHECK: vunpcklpd + %r12 = getelementptr [2 x <4 x double>]* %carray, i32 0, i32 1 + store <4 x double> %r11, <4 x double>* %r12, align 4 + ret void +} diff --git a/test/CodeGen/X86/SIMD/vunpcklps.ll b/test/CodeGen/X86/SIMD/vunpcklps.ll new file mode 100644 index 0000000..a87b299 --- /dev/null +++ b/test/CodeGen/X86/SIMD/vunpcklps.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -mattr=+avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +target triple = "x86_64-unknown-linux-gnu" + +define void @try_([2 x <8 x float>]* 
noalias %incarray, [2 x <8 x float>]* noalias %incarrayb ) { +entry: + %incarray1 = alloca [2 x <8 x float>]*, align 8 + %incarrayb1 = alloca [2 x <8 x float>]*, align 8 + %carray = alloca [2 x <8 x float>], align 16 + %r = getelementptr [2 x <8 x float>]* %incarray, i32 0, i32 0 + %rb = getelementptr [2 x <8 x float>]* %incarrayb, i32 0, i32 0 + %r3 = load <8 x float>* %r, align 8 + %r4 = load <8 x float>* %rb, align 8 + %r11 = shufflevector <8 x float> %r3, <8 x float> %r4, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13 > ; <<8 x float>> [#uses=1] +; CHECK: vunpcklps + %r12 = getelementptr [2 x <8 x float>]* %carray, i32 0, i32 1 + store <8 x float> %r11, <8 x float>* %r12, align 4 + ret void +} |