author     Bill Wendling <isanbard@gmail.com>       2011-03-14 23:02:38 +0000
committer  Bill Wendling <isanbard@gmail.com>       2011-03-14 23:02:38 +0000
commit     69a05a7b9205fd4628ed614d1845f3879f6be949 (patch)
tree       b12b2a678dd4f13c0c4fac08ccb353a4cea999b7
parent     b121bfcc22660b1bdfb1183b191b6516988bcaf5 (diff)
Generate a VTBL instruction instead of a series of loads and stores when we
can. As Nate pointed out, VTBL isn't super performant, but it *has* to be better
than this:
_shuf:
@ BB#0: @ %entry
push {r4, r7, lr}
add r7, sp, #4
sub sp, #12
mov r4, sp
bic r4, r4, #7
mov sp, r4
mov r2, sp
vmov d16, r0, r1
orr r0, r2, #6
orr r3, r2, #7
vst1.8 {d16[0]}, [r3]
vst1.8 {d16[5]}, [r0]
subs r4, r7, #4
orr r0, r2, #5
vst1.8 {d16[4]}, [r0]
orr r0, r2, #4
vst1.8 {d16[4]}, [r0]
orr r0, r2, #3
vst1.8 {d16[0]}, [r0]
orr r0, r2, #2
vst1.8 {d16[2]}, [r0]
orr r0, r2, #1
vst1.8 {d16[1]}, [r0]
vst1.8 {d16[3]}, [r2]
vldr.64 d16, [sp]
vmov r0, r1, d16
mov sp, r4
pop {r4, r7, pc}
The "illegal" testcase in vext.ll is no longer illegal.
<rdar://problem/9078775>
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@127630 91177308-0d34-0410-b5e6-96231b3b80d8
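For reference, a NEON VTBL performs a byte-wise table lookup: each result byte is selected from a table of source bytes by the corresponding mask byte, and out-of-range indices produce zero. The scalar sketch below is illustrative only (the helper name and types are assumptions, not part of this patch); it models the single-table form, which is what the new lowering uses when the second shuffle input is undef:

    #include <array>
    #include <cstdint>

    // Scalar model of a one-table VTBL: result[i] = table[mask[i]] when the
    // index is in range (0-7 for an 8-byte table), otherwise 0.
    // Hypothetical reference helper, not code from this commit.
    std::array<uint8_t, 8> vtbl1_reference(const std::array<uint8_t, 8> &table,
                                           const std::array<uint8_t, 8> &mask) {
      std::array<uint8_t, 8> result{};
      for (int i = 0; i < 8; ++i)
        result[i] = (mask[i] < 8) ? table[mask[i]] : 0;
      return result;
    }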
-rw-r--r--  lib/Target/ARM/ARMISelDAGToDAG.cpp | 29
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 35
-rw-r--r--  lib/Target/ARM/ARMISelLowering.h   |  4
-rw-r--r--  test/CodeGen/ARM/vext.ll           | 12
4 files changed, 67 insertions, 13 deletions
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 8ef44d1..c7d847a 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2842,6 +2842,35 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     break;
   }
 
+  case ARMISD::VTBL1: {
+    DebugLoc dl = N->getDebugLoc();
+    EVT VT = N->getValueType(0);
+    SmallVector<SDValue, 6> Ops;
+
+    Ops.push_back(N->getOperand(0));
+    Ops.push_back(N->getOperand(1));
+    Ops.push_back(getAL(CurDAG));                    // Predicate
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
+    return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops.data(), Ops.size());
+  }
+  case ARMISD::VTBL2: {
+    DebugLoc dl = N->getDebugLoc();
+    EVT VT = N->getValueType(0);
+
+    // Form a REG_SEQUENCE to force register allocation.
+    SDValue V0 = N->getOperand(0);
+    SDValue V1 = N->getOperand(1);
+    SDValue RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0);
+
+    SmallVector<SDValue, 6> Ops;
+    Ops.push_back(RegSeq);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(getAL(CurDAG));                    // Predicate
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
+    return CurDAG->getMachineNode(ARM::VTBL2Pseudo, dl, VT,
+                                  Ops.data(), Ops.size());
+  }
+
   case ISD::CONCAT_VECTORS:
     return SelectConcatVector(N);
   }
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 3a34051..665c823 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -852,6 +852,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VZIP:          return "ARMISD::VZIP";
   case ARMISD::VUZP:          return "ARMISD::VUZP";
   case ARMISD::VTRN:          return "ARMISD::VTRN";
+  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
+  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
+  case ARMISD::VTBL3:         return "ARMISD::VTBL3";
+  case ARMISD::VTBL4:         return "ARMISD::VTBL4";
   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
@@ -4055,6 +4059,29 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
   }
 }
 
+static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
+                                       SmallVectorImpl<int> &ShuffleMask,
+                                       SelectionDAG &DAG) {
+  // Check to see if we can use the VTBL instruction.
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  DebugLoc DL = Op.getDebugLoc();
+
+  SmallVector<SDValue, 8> VTBLMask;
+  for (SmallVectorImpl<int>::iterator
+         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
+    VTBLMask.push_back(DAG.getConstant(*I, MVT::i32));
+
+  if (V2.getNode()->getOpcode() == ISD::UNDEF)
+    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
+                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
+                                   &VTBLMask[0], 8));
+  else
+    return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
+                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
+                                   &VTBLMask[0], 8));
+}
+
 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
@@ -4172,6 +4199,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
+  if (VT == MVT::v8i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
   return SDValue();
 }
 
@@ -4534,7 +4567,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::GlobalAddress:
     return Subtarget->isTargetDarwin() ?
       LowerGlobalAddressDarwin(Op, DAG) : LowerGlobalAddressELF(Op, DAG);
-  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
   case ISD::BR_CC: return LowerBR_CC(Op, DAG);
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 0f56201..8eb4525 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -153,6 +153,10 @@ namespace llvm {
       VZIP,         // zip (interleave)
       VUZP,         // unzip (deinterleave)
       VTRN,         // transpose
+      VTBL1,        // 1-register shuffle with mask
+      VTBL2,        // 2-register shuffle with mask
+      VTBL3,        // 3-register shuffle with mask
+      VTBL4,        // 4-register shuffle with mask
 
       // Vector multiply long:
       VMULLs,       // ...signed
diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll
index 55abefe..c8d9045 100644
--- a/test/CodeGen/ARM/vext.ll
+++ b/test/CodeGen/ARM/vext.ll
@@ -121,15 +121,3 @@ define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
   %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   ret <4 x i16> %tmp2
 }
-
-; The actual shuffle code only handles some cases, make sure we check
-; this rather than blindly emitting a VECTOR_SHUFFLE (infinite
-; lowering loop can result otherwise).
-define <8 x i8> @test_illegal(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK: test_illegal:
-;CHECK: vst1.8
-  %tmp1 = load <16 x i8>* %A
-  %tmp2 = load <16 x i8>* %B
-  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <8 x i32> <i32 0, i32 7, i32 5, i32 25, i32 3, i32 2, i32 2, i32 26>
-  ret <8 x i8> %tmp3
-}
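A follow-up note on the two-input case in the diff above: when both shuffle operands are live, LowerVECTOR_SHUFFLEv8i8 emits ARMISD::VTBL2, and the selector pairs the two d-registers with a REG_SEQUENCE so they behave as a single 16-byte table. A minimal scalar sketch of that two-table lookup, again with hypothetical names that are not taken from the patch:

    #include <array>
    #include <cstdint>

    // Scalar model of a two-table VTBL: the two 8-byte sources form one
    // 16-byte table; indices 0-15 select from it, larger indices yield 0.
    // Hypothetical reference helper, not code from this commit.
    std::array<uint8_t, 8> vtbl2_reference(const std::array<uint8_t, 8> &lo,
                                           const std::array<uint8_t, 8> &hi,
                                           const std::array<uint8_t, 8> &mask) {
      std::array<uint8_t, 8> result{};
      for (int i = 0; i < 8; ++i) {
        uint8_t idx = mask[i];
        if (idx < 8)
          result[i] = lo[idx];       // byte from the first source register
        else if (idx < 16)
          result[i] = hi[idx - 8];   // byte from the second source register
        // indices >= 16 leave the result byte at 0
      }
      return result;
    }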