author     Bob Wilson <bob.wilson@apple.com>    2010-11-30 00:00:35 +0000
committer  Bob Wilson <bob.wilson@apple.com>    2010-11-30 00:00:35 +0000
commit     6c4c982f83eea655e0f14610d2689fad722aeb7d (patch)
tree       40faf3d38d5f3656df7e4b2439db5389c8a622b9
parent     e76473d9ba1222cb38958d5b05204417e8c2f469 (diff)
Add support for NEON VLD4-dup instructions.
The encoding for alignment in VLD4-dup instructions is still a work in progress.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120356 91177308-0d34-0410-b5e6-96231b3b80d8
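For context (not part of this commit): the instruction form enabled here is the NEON "load single 4-element structure to all lanes" load, e.g. vld4.32 {d16[], d17[], d18[], d19[]}, [r0, :128]. Below is a minimal C sketch of the kind of source that this patch lets the backend select to that single all-lanes instruction, assuming the standard arm_neon.h vld4_dup_* intrinsics; the function name is made up for illustration.

    #include <arm_neon.h>

    /* Hypothetical example: load one 4-element u32 structure from p,
     * broadcasting each element into both lanes of a D register, then
     * combine the four vectors.  With this patch the vld4_dup_u32 load
     * can be matched to a single "vld4.32 {dN[], ...}" instruction
     * instead of a lane load plus per-register duplicates. */
    uint32x2_t sum_vld4_dup(const uint32_t *p) {
      uint32x2x4_t v = vld4_dup_u32(p);              /* p[0..3], each duplicated */
      uint32x2_t s01 = vadd_u32(v.val[0], v.val[1]); /* p[0]+p[1] in both lanes  */
      uint32x2_t s23 = vadd_u32(v.val[2], v.val[3]); /* p[2]+p[3] in both lanes  */
      return vadd_u32(s01, s23);
    }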
-rw-r--r--   lib/Target/ARM/ARMExpandPseudoInsts.cpp | 13
-rw-r--r--   lib/Target/ARM/ARMISelDAGToDAG.cpp      |  6
-rw-r--r--   lib/Target/ARM/ARMInstrNEON.td          | 54
-rw-r--r--   lib/Target/ARM/ARMSchedule.td           |  2
-rw-r--r--   lib/Target/ARM/ARMScheduleA8.td         | 12
-rw-r--r--   lib/Target/ARM/ARMScheduleA9.td         | 18
-rw-r--r--   test/CodeGen/ARM/vlddup.ll              | 23
7 files changed, 127 insertions(+), 1 deletion(-)
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index a78ed26..1cc5bd6 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -204,6 +204,13 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
 { ARM::VLD3q8Pseudo_UPD,    ARM::VLD3q8_UPD, true, true,  EvenDblSpc, 3, 8 },
 { ARM::VLD3q8oddPseudo_UPD, ARM::VLD3q8_UPD, true, true,  OddDblSpc,  3, 8 },
 
+{ ARM::VLD4DUPd16Pseudo,     ARM::VLD4DUPd16,     true, false, SingleSpc, 4, 4},
+{ ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true,  SingleSpc, 4, 4},
+{ ARM::VLD4DUPd32Pseudo,     ARM::VLD4DUPd32,     true, false, SingleSpc, 4, 2},
+{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true,  SingleSpc, 4, 2},
+{ ARM::VLD4DUPd8Pseudo,      ARM::VLD4DUPd8,      true, false, SingleSpc, 4, 8},
+{ ARM::VLD4DUPd8Pseudo_UPD,  ARM::VLD4DUPd8_UPD,  true, true,  SingleSpc, 4, 8},
+
 { ARM::VLD4LNd16Pseudo,     ARM::VLD4LNd16,     true, false, SingleSpc, 4, 4 },
 { ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true,  SingleSpc, 4, 4 },
 { ARM::VLD4LNd32Pseudo,     ARM::VLD4LNd32,     true, false, SingleSpc, 4, 2 },
@@ -959,6 +966,12 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
     case ARM::VLD3DUPd8Pseudo_UPD:
     case ARM::VLD3DUPd16Pseudo_UPD:
     case ARM::VLD3DUPd32Pseudo_UPD:
+    case ARM::VLD4DUPd8Pseudo:
+    case ARM::VLD4DUPd16Pseudo:
+    case ARM::VLD4DUPd32Pseudo:
+    case ARM::VLD4DUPd8Pseudo_UPD:
+    case ARM::VLD4DUPd16Pseudo_UPD:
+    case ARM::VLD4DUPd32Pseudo_UPD:
       ExpandVLD(MBBI);
       break;
 
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index a3b86cb..96bdf48 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2367,6 +2367,12 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     return SelectVLDDup(N, 3, Opcodes);
   }
 
+  case ARMISD::VLD4DUP: {
+    unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo,
+                           ARM::VLD4DUPd32Pseudo };
+    return SelectVLDDup(N, 4, Opcodes);
+  }
+
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN: {
     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index ebb9d5d..abe0ae1 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -939,7 +939,59 @@ def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
 def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
 
 // VLD4DUP : Vector Load (single 4-element structure to all lanes)
-// FIXME: Not yet implemented.
+class VLD4DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1111, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$Rn), IIC_VLD4dup,
+          "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+}
+
+def VLD4DUPd8  : VLD4DUP<{0,0,0,?}, "8">  { let Inst{4} = Rn{4}; }
+def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> {
+  let Inst{6} = Rn{5};
+  let Inst{4} = Rn{5};
+}
+
+def VLD4DUPd8Pseudo  : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD4DUPd8x2  : VLD4DUP<{0,0,1,?}, "8">  { let Inst{4} = Rn{4}; }
+def VLD4DUPd16x2 : VLD4DUP<{0,1,1,?}, "16"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd32x2 : VLD4DUP<{1,?,1,?}, "32"> {
+  let Inst{6} = Rn{5};
+  let Inst{4} = Rn{5};
+}
+
+// ...with address register writeback:
+class VLD4DUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1111, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4dupu,
+          "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []>;
+
+def VLD4DUPd8_UPD  : VLD4DUPWB<{0,0,0,0}, "8">  { let Inst{4} = Rn{4}; }
+def VLD4DUPd16_UPD : VLD4DUPWB<{0,1,0,?}, "16"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd32_UPD : VLD4DUPWB<{1,?,0,?}, "32"> {
+  let Inst{6} = Rn{5};
+  let Inst{4} = Rn{5};
+}
+
+def VLD4DUPd8x2_UPD  : VLD4DUPWB<{0,0,1,0}, "8">  { let Inst{4} = Rn{4}; }
+def VLD4DUPd16x2_UPD : VLD4DUPWB<{0,1,1,?}, "16"> { let Inst{4} = Rn{4}; }
+def VLD4DUPd32x2_UPD : VLD4DUPWB<{1,?,1,?}, "32"> {
+  let Inst{6} = Rn{5};
+  let Inst{4} = Rn{5};
+}
+
+def VLD4DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+
 } // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
 
 let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 6300043..61dd3be 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -158,6 +158,8 @@ def IIC_VLD4 : InstrItinClass;
 def IIC_VLD4ln : InstrItinClass;
 def IIC_VLD4u : InstrItinClass;
 def IIC_VLD4lnu : InstrItinClass;
+def IIC_VLD4dup : InstrItinClass;
+def IIC_VLD4dupu : InstrItinClass;
 def IIC_VST1 : InstrItinClass;
 def IIC_VST1x2 : InstrItinClass;
 def IIC_VST1x3 : InstrItinClass;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index 1e9ec07..33ba683 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -595,6 +595,18 @@ def CortexA8Itineraries : ProcessorItineraries<
                            InstrStage<5, [A8_LSPipe]>],
                           [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
   //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                              InstrStage<3, [A8_NLSPipe], 0>,
+                              InstrStage<3, [A8_LSPipe]>],
+                             [2, 2, 3, 3, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1, 1]>,
+  //
   // VST1
   InstrItinData<IIC_VST1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NLSPipe], 0>,
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index a253b98..e4ae75a 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -995,6 +995,24 @@ def CortexA9Itineraries : ProcessorItineraries<
                            InstrStage<5, [A9_LSUnit]>],
                           [5, 5, 6, 6, 2, 1, 1, 1, 1, 1, 2, 2]>,
   //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                              InstrStage<1, [A9_MUX0], 0>,
+                              InstrStage<1, [A9_DRegsN], 0, Required>,
+                              InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                              InstrStage<3, [A9_NPipe], 0>,
+                              InstrStage<3, [A9_LSUnit]>],
+                             [3, 3, 4, 4, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN], 0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 2, 1, 1]>,
+  //
   // VST1
   InstrItinData<IIC_VST1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
diff --git a/test/CodeGen/ARM/vlddup.ll b/test/CodeGen/ARM/vlddup.ll
index 8d78dfb..5822d3c 100644
--- a/test/CodeGen/ARM/vlddup.ll
+++ b/test/CodeGen/ARM/vlddup.ll
@@ -91,3 +91,26 @@ define <4 x i16> @vld3dupi16(i16* %A) nounwind {
 }
 
 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+
+%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+
+define <2 x i32> @vld4dupi32(i32* %A) nounwind {
+;CHECK: vld4dupi32:
+;Check the alignment value. Max for this instruction is 128 bits:
+;CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r0, :128]
+  %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 32)
+  %tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0
+  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1
+  %tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 2
+  %tmp6 = shufflevector <2 x i32> %tmp5, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmp7 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 3
+  %tmp8 = shufflevector <2 x i32> %tmp7, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmp9 = add <2 x i32> %tmp2, %tmp4
+  %tmp10 = add <2 x i32> %tmp6, %tmp8
+  %tmp11 = add <2 x i32> %tmp9, %tmp10
+  ret <2 x i32> %tmp11
+}
+
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly