author    Tom Stellard <thomas.stellard@amd.com>  2013-08-14 23:24:32 +0000
committer Tom Stellard <thomas.stellard@amd.com>  2013-08-14 23:24:32 +0000
commit    38d5e1c36d954f1ff6489f58efd1d4865217cf9b (patch)
tree      451454dd8bf6ea5ec2f3ea021da2c7f6de4a928a /lib/Target/R600/AMDGPUISelDAGToDAG.cpp
parent    636298ba64fd07d4ddcae6005e7fc1db43eb5335 (diff)
R600/SI: Lower BUILD_VECTOR to REG_SEQUENCE v2
Using REG_SEQUENCE for BUILD_VECTOR rather than a series of INSERT_SUBREG
instructions should make it easier for the register allocator to coalesce
unnecessary copies.

v2:
  - Use an SGPR register class if all the operands of BUILD_VECTOR are SGPRs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188427 91177308-0d34-0410-b5e6-96231b3b80d8
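For context on the node shape involved: a REG_SEQUENCE node takes a target
register-class ID followed by one (value, subregister-index) pair per element,
so the whole vector is assembled by a single node instead of a chain of
IMPLICIT_DEF + INSERT_SUBREG steps. Below is a minimal standalone sketch of
that operand layout; the integer IDs are hypothetical stand-ins for the
SDValues and the AMDGPU::SReg_128RegClassID / AMDGPU::sub0..sub3 constants, so
the snippet compiles outside the LLVM tree.

    #include <cstdio>
    #include <vector>

    int main() {
      // Hypothetical stand-ins for the four BUILD_VECTOR operands
      // (SDValues in the real backend).
      const int Elems[] = {100, 101, 102, 103};
      const int NumElts = 4;

      // Hypothetical stand-ins for AMDGPU::SReg_128RegClassID and
      // AMDGPU::sub0..sub3.
      const int RegClassID = 1;
      const int SubRegs[] = {10, 11, 12, 13};

      // REG_SEQUENCE operand list: register class first, then one
      // (value, subregister-index) pair per vector element.
      std::vector<int> RegSeqArgs;
      RegSeqArgs.push_back(RegClassID);
      for (int i = 0; i < NumElts; ++i) {
        RegSeqArgs.push_back(Elems[i]);   // element value...
        RegSeqArgs.push_back(SubRegs[i]); // ...placed into this subregister
      }

      // 2 * NumElts + 1 operands, matching the array sizing in the patch.
      printf("%zu operands\n", RegSeqArgs.size());
      return 0;
    }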
Diffstat (limited to 'lib/Target/R600/AMDGPUISelDAGToDAG.cpp')
-rw-r--r--  lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 88
1 file changed, 69 insertions(+), 19 deletions(-)
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index d339b09..22bdb90 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -285,35 +285,85 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
break;
}
case ISD::BUILD_VECTOR: {
+ unsigned RegClassID;
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
- if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
- break;
+ const AMDGPURegisterInfo *TRI =
+ static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
+ const SIRegisterInfo *SIRI =
+ static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+ EVT VT = N->getValueType(0);
+ unsigned NumVectorElts = VT.getVectorNumElements();
+ assert(VT.getVectorElementType().bitsEq(MVT::i32));
+ if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ bool UseVReg = true;
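+      // Inspect each use of this BUILD_VECTOR: if any machine-node user
+      // expects its operand in an SGPR register class, build the vector in
+      // SGPRs so the result can feed those users without a cross-bank copy.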
+ for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+ U != E; ++U) {
+ if (!U->isMachineOpcode()) {
+ continue;
+ }
+ const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+ if (!RC) {
+ continue;
+ }
+ if (SIRI->isSGPRClass(RC)) {
+ UseVReg = false;
+ }
+ }
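+      // Pick the SGPR or VGPR register class wide enough for NumVectorElts
+      // 32-bit elements (the element type is asserted to be i32 above).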
+ switch(NumVectorElts) {
+ case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
+ AMDGPU::SReg_32RegClassID;
+ break;
+ case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
+ AMDGPU::SReg_64RegClassID;
+ break;
+ case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
+ AMDGPU::SReg_128RegClassID;
+ break;
+ case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
+ AMDGPU::SReg_256RegClassID;
+ break;
+ case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
+ AMDGPU::SReg_512RegClassID;
+ break;
+ }
+ } else {
+      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+      // sequence, which adds a 128-bit register copy when going through the
+      // TwoAddressInstruction pass. We want to avoid 128-bit copies as much
+      // as possible because they can't be bundled by our scheduler.
+ switch(NumVectorElts) {
+ case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+ case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
+ default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+ }
}
- unsigned RegClassID;
- switch(N->getValueType(0).getVectorNumElements()) {
- case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
- case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
- default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+ SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);
+
+ if (NumVectorElts == 1) {
+ return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
+ VT.getVectorElementType(),
+ N->getOperand(0), RegClass);
}
- // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
- // that adds a 128 bits reg copy when going through TwoAddressInstructions
- // pass. We want to avoid 128 bits copies as much as possible because they
- // can't be bundled by our scheduler.
- SDValue RegSeqArgs[9] = {
- CurDAG->getTargetConstant(RegClassID, MVT::i32),
- SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
- SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
- SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
- SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
- };
+
+ assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+ "supported yet");
+ // 16 = Max Num Vector Elements
+ // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+ // 1 = Vector Register Class
+ SDValue RegSeqArgs[16 * 2 + 1];
+
+ RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
bool IsRegSeq = true;
for (unsigned i = 0; i < N->getNumOperands(); i++) {
+ // XXX: Why is this here?
if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
IsRegSeq = false;
break;
}
- RegSeqArgs[2 * i + 1] = N->getOperand(i);
+ RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+ RegSeqArgs[1 + (2 * i) + 1] =
+ CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
}
if (!IsRegSeq)
break;
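The hunk shown here cuts off after the IsRegSeq check. Once RegSeqArgs is
fully populated, the case plausibly concludes by emitting the REG_SEQUENCE
machine node along the following lines; this is a sketch under that
assumption, not text from the diff above, and the exact SelectNodeTo overload
and operand-count argument depend on this LLVM revision.

    // Sketch: create the REG_SEQUENCE machine node from the operand array.
    // One register-class ID plus a (value, subreg-index) pair per element
    // gives 2 * N->getNumOperands() + 1 operands in total.
    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
                                RegSeqArgs, 2 * N->getNumOperands() + 1);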