author    Tom Stellard <thomas.stellard@amd.com>    2013-08-01 15:23:42 +0000
committer Tom Stellard <thomas.stellard@amd.com>    2013-08-01 15:23:42 +0000
commit    692ee102ebef535d311c35d53457028083e5c5be (patch)
tree      5966632bb87e4120a27dadfce4187535429a4275 /lib/Target/R600
parent    98b357e1cd0d41108e6011725dad6a6dbf208a38 (diff)
R600: Add 64-bit float load/store support
* Added R600_Reg64 class
* Added T#Index#.XY registers definition
* Added v2i32 register reads from parameter and global space
* Added f32 and i32 elements extraction from v2f32 and v2i32
* Added v2i32 -> v2f32 conversions

Tom Stellard:
  - Mark vec2 operations as expand. The addition of a vec2 register
    class made them all legal.

Patch by: Dmitry Cherkassov

Signed-off-by: Dmitry Cherkassov <dcherkassov@gmail.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187582 91177308-0d34-0410-b5e6-96231b3b80d8
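A note on the last bullet: the "v2i32 -> v2f32 conversions" appear to be the BitConvert patterns added in R600Instructions.td below, i.e. bit-preserving reinterpretations of the 64-bit register pair, not numeric int-to-float conversions. A minimal standalone sketch in plain C++ (illustrative, not LLVM code):

    #include <cstdint>
    #include <cstring>

    struct V2i32 { uint32_t x, y; };
    struct V2f32 { float x, y; };

    // Reinterpret the same 64 bits under the other element type; this is
    // what a v2i32 <-> v2f32 bitcast through R600_Reg64 amounts to.
    V2f32 bitconvert(V2i32 V) {
      V2f32 R;
      std::memcpy(&R, &V, sizeof R);
      return R;
    }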
Diffstat (limited to 'lib/Target/R600')
-rw-r--r--  lib/Target/R600/AMDGPUCallingConv.td          | 10
-rw-r--r--  lib/Target/R600/AMDGPUISelDAGToDAG.cpp        |  9
-rw-r--r--  lib/Target/R600/AMDGPUISelLowering.cpp        |  3
-rw-r--r--  lib/Target/R600/R600ControlFlowFinalizer.cpp  |  4
-rw-r--r--  lib/Target/R600/R600ISelLowering.cpp          | 21
-rw-r--r--  lib/Target/R600/R600InstrInfo.cpp             | 19
-rw-r--r--  lib/Target/R600/R600Instructions.td           | 80
-rw-r--r--  lib/Target/R600/R600RegisterInfo.td           | 16
8 files changed, 139 insertions(+), 23 deletions(-)
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index 3865c62..fc95d58 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -38,11 +38,11 @@ def CC_SI : CallingConv<[
// Calling convention for compute kernels
def CC_AMDGPU_Kernel : CallingConv<[
- CCIfType<[v4i32, v4f32], CCAssignToStack <16, 16>>,
- CCIfType<[i64, f64], CCAssignToStack < 8, 8>>,
- CCIfType<[i32, f32], CCAssignToStack < 4, 4>>,
- CCIfType<[i16], CCAssignToStack < 2, 4>>,
- CCIfType<[i8], CCAssignToStack < 1, 4>>
+ CCIfType<[v4i32, v4f32], CCAssignToStack <16, 16>>,
+ CCIfType<[i64, f64, v2f32, v2i32], CCAssignToStack < 8, 8>>,
+ CCIfType<[i32, f32], CCAssignToStack < 4, 4>>,
+ CCIfType<[i16], CCAssignToStack < 2, 4>>,
+ CCIfType<[i8], CCAssignToStack < 1, 4>>
]>;
def CC_AMDGPU : CallingConv<[
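Note: CCAssignToStack<Size, Align> gives each matching argument a Size-byte slot at the next Align-aligned offset, so v2i32/v2f32 kernel arguments now share the 8-byte/8-byte row with i64/f64. A standalone sketch of that layout rule (illustrative, not the LLVM implementation):

    // Return the slot offset for an argument of the given size/alignment
    // and advance Offset past it, mirroring CCAssignToStack semantics.
    unsigned assignToStack(unsigned &Offset, unsigned Size, unsigned Align) {
      Offset = (Offset + Align - 1) & ~(Align - 1);
      unsigned Slot = Offset;
      Offset += Size;
      return Slot;
    }
    // e.g. an i32 followed by a v2i32: the i32 lands at offset 0, the
    // v2i32 at offset 8 (not 4) because of its 8-byte alignment.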
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index 307b804..38a5f24 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -260,12 +260,19 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
break;
}
+
+ unsigned RegClassID;
+ switch(N->getValueType(0).getVectorNumElements()) {
+ case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+ case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
+ default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+ }
// BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
// that adds a 128 bits reg copy when going through TwoAddressInstructions
// pass. We want to avoid 128 bits copies as much as possible because they
// can't be bundled by our scheduler.
SDValue RegSeqArgs[9] = {
- CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
+ CurDAG->getTargetConstant(RegClassID, MVT::i32),
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
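Note: RegSeqArgs is the operand list of a REG_SEQUENCE node -- the register-class ID followed by (value, sub-register index) pairs. With the new R600_Reg64 class a v2 BUILD_VECTOR uses only the first five slots of the array. A standalone sketch of the operand count (illustrative):

    // REG_SEQUENCE operands for an N-element BUILD_VECTOR: one class-ID
    // slot plus a (value, subreg-index) pair per element.
    unsigned regSeqNumOperands(unsigned NumElements) {
      return 2 * NumElements + 1;   // 5 for v2, 9 for v4
    }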
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 1694387..5db36b0 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -79,6 +79,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Expand);
+
setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
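Note: marking EXTRACT_SUBVECTOR as Expand for the new v2 types asks the legalizer to rewrite sub-vector extraction in terms of already-legal operations, roughly per-element extracts plus a rebuild. A rough standalone model of that assumed expansion:

    // Pull a two-element slice out of a vector one element at a time,
    // instead of with a dedicated sub-vector instruction.
    void extractSubvector2(const int *Src, unsigned FromIdx, int Dst[2]) {
      for (unsigned I = 0; I < 2; ++I)
        Dst[I] = Src[FromIdx + I];   // one element extract per channel
    }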
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index cc45891..715be37 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -378,8 +378,10 @@ public:
case AMDGPU::R600_ExportBuf:
case AMDGPU::R600_ExportSwz:
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+ case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
- case AMDGPU::RAT_STORE_DWORD_cm:
+ case AMDGPU::RAT_STORE_DWORD32_cm:
+ case AMDGPU::RAT_STORE_DWORD64_cm:
DEBUG(dbgs() << CfCount << ":"; MI->dump(););
CfCount++;
break;
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 303c0e1..ce6ac89 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -33,17 +33,25 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
+ addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
+ addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+
computeRegisterProperties();
setOperationAction(ISD::FADD, MVT::v4f32, Expand);
+ setOperationAction(ISD::FADD, MVT::v2f32, Expand);
setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
+ setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
+ setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
+ setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
setOperationAction(ISD::FCOS, MVT::f32, Custom);
setOperationAction(ISD::FSIN, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+ setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
@@ -66,7 +74,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, MVT::i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v2i32, Expand);
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
@@ -74,7 +82,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
setOperationAction(ISD::STORE, MVT::i8, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
- setOperationAction(ISD::STORE, MVT::v2i32, Expand);
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::i32, Custom);
@@ -170,6 +178,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
}
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+ case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
@@ -1129,7 +1138,13 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
}
- Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
+ EVT NewVT = MVT::v4i32;
+ unsigned NumElements = 4;
+ if (VT.isVector()) {
+ NewVT = VT;
+ NumElements = VT.getVectorNumElements();
+ }
+ Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
} else {
// non constant ptr cant be folded, keeps it as a v4f32 load
Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
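Note: this LowerLOAD change generalizes the constant-buffer path so the BUILD_VECTOR rebuilt from CONST_ADDRESS slots matches the loaded type instead of always being v4i32. A standalone sketch mirroring the logic above:

    // Slots used when rebuilding a constant-space load; each slot is one
    // dword at byte offset 4*i + ConstantBlock*16.
    unsigned slotsToBuild(bool IsVectorType, unsigned VectorNumElements) {
      return IsVectorType ? VectorNumElements : 4;
    }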
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 2e9b732..4e7eff9 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -51,9 +51,17 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
- if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
- && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
- for (unsigned I = 0; I < 4; I++) {
+ unsigned VectorComponents = 0;
+ if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
+ AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
+ VectorComponents = 4;
+ } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
+ AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
+ VectorComponents = 2;
+ }
+
+ if (VectorComponents > 0) {
+ for (unsigned I = 0; I < VectorComponents; I++) {
unsigned SubRegIndex = RI.getSubRegFromChannel(I);
buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
RI.getSubReg(DestReg, SubRegIndex),
@@ -62,11 +70,6 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
RegState::Define | RegState::Implicit);
}
} else {
-
- // We can't copy vec4 registers
- assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
- && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
-
MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
DestReg, SrcReg);
NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0))
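Note: copyPhysReg now expands both 64-bit (XY) and 128-bit (XYZW) copies into one 32-bit MOV per channel, replacing the old vec4-only path and its assert. A standalone sketch of the expansion (illustrative register names):

    #include <cstdio>

    // Emit one per-channel MOV for a vector register copy, e.g.
    // expandCopy("T0", "T1", 2) prints MOV T0.X, T1.X / MOV T0.Y, T1.Y.
    void expandCopy(const char *Dst, const char *Src, unsigned NumChannels) {
      const char Chan[4] = {'X', 'Y', 'Z', 'W'};
      for (unsigned I = 0; I < NumChannels; ++I)
        std::printf("MOV %s.%c, %s.%c\n", Dst, Chan[I], Src, Chan[I]);
    }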
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 178e081..7e61b18 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1290,6 +1290,13 @@ def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
[(global_store i32:$rw_gpr, i32:$index_gpr)]
>;
+// 64-bit store
+def RAT_WRITE_CACHELESS_64_eg : RAT_WRITE_CACHELESS_eg <
+ (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+ 0x3, "RAT_WRITE_CACHELESS_64_eg $rw_gpr.XY, $index_gpr, $eop",
+ [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
+>;
+
//128-bit store
def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
@@ -1358,6 +1365,18 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
let Constraints = "$src_gpr.ptr = $dst_gpr";
}
+class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id,
+ (outs R600_Reg64:$dst_gpr), pattern> {
+
+ let MEGA_FETCH_COUNT = 8;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 7;
+ let DST_SEL_W = 7;
+ let DATA_FORMAT = 0x1D; // COLOR_32_32
+}
+
class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
: VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
(outs R600_Reg128:$dst_gpr), pattern> {
@@ -1391,6 +1410,10 @@ def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
[(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
>;
+def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
+ [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
[(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
>;
@@ -1413,6 +1436,11 @@ def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
[(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
>;
+// 64-bit reads
+def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
+ [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
// 128-bit reads
def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
[(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
@@ -1744,15 +1772,23 @@ def : Pat <
def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
-def RAT_STORE_DWORD_cm : EG_CF_RAT <
- 0x57, 0x14, 0x1, (outs),
- (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr),
- "EXPORT_RAT_INST_STORE_DWORD $rw_gpr, $index_gpr",
- [(global_store i32:$rw_gpr, i32:$index_gpr)]
+class RAT_STORE_DWORD_cm <bits<4> mask, dag ins, list<dag> pat> : EG_CF_RAT <
+ 0x57, 0x14, mask, (outs), ins,
+ "EXPORT_RAT_INST_STORE_DWORD $rw_gpr, $index_gpr", pat
> {
let eop = 0; // This bit is not used on Cayman.
}
+def RAT_STORE_DWORD32_cm : RAT_STORE_DWORD_cm <0x1,
+ (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr),
+ [(global_store i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+def RAT_STORE_DWORD64_cm : RAT_STORE_DWORD_cm <0x3,
+ (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr),
+ [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
+>;
+
class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
: VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> {
@@ -1815,6 +1851,17 @@ class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern>
let Constraints = "$src_gpr.ptr = $dst_gpr";
}
+class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id,
+ (outs R600_Reg64:$dst_gpr), pattern> {
+
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 7;
+ let DST_SEL_W = 7;
+ let DATA_FORMAT = 0x1D; // COLOR_32_32
+}
+
class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern>
: VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
(outs R600_Reg128:$dst_gpr), pattern> {
@@ -1846,6 +1893,10 @@ def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0,
[(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
>;
+def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0,
+ [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0,
[(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
>;
@@ -1868,6 +1919,11 @@ def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1,
[(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
>;
+// 64-bit reads
+def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1,
+ [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
// 128-bit reads
def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1,
[(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
@@ -2297,10 +2353,24 @@ def : Insert_Element <i32, v4i32, 3, sub3>;
def : Vector4_Build <v4f32, f32>;
def : Vector4_Build <v4i32, i32>;
+def : Extract_Element <f32, v2f32, 0, sub0>;
+def : Extract_Element <f32, v2f32, 1, sub1>;
+
+def : Insert_Element <f32, v2f32, 0, sub0>;
+def : Insert_Element <f32, v2f32, 1, sub1>;
+
+def : Extract_Element <i32, v2i32, 0, sub0>;
+def : Extract_Element <i32, v2i32, 1, sub1>;
+
+def : Insert_Element <i32, v2i32, 0, sub0>;
+def : Insert_Element <i32, v2i32, 1, sub1>;
+
// bitconvert patterns
def : BitConvert <i32, f32, R600_Reg32>;
def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v2f32, v2i32, R600_Reg64>;
+def : BitConvert <v2i32, v2f32, R600_Reg64>;
def : BitConvert <v4f32, v4i32, R600_Reg128>;
def : BitConvert <v4i32, v4f32, R600_Reg128>;
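Note: in the RAT definitions above, the 4-bit mask parameter (0x1, 0x3, 0xf) selects which channels of $rw_gpr are stored, and the VTX_READ DST_SEL_* fields pick destination channels (7 = unused). A standalone decode of the store mask, assuming bit i maps to channel i:

    #include <cstdio>

    // Print the channels a RAT write mask selects:
    // 0x1 -> X (32-bit), 0x3 -> XY (64-bit), 0xf -> XYZW (128-bit).
    void printStoreChannels(unsigned Mask) {
      const char Chan[4] = {'X', 'Y', 'Z', 'W'};
      for (unsigned I = 0; I < 4; ++I)
        if (Mask & (1u << I))
          std::putchar(Chan[I]);
      std::putchar('\n');
    }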
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 1eabccb..fa987cf 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -23,6 +23,14 @@ class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
let HWEncoding = encoding;
}
+class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
+ RegisterWithSubRegs<n, subregs> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = encoding;
+}
+
+
foreach Index = 0-127 in {
foreach Chan = [ "X", "Y", "Z", "W" ] in {
// 32-bit Temporary Registers
@@ -41,6 +49,11 @@ foreach Index = 0-127 in {
!cast<Register>("T"#Index#"_Z"),
!cast<Register>("T"#Index#"_W")],
Index>;
+
+ def T#Index#_XY : R600Reg_64 <"T"#Index#"",
+ [!cast<Register>("T"#Index#"_X"),
+ !cast<Register>("T"#Index#"_Y")],
+ Index>;
}
// KCACHE_BANK0
@@ -186,6 +199,9 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
let CopyCost = -1;
}
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+ (add (sequence "T%u_XY", 0, 63))>;
+
//===----------------------------------------------------------------------===//
// Register classes for indirect addressing
//===----------------------------------------------------------------------===//
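Note: the new T#Index#_XY definitions pair each temporary's existing X and Y channels into one 64-bit register (sub-registers sub0/sub1), and R600_Reg64 exposes T0_XY..T63_XY as the allocatable v2i32/v2f32 class. A standalone sketch of the aliasing:

    #include <cstdio>

    // Show which 32-bit channel registers back a 64-bit pair, e.g.
    // printPair(5) -> "T5_XY = { T5_X (sub0), T5_Y (sub1) }".
    void printPair(unsigned Index) {
      std::printf("T%u_XY = { T%u_X (sub0), T%u_Y (sub1) }\n",
                  Index, Index, Index);
    }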