From 692ee102ebef535d311c35d53457028083e5c5be Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Thu, 1 Aug 2013 15:23:42 +0000
Subject: R600: Add 64-bit float load/store support

* Added R600_Reg64 class
* Added T#Index#.XY registers definition
* Added v2i32 register reads from parameter and global space
* Added f32 and i32 elements extraction from v2f32 and v2i32
* Added v2i32 -> v2f32 conversions

Tom Stellard:
  - Mark vec2 operations as expand.  The addition of a vec2 register
    class made them all legal.

Patch by: Dmitry Cherkassov

Signed-off-by: Dmitry Cherkassov

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187582 91177308-0d34-0410-b5e6-96231b3b80d8
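Background for the note above: once v2f32/v2i32 are registered with the new
R600_Reg64 class, every operation on those types is considered legal by
default, so each one the hardware cannot do directly (FADD, FMUL, FDIV, FSUB,
SETCC, FNEG, EXTRACT_SUBVECTOR) has to be marked Expand explicitly; only the
vec2 loads and stores are meant to stay legal or custom. To make the effect
concrete, here is the kind of lit test this change enables (a hypothetical
sketch modeled on the existing R600 CodeGen tests; the commit's own tests
fall outside this view, which cgit limits to lib/Target/R600, and the CHECK
lines and their order simply assume the printed mnemonics match the new
instruction names). A <2 x i32> global-to-global copy should now select one
64-bit vertex fetch and one 64-bit RAT store instead of a scalarized pair:

    ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
    ; CHECK: VTX_READ_64
    ; CHECK: RAT_WRITE_CACHELESS_64_eg
    define void @copy_v2i32(<2 x i32> addrspace(1)* %out,
                            <2 x i32> addrspace(1)* %in) {
    entry:
      %val = load <2 x i32> addrspace(1)* %in
      store <2 x i32> %val, <2 x i32> addrspace(1)* %out
      ret void
    }

A <2 x float> copy takes the same path through the new v2f32 <-> v2i32
BitConvert patterns, which is what makes the 64-bit float load/store of the
subject line work without any new floating-point instructions.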
---
 lib/Target/R600/AMDGPUCallingConv.td         | 10 ++--
 lib/Target/R600/AMDGPUISelDAGToDAG.cpp       |  9 +++-
 lib/Target/R600/AMDGPUISelLowering.cpp       |  3 ++
 lib/Target/R600/R600ControlFlowFinalizer.cpp |  4 +-
 lib/Target/R600/R600ISelLowering.cpp         | 21 ++++++--
 lib/Target/R600/R600InstrInfo.cpp            | 19 ++++---
 lib/Target/R600/R600Instructions.td          | 80 ++++++++++++++++++++++++++--
 lib/Target/R600/R600RegisterInfo.td          | 16 ++++++
 8 files changed, 139 insertions(+), 23 deletions(-)

(limited to 'lib/Target/R600')

diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index 3865c62..fc95d58 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -38,11 +38,11 @@ def CC_SI : CallingConv<[
 
 // Calling convention for compute kernels
 def CC_AMDGPU_Kernel : CallingConv<[
-  CCIfType<[v4i32, v4f32], CCAssignToStack <16, 16>>,
-  CCIfType<[i64, f64],     CCAssignToStack < 8, 8>>,
-  CCIfType<[i32, f32],     CCAssignToStack < 4, 4>>,
-  CCIfType<[i16],          CCAssignToStack < 2, 4>>,
-  CCIfType<[i8],           CCAssignToStack < 1, 4>>
+  CCIfType<[v4i32, v4f32],           CCAssignToStack <16, 16>>,
+  CCIfType<[i64, f64, v2f32, v2i32], CCAssignToStack < 8, 8>>,
+  CCIfType<[i32, f32],               CCAssignToStack < 4, 4>>,
+  CCIfType<[i16],                    CCAssignToStack < 2, 4>>,
+  CCIfType<[i8],                     CCAssignToStack < 1, 4>>
 ]>;
 
 def CC_AMDGPU : CallingConv<[

diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index 307b804..38a5f24 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -260,12 +260,19 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
       break;
     }
+
+    unsigned RegClassID;
+    switch(N->getValueType(0).getVectorNumElements()) {
+    case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+    case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
+    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+    }
     // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
     // that adds a 128 bits reg copy when going through TwoAddressInstructions
     // pass. We want to avoid 128 bits copies as much as possible because they
     // can't be bundled by our scheduler.
     SDValue RegSeqArgs[9] = {
-      CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
+      CurDAG->getTargetConstant(RegClassID, MVT::i32),
       SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
       SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
       SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 1694387..5db36b0 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -79,6 +79,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::LOAD, MVT::f64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
 
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Expand);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Expand);
+
   setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
   setOperationAction(ISD::FNEG, MVT::v4f32, Expand);

diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index cc45891..715be37 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -378,8 +378,10 @@ public:
       case AMDGPU::R600_ExportBuf:
       case AMDGPU::R600_ExportSwz:
       case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+      case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
       case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
-      case AMDGPU::RAT_STORE_DWORD_cm:
+      case AMDGPU::RAT_STORE_DWORD32_cm:
+      case AMDGPU::RAT_STORE_DWORD64_cm:
         DEBUG(dbgs() << CfCount << ":"; MI->dump(););
         CfCount++;
         break;

diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 303c0e1..ce6ac89 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -33,17 +33,25 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+
   computeRegisterProperties();
 
   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
+  setOperationAction(ISD::FADD, MVT::v2f32, Expand);
   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
+  setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
+  setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
+  setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
 
   setOperationAction(ISD::FCOS, MVT::f32, Custom);
   setOperationAction(ISD::FSIN, MVT::f32, Custom);
 
   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
 
   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
@@ -66,7 +74,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
 
   // Legalize loads and stores to the private address space.
   setOperationAction(ISD::LOAD, MVT::i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::v2i32, Expand);
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
@@ -74,7 +82,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
   setOperationAction(ISD::STORE, MVT::i8, Custom);
   setOperationAction(ISD::STORE, MVT::i32, Custom);
-  setOperationAction(ISD::STORE, MVT::v2i32, Expand);
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 
   setOperationAction(ISD::LOAD, MVT::i32, Custom);
@@ -170,6 +178,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
   }
 
   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
@@ -1129,7 +1138,13 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
                           DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
       Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
     }
-    Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
+    EVT NewVT = MVT::v4i32;
+    unsigned NumElements = 4;
+    if (VT.isVector()) {
+      NewVT = VT;
+      NumElements = VT.getVectorNumElements();
+    }
+    Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
   } else {
     // non constant ptr cant be folded, keeps it as a v4f32 load
     Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,

diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 2e9b732..4e7eff9 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -51,9 +51,17 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
                            bool KillSrc) const {
-  if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
-      && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
-    for (unsigned I = 0; I < 4; I++) {
+  unsigned VectorComponents = 0;
+  if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
+      AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
+    VectorComponents = 4;
+  } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
+            AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
+    VectorComponents = 2;
+  }
+
+  if (VectorComponents > 0) {
+    for (unsigned I = 0; I < VectorComponents; I++) {
       unsigned SubRegIndex = RI.getSubRegFromChannel(I);
       buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
                               RI.getSubReg(DestReg, SubRegIndex),
@@ -62,11 +70,6 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                               RegState::Define | RegState::Implicit);
     }
   } else {
-
-    // We can't copy vec4 registers
-    assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
-           && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
-
     MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
                                                   DestReg, SrcReg);
     NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0))

diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 178e081..7e61b18 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1290,6 +1290,13 @@ def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
   [(global_store i32:$rw_gpr, i32:$index_gpr)]
 >;
 
+// 64-bit store
+def RAT_WRITE_CACHELESS_64_eg : RAT_WRITE_CACHELESS_eg <
+  (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  0x3, "RAT_WRITE_CACHELESS_64_eg $rw_gpr.XY, $index_gpr, $eop",
+  [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
+>;
+
 //128-bit store
 def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
   (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
@@ -1358,6 +1365,18 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
   let Constraints = "$src_gpr.ptr = $dst_gpr";
 }
 
+class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id,
+                   (outs R600_Reg64:$dst_gpr), pattern> {
+
+  let MEGA_FETCH_COUNT = 8;
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 1;
+  let DST_SEL_Z = 7;
+  let DST_SEL_W = 7;
+  let DATA_FORMAT = 0x1D; // COLOR_32_32
+}
+
 class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
     : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
                    (outs R600_Reg128:$dst_gpr), pattern> {
@@ -1391,6 +1410,10 @@ def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
   [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
 >;
 
+def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
+  [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
 def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
   [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
 >;
@@ -1413,6 +1436,11 @@ def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
   [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
 >;
 
+// 64-bit reads
+def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
+  [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
 // 128-bit reads
 def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
   [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
@@ -1744,15 +1772,23 @@ def : Pat <
 def : Pat<(fsqrt f32:$src),
   (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
 
-def RAT_STORE_DWORD_cm : EG_CF_RAT <
-  0x57, 0x14, 0x1, (outs),
-  (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr),
-  "EXPORT_RAT_INST_STORE_DWORD $rw_gpr, $index_gpr",
-  [(global_store i32:$rw_gpr, i32:$index_gpr)]
+class RAT_STORE_DWORD_cm <bits<4> mask, dag ins, list<dag> pat> : EG_CF_RAT <
+  0x57, 0x14, mask, (outs), ins,
+  "EXPORT_RAT_INST_STORE_DWORD $rw_gpr, $index_gpr", pat
 > {
   let eop = 0; // This bit is not used on Cayman.
 }
 
+def RAT_STORE_DWORD32_cm : RAT_STORE_DWORD_cm <0x1,
+  (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr),
+  [(global_store i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+def RAT_STORE_DWORD64_cm : RAT_STORE_DWORD_cm <0x3,
+  (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr),
+  [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
+>;
+
 class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
     : VTX_WORD0_cm, VTX_READ <name, buffer_id, outs, pattern> {
@@ -1815,6 +1851,17 @@ class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern>
   let Constraints = "$src_gpr.ptr = $dst_gpr";
 }
 
+class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_Reg64:$dst_gpr), pattern> {
+
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 1;
+  let DST_SEL_Z = 7;
+  let DST_SEL_W = 7;
+  let DATA_FORMAT = 0x1D; // COLOR_32_32
+}
+
 class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern>
     : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
                    (outs R600_Reg128:$dst_gpr), pattern> {
@@ -1846,6 +1893,10 @@ def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0,
   [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
 >;
 
+def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0,
+  [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
 def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0,
   [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
 >;
@@ -1868,6 +1919,11 @@ def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1,
   [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
 >;
 
+// 64-bit reads
+def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1,
+  [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
 // 128-bit reads
 def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1,
   [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
@@ -2297,10 +2353,24 @@ def : Insert_Element <i32, v4i32, 3, sub3>;
 
 def : Vector4_Build <v4f32, f32>;
 def : Vector4_Build <v4i32, i32>;
 
+def : Extract_Element <f32, v2f32, 0, sub0>;
+def : Extract_Element <f32, v2f32, 1, sub1>;
+
+def : Insert_Element <f32, v2f32, 0, sub0>;
+def : Insert_Element <f32, v2f32, 1, sub1>;
+
+def : Extract_Element <i32, v2i32, 0, sub0>;
+def : Extract_Element <i32, v2i32, 1, sub1>;
+
+def : Insert_Element <i32, v2i32, 0, sub0>;
+def : Insert_Element <i32, v2i32, 1, sub1>;
+
 // bitconvert patterns
 
 def : BitConvert <i32, f32, R600_Reg32>;
 def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v2f32, v2i32, R600_Reg64>;
+def : BitConvert <v2i32, v2f32, R600_Reg64>;
 def : BitConvert <v4f32, v4i32, R600_Reg128>;
 def : BitConvert <v4i32, v4f32, R600_Reg128>;

diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 1eabccb..fa987cf 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -23,6 +23,14 @@ class R600Reg_128 <string n, list<Register> subregs, bits<16> encoding> :
   let HWEncoding = encoding;
 }
 
+class R600Reg_64 <string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = encoding;
+}
+
+
 foreach Index = 0-127 in {
   foreach Chan = [ "X", "Y", "Z", "W" ] in {
     // 32-bit Temporary Registers
@@ -41,6 +49,11 @@ foreach Index = 0-127 in {
                                    !cast<Register>("T"#Index#"_Z"),
                                    !cast<Register>("T"#Index#"_W")],
                                   Index>;
+
+    def T#Index#_XY : R600Reg_64 <"T"#Index#"",
+                                  [!cast<Register>("T"#Index#"_X"),
+                                   !cast<Register>("T"#Index#"_Y")],
+                                  Index>;
 }
 
 // KCACHE_BANK0
@@ -186,6 +199,9 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
   let CopyCost = -1;
 }
 
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+                                (add (sequence "T%u_XY", 0, 63))>;
+
 //===----------------------------------------------------------------------===//
 // Register classes for indirect addressing
 //===----------------------------------------------------------------------===//
-- 
cgit v1.1