Diffstat (limited to 'lib/Target/R600')
 lib/Target/R600/AMDGPU.h                          |   2
 lib/Target/R600/AMDGPU.td                         |   1
 lib/Target/R600/AMDGPUAsmPrinter.cpp              |   2
 lib/Target/R600/AMDGPUCallingConv.td              |  42
 lib/Target/R600/AMDGPUISelLowering.cpp            |  20
 lib/Target/R600/AMDGPUISelLowering.h              |   9
 lib/Target/R600/AMDGPUIndirectAddressing.cpp      |   1
 lib/Target/R600/AMDGPUInstructions.td             |  11
 lib/Target/R600/AMDGPUIntrinsics.td               |   2
 lib/Target/R600/AMDGPUStructurizeCFG.cpp          |   1
 lib/Target/R600/AMDGPUTargetMachine.cpp           |  23
 lib/Target/R600/AMDIL.h                           |  35
 lib/Target/R600/AMDILCFGStructurizer.cpp          |   8
 lib/Target/R600/AMDILDevice.cpp                   |  20
 lib/Target/R600/AMDILISelDAGToDAG.cpp             |  62
 lib/Target/R600/AMDILISelLowering.cpp             |   5
 lib/Target/R600/AMDILSIDevice.cpp                 |  15
 lib/Target/R600/CMakeLists.txt                    |   3
 lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h |   9
 lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp  |  32
 lib/Target/R600/R600ISelLowering.cpp              | 212
 lib/Target/R600/R600ISelLowering.h                |   3
 lib/Target/R600/R600InstrInfo.cpp                 | 122
 lib/Target/R600/R600InstrInfo.h                   |   3
 lib/Target/R600/R600Instructions.td               |  63
 lib/Target/R600/R600LowerConstCopy.cpp            | 222
 lib/Target/R600/R600MachineFunctionInfo.cpp       |   1
 lib/Target/R600/R600MachineFunctionInfo.h         |   1
 lib/Target/R600/R600MachineScheduler.cpp          | 427
 lib/Target/R600/R600MachineScheduler.h            | 120
 lib/Target/R600/SIAssignInterpRegs.cpp            | 152
 lib/Target/R600/SIISelLowering.cpp                | 265
 lib/Target/R600/SIISelLowering.h                  |  13
 lib/Target/R600/SIInsertWaits.cpp                 |   9
 lib/Target/R600/SIInstrFormats.td                 |  20
 lib/Target/R600/SIInstrInfo.cpp                   | 104
 lib/Target/R600/SIInstrInfo.td                    |  25
 lib/Target/R600/SIInstructions.td                 | 300
 lib/Target/R600/SIIntrinsics.td                   |  21
 lib/Target/R600/SILowerControlFlow.cpp            | 117
 lib/Target/R600/SIMachineFunctionInfo.cpp         |  20
 lib/Target/R600/SIMachineFunctionInfo.h           |   4
 lib/Target/R600/SIRegisterInfo.td                 |  69
 43 files changed, 1561 insertions(+), 1035 deletions(-)
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index ba87918..e099a9f 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -23,11 +23,9 @@ class AMDGPUTargetMachine;
// R600 Passes
FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
-FunctionPass *createR600LowerConstCopy(TargetMachine &tm);
// SI Passes
FunctionPass *createSIAnnotateControlFlowPass();
-FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIInsertWaits(TargetMachine &tm);
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 40f4741..1a26c77 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -38,3 +38,4 @@ include "AMDGPUInstrInfo.td"
include "AMDGPUIntrinsics.td"
include "AMDGPURegisterInfo.td"
include "AMDGPUInstructions.td"
+include "AMDGPUCallingConv.td"
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index c30dbe4..f600144 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -141,5 +141,5 @@ void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) {
SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
OutStreamer.EmitIntValue(MaxSGPR + 1, 4);
OutStreamer.EmitIntValue(MaxVGPR + 1, 4);
- OutStreamer.EmitIntValue(MFI->SPIPSInputAddr, 4);
+ OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
}
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
new file mode 100644
index 0000000..45ae37e
--- /dev/null
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -0,0 +1,42 @@
+//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the AMD Radeon GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+// Inversion of CCIfInReg
+class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+
+// Calling convention for SI
+def CC_SI : CallingConv<[
+
+ CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15
+ ]>>>,
+
+ CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
+ [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
+ [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR12, SGPR15 ]
+ >>>,
+
+ CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
+ ]>>>
+
+]>;
+
+def CC_AMDGPU : CallingConv<[
+ CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().device()"#
+ "->getGeneration() == AMDGPUDeviceInfo::HD7XXX", CCDelegateTo<CC_SI>>
+]>;
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 0a33264..5995b6f 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -14,7 +14,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUISelLowering.h"
+#include "AMDGPURegisterInfo.h"
#include "AMDILIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -22,6 +25,8 @@
using namespace llvm;
+#include "AMDGPUGenCallingConv.inc"
+
AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
TargetLowering(TM, new TargetLoweringObjectFileELF()) {
@@ -64,17 +69,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//
-SDValue AMDGPUTargetLowering::LowerFormalArguments(
- SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
- for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
- InVals.push_back(SDValue());
- }
- return Chain;
+void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const {
+
+ State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}
SDValue AMDGPUTargetLowering::LowerReturn(
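Note: with the stub LowerFormalArguments removed from the base class, building InVals is now expected to happen in subclass overrides (the SIISelLowering.cpp changes later in this patch do this), which can call the new AnalyzeFormalArguments helper and then walk the assigned locations. A minimal sketch of that pattern inside a subclass LowerFormalArguments override; parameter names match the removed base signature, and the register-class choice is illustrative, not part of this patch:

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                   getTargetMachine(), ArgLocs, *DAG.getContext());
    AnalyzeFormalArguments(CCInfo, Ins);   // runs CC_AMDGPU over the incoming args

    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      CCValAssign &VA = ArgLocs[i];
      assert(VA.isRegLoc() && "CC_SI only assigns registers");
      // Make the assigned physical register live-in and copy it to a vreg.
      const TargetRegisterClass *RC = getRegClassFor(VA.getLocVT());
      unsigned VReg = DAG.getMachineFunction().addLiveIn(VA.getLocReg(), RC);
      InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT()));
    }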
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 9e7d997..f31b646 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -39,15 +39,12 @@ protected:
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
+ void AnalyzeFormalArguments(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
public:
AMDGPUTargetLowering(TargetMachine &TM);
- virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
-
virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
index 15840b3..ed6c8ec 100644
--- a/lib/Target/R600/AMDGPUIndirectAddressing.cpp
+++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
@@ -289,7 +289,6 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
// We only need to use REG_SEQUENCE for explicit defs, since the
// register coalescer won't do anything with the implicit defs.
- MachineInstr *DefInstr = MRI.getVRegDef(Reg);
if (!regHasExplicitDef(MRI, Reg)) {
continue;
}
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 960f108..e740348 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -132,13 +132,6 @@ class FNEG <RegisterClass rc> : AMDGPUShaderInst <
[(set rc:$dst, (fneg rc:$src0))]
>;
-def SHADER_TYPE : AMDGPUShaderInst <
- (outs),
- (ins i32imm:$type),
- "SHADER_TYPE $type",
- [(int_AMDGPU_shader_type imm:$type)]
->;
-
} // usesCustomInserter = 1
multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
@@ -209,8 +202,8 @@ class Vector2_Build <ValueType vecType, RegisterClass vectorClass,
(vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1)
>;
-class Vector_Build <ValueType vecType, RegisterClass vectorClass,
- ValueType elemType, RegisterClass elemClass> : Pat <
+class Vector4_Build <ValueType vecType, RegisterClass vectorClass,
+ ValueType elemType, RegisterClass elemClass> : Pat <
(vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y),
(elemType elemClass:$z), (elemType elemClass:$w))),
(INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
index 2ba2d4b..eecb25b 100644
--- a/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/lib/Target/R600/AMDGPUIntrinsics.td
@@ -50,8 +50,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-
- def int_AMDGPU_shader_type : Intrinsic<[], [llvm_i32_ty], []>;
}
let TargetPrefix = "TGSI", isTarget = 1 in {
diff --git a/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/lib/Target/R600/AMDGPUStructurizeCFG.cpp
index 26f842e..b723433 100644
--- a/lib/Target/R600/AMDGPUStructurizeCFG.cpp
+++ b/lib/Target/R600/AMDGPUStructurizeCFG.cpp
@@ -243,6 +243,7 @@ public:
initializeRegionInfoPass(*PassRegistry::getPassRegistry());
}
+ using Pass::doInitialization;
virtual bool doInitialization(Region *R, RGPassManager &RGM);
virtual bool runOnRegion(Region *R, RGPassManager &RGM);
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index e2f00be..0185747 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -17,6 +17,7 @@
#include "AMDGPU.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
+#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "llvm/Analysis/Passes.h"
@@ -39,6 +40,14 @@ extern "C" void LLVMInitializeR600Target() {
RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget);
}
+static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
+ return new ScheduleDAGMI(C, new R600SchedStrategy());
+}
+
+static MachineSchedRegistry
+SchedCustomRegistry("r600", "Run R600's custom scheduler",
+ createR600MachineScheduler);
+
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
TargetOptions Options,
@@ -70,7 +79,13 @@ namespace {
class AMDGPUPassConfig : public TargetPassConfig {
public:
AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+ if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+ enablePass(&MachineSchedulerID);
+ MachineSchedRegistry::setDefault(createR600MachineScheduler);
+ }
+ }
AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
return getTM<AMDGPUTargetMachine>();
@@ -112,11 +127,6 @@ bool AMDGPUPassConfig::addInstSelector() {
}
bool AMDGPUPassConfig::addPreRegAlloc() {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-
- if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
- addPass(createSIAssignInterpRegsPass(*TM));
- }
addPass(createAMDGPUConvertToISAPass(*TM));
return false;
}
@@ -143,7 +153,6 @@ bool AMDGPUPassConfig::addPreEmitPass() {
addPass(createAMDGPUCFGStructurizerPass(*TM));
addPass(createR600ExpandSpecialInstrsPass(*TM));
addPass(&FinalizeMachineBundlesID);
- addPass(createR600LowerConstCopy(*TM));
} else {
addPass(createSILowerControlFlowPass(*TM));
}
diff --git a/lib/Target/R600/AMDIL.h b/lib/Target/R600/AMDIL.h
index b39fbdb..39ab664 100644
--- a/lib/Target/R600/AMDIL.h
+++ b/lib/Target/R600/AMDIL.h
@@ -96,24 +96,23 @@ enum AddressSpaces {
ADDRESS_NONE = 5, ///< Address space for unknown memory.
PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0)
PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1)
- USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI
- CONSTANT_BUFFER_0 = 9,
- CONSTANT_BUFFER_1 = 10,
- CONSTANT_BUFFER_2 = 11,
- CONSTANT_BUFFER_3 = 12,
- CONSTANT_BUFFER_4 = 13,
- CONSTANT_BUFFER_5 = 14,
- CONSTANT_BUFFER_6 = 15,
- CONSTANT_BUFFER_7 = 16,
- CONSTANT_BUFFER_8 = 17,
- CONSTANT_BUFFER_9 = 18,
- CONSTANT_BUFFER_10 = 19,
- CONSTANT_BUFFER_11 = 20,
- CONSTANT_BUFFER_12 = 21,
- CONSTANT_BUFFER_13 = 22,
- CONSTANT_BUFFER_14 = 23,
- CONSTANT_BUFFER_15 = 24,
- LAST_ADDRESS = 25
+ CONSTANT_BUFFER_0 = 8,
+ CONSTANT_BUFFER_1 = 9,
+ CONSTANT_BUFFER_2 = 10,
+ CONSTANT_BUFFER_3 = 11,
+ CONSTANT_BUFFER_4 = 12,
+ CONSTANT_BUFFER_5 = 13,
+ CONSTANT_BUFFER_6 = 14,
+ CONSTANT_BUFFER_7 = 15,
+ CONSTANT_BUFFER_8 = 16,
+ CONSTANT_BUFFER_9 = 17,
+ CONSTANT_BUFFER_10 = 18,
+ CONSTANT_BUFFER_11 = 19,
+ CONSTANT_BUFFER_12 = 20,
+ CONSTANT_BUFFER_13 = 21,
+ CONSTANT_BUFFER_14 = 22,
+ CONSTANT_BUFFER_15 = 23,
+ LAST_ADDRESS = 24
};
} // namespace AMDGPUAS
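Note: with USER_SGPR_ADDRESS removed, the constant-buffer address spaces form a contiguous run starting at 8, so a buffer index can be recovered by subtracting the base enumerator (the LowerLOAD change in R600ISelLowering.cpp below relies on this). A small illustrative helper, not part of the patch:

    // Map an AMDGPUAS address space to its constant-buffer index, or -1 if the
    // address space is not a constant buffer.
    static int getConstantBufferIndex(unsigned AddrSpace) {
      if (AddrSpace < AMDGPUAS::CONSTANT_BUFFER_0 ||
          AddrSpace > AMDGPUAS::CONSTANT_BUFFER_15)
        return -1;
      return AddrSpace - AMDGPUAS::CONSTANT_BUFFER_0;
    }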
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
index aa8ab6b..b0cd0f9 100644
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -2595,6 +2595,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
static int getBranchNzeroOpcode(int oldOpcode) {
switch(oldOpcode) {
+ case AMDGPU::JUMP_COND:
case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
case AMDGPU::BRANCH_COND_i32:
case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
@@ -2606,6 +2607,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
static int getBranchZeroOpcode(int oldOpcode) {
switch(oldOpcode) {
+ case AMDGPU::JUMP_COND:
case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
case AMDGPU::BRANCH_COND_i32:
case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
@@ -2617,6 +2619,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
static int getContinueNzeroOpcode(int oldOpcode) {
switch(oldOpcode) {
+ case AMDGPU::JUMP_COND:
case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
default:
assert(0 && "internal error");
@@ -2626,6 +2629,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
static int getContinueZeroOpcode(int oldOpcode) {
switch(oldOpcode) {
+ case AMDGPU::JUMP_COND:
case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
default:
assert(0 && "internal error");
@@ -2654,8 +2658,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
static bool isCondBranch(MachineInstr *instr) {
switch (instr->getOpcode()) {
- case AMDGPU::JUMP:
- return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() != 0;
+ case AMDGPU::JUMP_COND:
case AMDGPU::BRANCH_COND_i32:
case AMDGPU::BRANCH_COND_f32:
break;
@@ -2668,7 +2671,6 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
static bool isUncondBranch(MachineInstr *instr) {
switch (instr->getOpcode()) {
case AMDGPU::JUMP:
- return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() == 0;
case AMDGPU::BRANCH:
return true;
default:
diff --git a/lib/Target/R600/AMDILDevice.cpp b/lib/Target/R600/AMDILDevice.cpp
index eec5059..db8e01e 100644
--- a/lib/Target/R600/AMDILDevice.cpp
+++ b/lib/Target/R600/AMDILDevice.cpp
@@ -115,10 +115,18 @@ bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const {
std::string
AMDGPUDevice::getDataLayout() const {
- return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
- "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
- "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
- "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
- "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
- "-n8:16:32:64");
+ std::string DataLayout = std::string(
+ "e"
+ "-p:32:32:32"
+ "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
+ "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128"
+ "-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+ "-n32:64"
+ );
+
+ if (usesHardware(AMDGPUDeviceInfo::DoubleOps)) {
+ DataLayout.append("-f64:64:64");
+ }
+
+ return DataLayout;
}
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
index e77b9dc..fa8f62d 100644
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -162,6 +162,35 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
switch (Opc) {
default: break;
+ case ISD::BUILD_VECTOR: {
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+ if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+ break;
+ }
+ // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+ // that adds a 128 bits reg copy when going through TwoAddressInstructions
+ // pass. We want to avoid 128 bits copies as much as possible because they
+ // can't be bundled by our scheduler.
+ SDValue RegSeqArgs[9] = {
+ CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
+ SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+ SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
+ SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
+ SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
+ };
+ bool IsRegSeq = true;
+ for (unsigned i = 0; i < N->getNumOperands(); i++) {
+ if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
+ IsRegSeq = false;
+ break;
+ }
+ RegSeqArgs[2 * i + 1] = N->getOperand(i);
+ }
+ if (!IsRegSeq)
+ break;
+ return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
+ RegSeqArgs, 2 * N->getNumOperands() + 1);
+ }
case ISD::ConstantFP:
case ISD::Constant: {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
@@ -336,17 +365,34 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
SDValue Operand = Ops[OperandIdx[i] - 1];
switch (Operand.getOpcode()) {
case AMDGPUISD::CONST_ADDRESS: {
- if (i == 2)
- break;
SDValue CstOffset;
- if (!Operand.getValueType().isVector() &&
- SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) {
- Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
- Ops[SelIdx[i] - 1] = CstOffset;
- return true;
+ if (Operand.getValueType().isVector() ||
+ !SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset))
+ break;
+
+ // Gather others constants values
+ std::vector<unsigned> Consts;
+ for (unsigned j = 0; j < 3; j++) {
+ int SrcIdx = OperandIdx[j];
+ if (SrcIdx < 0)
+ break;
+ if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
+ if (Reg->getReg() == AMDGPU::ALU_CONST) {
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
+ Consts.push_back(Cst->getZExtValue());
+ }
+ }
}
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
+ Consts.push_back(Cst->getZExtValue());
+ if (!TII->fitsConstReadLimitations(Consts))
+ break;
+
+ Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
+ Ops[SelIdx[i] - 1] = CstOffset;
+ return true;
}
- break;
case ISD::FNEG:
if (NegIdx[i] < 0)
break;
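Note: the BUILD_VECTOR case added above selects a single REG_SEQUENCE instead of letting the generic path emit an IMPLICIT_DEF plus four INSERT_SUBREGs, which the two-address pass would turn into a 128-bit copy that the R600 scheduler cannot bundle. Roughly, in machine-instruction terms (virtual register numbers are made up for illustration):

    // Before: IMPLICIT_DEF + INSERT_SUBREG chain, later forcing a 128-bit copy
    //   %vreg4<def> = IMPLICIT_DEF
    //   %vreg5<def> = INSERT_SUBREG %vreg4, %vreg0, sub0
    //   ... three more INSERT_SUBREGs for sub1..sub3
    // After: one node the scheduler can bundle freely
    //   %vreg4<def> = REG_SEQUENCE %vreg0, sub0, %vreg1, sub1,
    //                              %vreg2, sub2, %vreg3, sub3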
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
index f65e1f3..922cac1 100644
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ b/lib/Target/R600/AMDILISelLowering.cpp
@@ -33,11 +33,6 @@
using namespace llvm;
//===----------------------------------------------------------------------===//
-// Calling Convention Implementation
-//===----------------------------------------------------------------------===//
-#include "AMDGPUGenCallingConv.inc"
-
-//===----------------------------------------------------------------------===//
// TargetLowering Implementation Help Functions End
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDILSIDevice.cpp b/lib/Target/R600/AMDILSIDevice.cpp
index 3096c22..0d1de3d 100644
--- a/lib/Target/R600/AMDILSIDevice.cpp
+++ b/lib/Target/R600/AMDILSIDevice.cpp
@@ -36,10 +36,13 @@ AMDGPUSIDevice::getGeneration() const {
std::string
AMDGPUSIDevice::getDataLayout() const {
- return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16"
- "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
- "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
- "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
- "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
- "-n8:16:32:64");
+ return std::string(
+ "e"
+ "-p:64:64:64"
+ "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64"
+ "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128"
+ "-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+ "-v2048:2048:2048"
+ "-n32:64"
+ );
}
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 00f8b10..63c59e1 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -37,11 +37,10 @@ add_llvm_target(R600CodeGen
R600ExpandSpecialInstrs.cpp
R600InstrInfo.cpp
R600ISelLowering.cpp
- R600LowerConstCopy.cpp
R600MachineFunctionInfo.cpp
+ R600MachineScheduler.cpp
R600RegisterInfo.cpp
SIAnnotateControlFlow.cpp
- SIAssignInterpRegs.cpp
SIInsertWaits.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 8721f80..cd3a7ce 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -33,15 +33,6 @@ public:
SmallVectorImpl<MCFixup> &Fixups) const {
return 0;
}
-
- virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
- return 0;
- }
- virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
- return 0;
- }
};
} // End namespace llvm
diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
index 6cc0077..e27abcc 100644
--- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -42,9 +42,6 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
const MCSubtargetInfo &STI;
MCContext &Ctx;
- /// \brief Encode a sequence of registers with the correct alignment.
- unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const;
-
/// \brief Can this operand also contain immediate values?
bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
@@ -65,14 +62,6 @@ public:
/// \returns the encoding for an MCOperand.
virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups) const;
-
- /// \brief Encoding for when 2 consecutive registers are used
- virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixup) const;
-
- /// \brief Encoding for when 4 consectuive registers are used
- virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixup) const;
};
} // End anonymous namespace
@@ -212,24 +201,3 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
return 0;
}
-//===----------------------------------------------------------------------===//
-// Custom Operand Encodings
-//===----------------------------------------------------------------------===//
-
-unsigned SIMCCodeEmitter::GPRAlign(const MCInst &MI, unsigned OpNo,
- unsigned shift) const {
- unsigned regCode = MRI.getEncodingValue(MI.getOperand(OpNo).getReg());
- return (regCode & 0xff) >> shift;
-}
-
-unsigned SIMCCodeEmitter::GPR2AlignEncode(const MCInst &MI,
- unsigned OpNo ,
- SmallVectorImpl<MCFixup> &Fixup) const {
- return GPRAlign(MI, OpNo, 1);
-}
-
-unsigned SIMCCodeEmitter::GPR4AlignEncode(const MCInst &MI,
- unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixup) const {
- return GPRAlign(MI, OpNo, 2);
-}
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index b5c2a93..a73691d 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -50,8 +50,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::UREM, MVT::v4i32, Expand);
setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
- setOperationAction(ISD::BR_CC, MVT::i32, Custom);
- setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::FSUB, MVT::f32, Expand);
@@ -65,8 +65,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
- setOperationAction(ISD::SETCC, MVT::i32, Custom);
- setOperationAction(ISD::SETCC, MVT::f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::i32, Expand);
+ setOperationAction(ISD::SETCC, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
@@ -94,6 +94,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::SELECT_CC);
+ setBooleanContents(ZeroOrNegativeOneBooleanContent);
setSchedulingPreference(Sched::VLIW);
}
@@ -105,7 +106,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
switch (MI->getOpcode()) {
default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
- case AMDGPU::SHADER_TYPE: break;
case AMDGPU::CLAMP_R600: {
MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
AMDGPU::MOV,
@@ -150,7 +150,13 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
MI->getOperand(1).getImm());
break;
-
+ case AMDGPU::CONST_COPY: {
+ MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
+ MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
+ TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
+ MI->getOperand(1).getImm());
+ break;
+ }
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
@@ -215,8 +221,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::BRANCH:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
- .addOperand(MI->getOperand(0))
- .addReg(0);
+ .addOperand(MI->getOperand(0));
break;
case AMDGPU::BRANCH_COND_f32: {
@@ -227,7 +232,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
.addImm(OPCODE_IS_NOT_ZERO)
.addImm(0); // Flags
TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
.addOperand(MI->getOperand(0))
.addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
break;
@@ -241,7 +246,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
.addImm(OPCODE_IS_NOT_ZERO_INT)
.addImm(0); // Flags
TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
- BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
.addOperand(MI->getOperand(0))
.addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
break;
@@ -306,11 +311,9 @@ using namespace llvm::AMDGPUIntrinsic;
SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
- case ISD::BR_CC: return LowerBR_CC(Op, DAG);
case ISD::ROTL: return LowerROTL(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
- case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::FPOW: return LowerFPOW(Op, DAG);
@@ -470,44 +473,6 @@ SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
);
}
-SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
- SDValue Chain = Op.getOperand(0);
- SDValue CC = Op.getOperand(1);
- SDValue LHS = Op.getOperand(2);
- SDValue RHS = Op.getOperand(3);
- SDValue JumpT = Op.getOperand(4);
- SDValue CmpValue;
- SDValue Result;
-
- if (LHS.getValueType() == MVT::i32) {
- CmpValue = DAG.getNode(
- ISD::SELECT_CC,
- Op.getDebugLoc(),
- MVT::i32,
- LHS, RHS,
- DAG.getConstant(-1, MVT::i32),
- DAG.getConstant(0, MVT::i32),
- CC);
- } else if (LHS.getValueType() == MVT::f32) {
- CmpValue = DAG.getNode(
- ISD::SELECT_CC,
- Op.getDebugLoc(),
- MVT::f32,
- LHS, RHS,
- DAG.getConstantFP(1.0f, MVT::f32),
- DAG.getConstantFP(0.0f, MVT::f32),
- CC);
- } else {
- assert(0 && "Not valid type for br_cc");
- }
- Result = DAG.getNode(
- AMDGPUISD::BRANCH_COND,
- CmpValue.getDebugLoc(),
- MVT::Other, Chain,
- JumpT, CmpValue);
- return Result;
-}
-
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
DebugLoc DL,
unsigned DwordOffset) const {
@@ -576,12 +541,37 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
// Check if we can lower this to a native operation.
+ // Try to lower to a SET* instruction:
+ //
+ // SET* can match the following patterns:
+ //
+ // select_cc f32, f32, -1, 0, cc_any
+ // select_cc f32, f32, 1.0f, 0.0f, cc_any
+ // select_cc i32, i32, -1, 0, cc_any
+ //
+
+ // Move hardware True/False values to the correct operand.
+ if (isHWTrueValue(False) && isHWFalseValue(True)) {
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+ std::swap(False, True);
+ CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
+ }
+
+ if (isHWTrueValue(True) && isHWFalseValue(False) &&
+ (CompareVT == VT || VT == MVT::i32)) {
+ // This can be matched by a SET* instruction.
+ return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
+ }
+
// Try to lower to a CND* instruction:
- // CND* instructions requires RHS to be zero. Some SELECT_CC nodes that
- // can be lowered to CND* instructions can also be lowered to SET*
- // instructions. CND* instructions are cheaper, because they dont't
- // require additional instructions to convert their result to the correct
- // value type, so this check should be first.
+ //
+ // CND* can match the following patterns:
+ //
+ // select_cc f32, 0.0, f32, f32, cc_any
+ // select_cc f32, 0.0, i32, i32, cc_any
+ // select_cc i32, 0, f32, f32, cc_any
+ // select_cc i32, 0, i32, i32, cc_any
+ //
if (isZero(LHS) || isZero(RHS)) {
SDValue Cond = (isZero(LHS) ? RHS : LHS);
SDValue Zero = (isZero(LHS) ? LHS : RHS);
@@ -623,38 +613,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
}
- // Try to lower to a SET* instruction:
- //
- // CompareVT == MVT::f32 and VT == MVT::i32 is supported by the hardware,
- // but for the other case where CompareVT != VT, all operands of
- // SELECT_CC need to have the same value type, so we need to change True and
- // False to be the same type as LHS and RHS, and then convert the result of
- // the select_cc back to the correct type.
-
- // Move hardware True/False values to the correct operand.
- if (isHWTrueValue(False) && isHWFalseValue(True)) {
- ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
- std::swap(False, True);
- CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
- }
-
- if (isHWTrueValue(True) && isHWFalseValue(False)) {
- if (CompareVT != VT && VT == MVT::f32 && CompareVT == MVT::i32) {
- SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
- LHS, RHS,
- DAG.getConstant(-1, MVT::i32),
- DAG.getConstant(0, MVT::i32),
- CC);
- // Convert integer values of true (-1) and false (0) to fp values of
- // true (1.0f) and false (0.0f).
- SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
- DAG.getConstant(1, MVT::i32));
- return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
- } else {
- // This SELECT_CC is already legal.
- return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
- }
- }
// Possible Min/Max pattern
SDValue MinMax = LowerMinMax(Op, DAG);
@@ -698,48 +656,6 @@ SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getCondCode(ISD::SETNE));
}
-SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- SDValue Cond;
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- SDValue CC = Op.getOperand(2);
- DebugLoc DL = Op.getDebugLoc();
- assert(Op.getValueType() == MVT::i32);
- if (LHS.getValueType() == MVT::i32) {
- Cond = DAG.getNode(
- ISD::SELECT_CC,
- Op.getDebugLoc(),
- MVT::i32,
- LHS, RHS,
- DAG.getConstant(-1, MVT::i32),
- DAG.getConstant(0, MVT::i32),
- CC);
- } else if (LHS.getValueType() == MVT::f32) {
- Cond = DAG.getNode(
- ISD::SELECT_CC,
- Op.getDebugLoc(),
- MVT::f32,
- LHS, RHS,
- DAG.getConstantFP(1.0f, MVT::f32),
- DAG.getConstantFP(0.0f, MVT::f32),
- CC);
- Cond = DAG.getNode(
- ISD::FP_TO_SINT,
- DL,
- MVT::i32,
- Cond);
- } else {
- assert(0 && "Not valid type for set_cc");
- }
- Cond = DAG.getNode(
- ISD::AND,
- DL,
- MVT::i32,
- DAG.getConstant(1, MVT::i32),
- Cond);
- return Cond;
-}
-
/// LLVM generates byte-addresed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
@@ -918,7 +834,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
if (ConstantBlock > -1) {
SDValue Result;
if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
- dyn_cast<Constant>(LoadNode->getSrcValue())) {
+ dyn_cast<Constant>(LoadNode->getSrcValue()) ||
+ dyn_cast<ConstantSDNode>(Ptr)) {
SDValue Slots[4];
for (unsigned i = 0; i < 4; i++) {
// We want Const position encoded with the following formula :
@@ -934,7 +851,9 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
} else {
// non constant ptr cant be folded, keeps it as a v4f32 load
Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
- DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32))
+ DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
+ DAG.getConstant(LoadNode->getAddressSpace() -
+ AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
);
}
@@ -1122,6 +1041,9 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SELECT_CC: {
// fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
// selectcc x, y, a, b, inv(cc)
+ //
+ // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
+ // selectcc x, y, a, b, cc
SDValue LHS = N->getOperand(0);
if (LHS.getOpcode() != ISD::SELECT_CC) {
return SDValue();
@@ -1130,24 +1052,30 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
SDValue RHS = N->getOperand(1);
SDValue True = N->getOperand(2);
SDValue False = N->getOperand(3);
+ ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
if (LHS.getOperand(2).getNode() != True.getNode() ||
LHS.getOperand(3).getNode() != False.getNode() ||
- RHS.getNode() != False.getNode() ||
- cast<CondCodeSDNode>(N->getOperand(4))->get() != ISD::SETEQ) {
+ RHS.getNode() != False.getNode()) {
return SDValue();
}
- ISD::CondCode CCOpcode = cast<CondCodeSDNode>(LHS->getOperand(4))->get();
- CCOpcode = ISD::getSetCCInverse(
- CCOpcode, LHS.getOperand(0).getValueType().isInteger());
- return DAG.getSelectCC(N->getDebugLoc(),
- LHS.getOperand(0),
- LHS.getOperand(1),
- LHS.getOperand(2),
- LHS.getOperand(3),
- CCOpcode);
+ switch (NCC) {
+ default: return SDValue();
+ case ISD::SETNE: return LHS;
+ case ISD::SETEQ: {
+ ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
+ LHSCC = ISD::getSetCCInverse(LHSCC,
+ LHS.getOperand(0).getValueType().isInteger());
+ return DAG.getSelectCC(N->getDebugLoc(),
+ LHS.getOperand(0),
+ LHS.getOperand(1),
+ LHS.getOperand(2),
+ LHS.getOperand(3),
+ LHSCC);
}
+ }
+ }
case AMDGPUISD::EXPORT: {
SDValue Arg = N->getOperand(1);
if (Arg.getOpcode() != ISD::BUILD_VECTOR)
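Note: the reordered LowerSELECT_CC above tries the SET* form first and relies on swapping True/False plus inverting the condition code when the hardware true/false values arrive in the wrong operands. A small worked example (operand names are purely illustrative):

    // select_cc i32 %a, %b, (i32 0), (i32 -1), setlt
    //   hardware-true (-1) sits in the False slot, so swap and invert the CC:
    // select_cc i32 %a, %b, (i32 -1), (i32 0), setge
    //   which the integer SET* patterns (e.g. SETGE_INT) can match directly.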
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index afa3897..5cb4b91 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -52,14 +52,11 @@ private:
void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
MachineRegisterInfo & MRI, unsigned dword_offset) const;
- SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
-
/// \brief Lower ROTL opcode to BITALIGN
SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 7e3f005..0865098 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -139,6 +139,60 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
(TargetFlags & R600_InstFlag::OP3));
}
+bool
+R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
+ const {
+ assert (Consts.size() <= 12 && "Too many operands in instructions group");
+ unsigned Pair1 = 0, Pair2 = 0;
+ for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
+ unsigned ReadConstHalf = Consts[i] & 2;
+ unsigned ReadConstIndex = Consts[i] & (~3);
+ unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf;
+ if (!Pair1) {
+ Pair1 = ReadHalfConst;
+ continue;
+ }
+ if (Pair1 == ReadHalfConst)
+ continue;
+ if (!Pair2) {
+ Pair2 = ReadHalfConst;
+ continue;
+ }
+ if (Pair2 != ReadHalfConst)
+ return false;
+ }
+ return true;
+}
+
+bool
+R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const {
+ std::vector<unsigned> Consts;
+ for (unsigned i = 0, n = MIs.size(); i < n; i++) {
+ const MachineInstr *MI = MIs[i];
+
+ const R600Operands::Ops OpTable[3][2] = {
+ {R600Operands::SRC0, R600Operands::SRC0_SEL},
+ {R600Operands::SRC1, R600Operands::SRC1_SEL},
+ {R600Operands::SRC2, R600Operands::SRC2_SEL},
+ };
+
+ if (!isALUInstr(MI->getOpcode()))
+ continue;
+
+ for (unsigned j = 0; j < 3; j++) {
+ int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
+ if (SrcIdx < 0)
+ break;
+ if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) {
+ unsigned Const = MI->getOperand(
+ getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
+ Consts.push_back(Const);
+ }
+ }
+ }
+ return fitsConstReadLimitations(Consts);
+}
+
DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
const ScheduleDAG *DAG) const {
const InstrItineraryData *II = TM->getInstrItineraryData();
@@ -168,6 +222,11 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
return NULL;
}
+static
+bool isJump(unsigned Opcode) {
+ return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
+}
+
bool
R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
@@ -186,7 +245,7 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return false;
--I;
}
- if (static_cast<MachineInstr *>(I)->getOpcode() != AMDGPU::JUMP) {
+ if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) {
return false;
}
@@ -196,22 +255,20 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
// If there is only one terminator instruction, process it.
unsigned LastOpc = LastInst->getOpcode();
if (I == MBB.begin() ||
- static_cast<MachineInstr *>(--I)->getOpcode() != AMDGPU::JUMP) {
+ !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) {
if (LastOpc == AMDGPU::JUMP) {
- if(!isPredicated(LastInst)) {
- TBB = LastInst->getOperand(0).getMBB();
- return false;
- } else {
- MachineInstr *predSet = I;
- while (!isPredicateSetter(predSet->getOpcode())) {
- predSet = --I;
- }
- TBB = LastInst->getOperand(0).getMBB();
- Cond.push_back(predSet->getOperand(1));
- Cond.push_back(predSet->getOperand(2));
- Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
- return false;
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ } else if (LastOpc == AMDGPU::JUMP_COND) {
+ MachineInstr *predSet = I;
+ while (!isPredicateSetter(predSet->getOpcode())) {
+ predSet = --I;
}
+ TBB = LastInst->getOperand(0).getMBB();
+ Cond.push_back(predSet->getOperand(1));
+ Cond.push_back(predSet->getOperand(2));
+ Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+ return false;
}
return true; // Can't handle indirect branch.
}
@@ -221,10 +278,7 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
unsigned SecondLastOpc = SecondLastInst->getOpcode();
// If the block ends with a B and a Bcc, handle it.
- if (SecondLastOpc == AMDGPU::JUMP &&
- isPredicated(SecondLastInst) &&
- LastOpc == AMDGPU::JUMP &&
- !isPredicated(LastInst)) {
+ if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
MachineInstr *predSet = --I;
while (!isPredicateSetter(predSet->getOpcode())) {
predSet = --I;
@@ -261,7 +315,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
if (FBB == 0) {
if (Cond.empty()) {
- BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB).addReg(0);
+ BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
return 1;
} else {
MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
@@ -269,7 +323,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
addFlag(PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
- BuildMI(&MBB, DL, get(AMDGPU::JUMP))
+ BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
.addMBB(TBB)
.addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
return 1;
@@ -279,10 +333,10 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
assert(PredSet && "No previous predicate !");
addFlag(PredSet, 0, MO_FLAG_PUSH);
PredSet->getOperand(2).setImm(Cond[1].getImm());
- BuildMI(&MBB, DL, get(AMDGPU::JUMP))
+ BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
.addMBB(TBB)
.addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
- BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB).addReg(0);
+ BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
return 2;
}
}
@@ -302,11 +356,13 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
switch (I->getOpcode()) {
default:
return 0;
+ case AMDGPU::JUMP_COND: {
+ MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
+ clearFlag(predSet, 0, MO_FLAG_PUSH);
+ I->eraseFromParent();
+ break;
+ }
case AMDGPU::JUMP:
- if (isPredicated(I)) {
- MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
- clearFlag(predSet, 0, MO_FLAG_PUSH);
- }
I->eraseFromParent();
break;
}
@@ -320,11 +376,13 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
// FIXME: only one case??
default:
return 1;
+ case AMDGPU::JUMP_COND: {
+ MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
+ clearFlag(predSet, 0, MO_FLAG_PUSH);
+ I->eraseFromParent();
+ break;
+ }
case AMDGPU::JUMP:
- if (isPredicated(I)) {
- MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
- clearFlag(predSet, 0, MO_FLAG_PUSH);
- }
I->eraseFromParent();
break;
}
@@ -356,6 +414,8 @@ R600InstrInfo::isPredicable(MachineInstr *MI) const {
if (MI->getOpcode() == AMDGPU::KILLGT) {
return false;
+ } else if (isVector(*MI)) {
+ return false;
} else {
return AMDGPUInstrInfo::isPredicable(MI);
}
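Note: fitsConstReadLimitations/canBundle above take over the per-bundle bookkeeping that the deleted R600LowerConstCopy pass used to do: as modelled here, an ALU instruction group may address at most two distinct constant "half pairs" (bit 1 of the selector picks the half, the higher bits pick the pair). A standalone sketch of the same check with hypothetical selector values:

    #include <cassert>
    #include <vector>

    // Mirror of R600InstrInfo::fitsConstReadLimitations: accept a group only if
    // all constant selectors fall into at most two half pairs.
    static bool fitsConstReadLimitations(const std::vector<unsigned> &Consts) {
      unsigned Pair1 = 0, Pair2 = 0;
      for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
        unsigned HalfPair = (Consts[i] & ~3u) | (Consts[i] & 2u);
        if (!Pair1)            { Pair1 = HalfPair; continue; }
        if (Pair1 == HalfPair) continue;
        if (!Pair2)            { Pair2 = HalfPair; continue; }
        if (Pair2 != HalfPair) return false;
      }
      return true;
    }

    int main() {
      std::vector<unsigned> TwoPairs;    // selectors 4,5 and 8,9: two pairs, OK
      TwoPairs.push_back(4); TwoPairs.push_back(5);
      TwoPairs.push_back(8); TwoPairs.push_back(9);
      assert(fitsConstReadLimitations(TwoPairs));

      std::vector<unsigned> ThreePairs;  // selectors 4, 8, 12: three pairs, rejected
      ThreePairs.push_back(4); ThreePairs.push_back(8); ThreePairs.push_back(12);
      assert(!fitsConstReadLimitations(ThreePairs));
      return 0;
    }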
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index efe721c..bf9569e 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -53,6 +53,9 @@ namespace llvm {
/// \returns true if this \p Opcode represents an ALU instruction.
bool isALUInstr(unsigned Opcode) const;
+ bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
+ bool canBundle(const std::vector<MachineInstr *> &) const;
+
/// \breif Vector instructions are instructions that must fill all
/// instruction slots within an instruction group.
bool isVector(const MachineInstr &MI) const;
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 8242df9..8c50d54 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -512,8 +512,8 @@ def INTERP_PAIR_ZW : AMDGPUShaderInst <
[]>;
def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
- SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
- [SDNPMayLoad]
+ SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
+ [SDNPVariadic]
>;
//===----------------------------------------------------------------------===//
@@ -1090,12 +1090,12 @@ class COS_Common <bits<11> inst> : R600_1OP <
multiclass DIV_Common <InstR600 recip_ieee> {
def : Pat<
(int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1),
- (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+ (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
>;
def : Pat<
(fdiv R600_Reg32:$src0, R600_Reg32:$src1),
- (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+ (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
>;
}
@@ -1169,12 +1169,12 @@ let Predicates = [isR600] in {
// cards.
class COS_PAT <InstR600 trig> : Pat<
(fcos R600_Reg32:$src),
- (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
+ (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
>;
class SIN_PAT <InstR600 trig> : Pat<
(fsin R600_Reg32:$src),
- (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
+ (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
>;
//===----------------------------------------------------------------------===//
@@ -1587,19 +1587,28 @@ def PRED_X : InstR600 <
(ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
"", [], NullALU> {
let FlagOperandIdx = 3;
- let isTerminator = 1;
}
-let isTerminator = 1, isBranch = 1, isBarrier = 1 in {
-
-def JUMP : InstR600 <0x10,
+let isTerminator = 1, isBranch = 1 in {
+def JUMP_COND : InstR600 <0x10,
(outs),
- (ins brtarget:$target, R600_Pred:$p),
+ (ins brtarget:$target, R600_Predicate_Bit:$p),
"JUMP $target ($p)",
[], AnyALU
>;
-} // End isTerminator = 1, isBranch = 1, isBarrier = 1
+def JUMP : InstR600 <0x10,
+ (outs),
+ (ins brtarget:$target),
+ "JUMP $target",
+ [], AnyALU
+ >
+{
+ let isPredicable = 1;
+ let isBarrier = 1;
+}
+
+} // End isTerminator = 1, isBranch = 1
let usesCustomInserter = 1 in {
@@ -1639,7 +1648,7 @@ def FNEG_R600 : FNEG<R600_Reg32>;
//===---------------------------------------------------------------------===//
// Return instruction
//===---------------------------------------------------------------------===//
-let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1,
+let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
usesCustomInserter = 1 in {
def RETURN : ILFormat<(outs), (ins variable_ops),
"RETURN", [(IL_retflag)]>;
@@ -1650,27 +1659,27 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1,
// Constant Buffer Addressing Support
//===----------------------------------------------------------------------===//
-let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
+let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
def CONST_COPY : Instruction {
let OutOperandList = (outs R600_Reg32:$dst);
let InOperandList = (ins i32imm:$src);
- let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
+ let Pattern =
+ [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
let AsmString = "CONST_COPY";
let neverHasSideEffects = 1;
let isAsCheapAsAMove = 1;
let Itinerary = NullALU;
}
-} // end isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
+} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
def TEX_VTX_CONSTBUF :
- InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr), "VTX_READ_eg $dst, $ptr",
- [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr))]>,
+ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr",
+ [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>,
VTX_WORD1_GPR, VTX_WORD0 {
let VC_INST = 0;
let FETCH_TYPE = 2;
let FETCH_WHOLE_QUAD = 0;
- let BUFFER_ID = 0;
let SRC_REL = 0;
let SRC_SEL_X = 0;
let DST_REL = 0;
@@ -1840,6 +1849,18 @@ let isTerminator=1 in {
// ISel Patterns
//===----------------------------------------------------------------------===//
+// CND*_INT Pattterns for f32 True / False values
+
+class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat <
+ (selectcc (i32 R600_Reg32:$src0), 0, (f32 R600_Reg32:$src1),
+ R600_Reg32:$src2, cc),
+ (cnd R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2)
+>;
+
+def : CND_INT_f32 <CNDE_INT, SETEQ>;
+def : CND_INT_f32 <CNDGT_INT, SETGT>;
+def : CND_INT_f32 <CNDGE_INT, SETGE>;
+
//CNDGE_INT extra pattern
def : Pat <
(selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1),
@@ -1958,8 +1979,8 @@ def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sub1>;
def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sub2>;
def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>;
-def : Vector_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
-def : Vector_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
+def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
+def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
// bitconvert patterns
diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp
deleted file mode 100644
index 3ebe653..0000000
--- a/lib/Target/R600/R600LowerConstCopy.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-//===-- R600LowerConstCopy.cpp - Propagate ConstCopy / lower them to MOV---===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass is intended to handle remaining ConstCopy pseudo MachineInstr.
-/// ISel will fold each Const Buffer read inside scalar ALU. However it cannot
-/// fold them inside vector instruction, like DOT4 or Cube ; ISel emits
-/// ConstCopy instead. This pass (executed after ExpandingSpecialInstr) will try
-/// to fold them if possible or replace them by MOV otherwise.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "R600InstrInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/GlobalValue.h"
-
-namespace llvm {
-
-class R600LowerConstCopy : public MachineFunctionPass {
-private:
- static char ID;
- const R600InstrInfo *TII;
-
- struct ConstPairs {
- unsigned XYPair;
- unsigned ZWPair;
- };
-
- bool canFoldInBundle(ConstPairs &UsedConst, unsigned ReadConst) const;
-public:
- R600LowerConstCopy(TargetMachine &tm);
- virtual bool runOnMachineFunction(MachineFunction &MF);
-
- const char *getPassName() const { return "R600 Eliminate Symbolic Operand"; }
-};
-
-char R600LowerConstCopy::ID = 0;
-
-R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) :
- MachineFunctionPass(ID),
- TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo()))
-{
-}
-
-bool R600LowerConstCopy::canFoldInBundle(ConstPairs &UsedConst,
- unsigned ReadConst) const {
- unsigned ReadConstChan = ReadConst & 3;
- unsigned ReadConstIndex = ReadConst & (~3);
- if (ReadConstChan < 2) {
- if (!UsedConst.XYPair) {
- UsedConst.XYPair = ReadConstIndex;
- }
- return UsedConst.XYPair == ReadConstIndex;
- } else {
- if (!UsedConst.ZWPair) {
- UsedConst.ZWPair = ReadConstIndex;
- }
- return UsedConst.ZWPair == ReadConstIndex;
- }
-}
-
-static bool isControlFlow(const MachineInstr &MI) {
- return (MI.getOpcode() == AMDGPU::IF_PREDICATE_SET) ||
- (MI.getOpcode() == AMDGPU::ENDIF) ||
- (MI.getOpcode() == AMDGPU::ELSE) ||
- (MI.getOpcode() == AMDGPU::WHILELOOP) ||
- (MI.getOpcode() == AMDGPU::BREAK);
-}
-
-bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) {
-
- for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
- BB != BB_E; ++BB) {
- MachineBasicBlock &MBB = *BB;
- DenseMap<unsigned, MachineInstr *> RegToConstIndex;
- for (MachineBasicBlock::instr_iterator I = MBB.instr_begin(),
- E = MBB.instr_end(); I != E;) {
-
- if (I->getOpcode() == AMDGPU::CONST_COPY) {
- MachineInstr &MI = *I;
- I = llvm::next(I);
- unsigned DstReg = MI.getOperand(0).getReg();
- DenseMap<unsigned, MachineInstr *>::iterator SrcMI =
- RegToConstIndex.find(DstReg);
- if (SrcMI != RegToConstIndex.end()) {
- SrcMI->second->eraseFromParent();
- RegToConstIndex.erase(SrcMI);
- }
- MachineInstr *NewMI =
- TII->buildDefaultInstruction(MBB, &MI, AMDGPU::MOV,
- MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
- TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
- MI.getOperand(1).getImm());
- RegToConstIndex[DstReg] = NewMI;
- MI.eraseFromParent();
- continue;
- }
-
- std::vector<unsigned> Defs;
- // We consider all Instructions as bundled because algorithm that handle
- // const read port limitations inside an IG is still valid with single
- // instructions.
- std::vector<MachineInstr *> Bundle;
-
- if (I->isBundle()) {
- unsigned BundleSize = I->getBundleSize();
- for (unsigned i = 0; i < BundleSize; i++) {
- I = llvm::next(I);
- Bundle.push_back(I);
- }
- } else if (TII->isALUInstr(I->getOpcode())){
- Bundle.push_back(I);
- } else if (isControlFlow(*I)) {
- RegToConstIndex.clear();
- I = llvm::next(I);
- continue;
- } else {
- MachineInstr &MI = *I;
- for (MachineInstr::mop_iterator MOp = MI.operands_begin(),
- MOpE = MI.operands_end(); MOp != MOpE; ++MOp) {
- MachineOperand &MO = *MOp;
- if (!MO.isReg())
- continue;
- if (MO.isDef()) {
- Defs.push_back(MO.getReg());
- } else {
- // Either a TEX or an Export inst, prevent from erasing def of used
- // operand
- RegToConstIndex.erase(MO.getReg());
- for (MCSubRegIterator SR(MO.getReg(), &TII->getRegisterInfo());
- SR.isValid(); ++SR) {
- RegToConstIndex.erase(*SR);
- }
- }
- }
- }
-
-
- R600Operands::Ops OpTable[3][2] = {
- {R600Operands::SRC0, R600Operands::SRC0_SEL},
- {R600Operands::SRC1, R600Operands::SRC1_SEL},
- {R600Operands::SRC2, R600Operands::SRC2_SEL},
- };
-
- for(std::vector<MachineInstr *>::iterator It = Bundle.begin(),
- ItE = Bundle.end(); It != ItE; ++It) {
- MachineInstr *MI = *It;
- if (TII->isPredicated(MI)) {
- // We don't want to erase previous assignment
- RegToConstIndex.erase(MI->getOperand(0).getReg());
- } else {
- int WriteIDX = TII->getOperandIdx(MI->getOpcode(), R600Operands::WRITE);
- if (WriteIDX < 0 || MI->getOperand(WriteIDX).getImm())
- Defs.push_back(MI->getOperand(0).getReg());
- }
- }
-
- ConstPairs CP = {0,0};
- for (unsigned SrcOp = 0; SrcOp < 3; SrcOp++) {
- for(std::vector<MachineInstr *>::iterator It = Bundle.begin(),
- ItE = Bundle.end(); It != ItE; ++It) {
- MachineInstr *MI = *It;
- int SrcIdx = TII->getOperandIdx(MI->getOpcode(), OpTable[SrcOp][0]);
- if (SrcIdx < 0)
- continue;
- MachineOperand &MO = MI->getOperand(SrcIdx);
- DenseMap<unsigned, MachineInstr *>::iterator SrcMI =
- RegToConstIndex.find(MO.getReg());
- if (SrcMI != RegToConstIndex.end()) {
- MachineInstr *CstMov = SrcMI->second;
- int ConstMovSel =
- TII->getOperandIdx(CstMov->getOpcode(), R600Operands::SRC0_SEL);
- unsigned ConstIndex = CstMov->getOperand(ConstMovSel).getImm();
- if (MI->isInsideBundle() && canFoldInBundle(CP, ConstIndex)) {
- TII->setImmOperand(MI, OpTable[SrcOp][1], ConstIndex);
- MI->getOperand(SrcIdx).setReg(AMDGPU::ALU_CONST);
- } else {
- RegToConstIndex.erase(SrcMI);
- }
- }
- }
- }
-
- for (std::vector<unsigned>::iterator It = Defs.begin(), ItE = Defs.end();
- It != ItE; ++It) {
- DenseMap<unsigned, MachineInstr *>::iterator SrcMI =
- RegToConstIndex.find(*It);
- if (SrcMI != RegToConstIndex.end()) {
- SrcMI->second->eraseFromParent();
- RegToConstIndex.erase(SrcMI);
- }
- }
- I = llvm::next(I);
- }
-
- if (MBB.succ_empty()) {
- for (DenseMap<unsigned, MachineInstr *>::iterator
- DI = RegToConstIndex.begin(), DE = RegToConstIndex.end();
- DI != DE; ++DI) {
- DI->second->eraseFromParent();
- }
- }
- }
- return false;
-}
-
-FunctionPass *createR600LowerConstCopy(TargetMachine &tm) {
- return new R600LowerConstCopy(tm);
-}
-
-}
-
-
diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp
index 40aec83..b07a585 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.cpp
+++ b/lib/Target/R600/R600MachineFunctionInfo.cpp
@@ -14,5 +14,4 @@ using namespace llvm;
R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
: MachineFunctionInfo() {
- memset(Outputs, 0, sizeof(Outputs));
}
diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
index 4b901f4..13a46b8 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.h
+++ b/lib/Target/R600/R600MachineFunctionInfo.h
@@ -26,7 +26,6 @@ public:
R600MachineFunctionInfo(const MachineFunction &MF);
SmallVector<unsigned, 4> LiveOuts;
std::vector<unsigned> IndirectRegs;
- SDNode *Outputs[16];
};
} // End llvm namespace
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
new file mode 100644
index 0000000..9074364
--- /dev/null
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -0,0 +1,427 @@
+//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "misched"
+
+#include "R600MachineScheduler.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/raw_ostream.h"
+#include <set>
+
+using namespace llvm;
+
+void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
+
+ DAG = dag;
+ TII = static_cast<const R600InstrInfo*>(DAG->TII);
+ TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
+ MRI = &DAG->MRI;
+ Available[IDAlu]->clear();
+ Available[IDFetch]->clear();
+ Available[IDOther]->clear();
+ CurInstKind = IDOther;
+ CurEmitted = 0;
+ OccupedSlotsMask = 15;
+  InstKindLimit[IDAlu] = 120; // 128 minus 8 for security
+
+
+ const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
+ if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD5XXX) {
+ InstKindLimit[IDFetch] = 7; // 8 minus 1 for security
+ } else {
+ InstKindLimit[IDFetch] = 15; // 16 minus 1 for security
+ }
+}
+
+void R600SchedStrategy::MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst)
+{
+ if (QSrc->empty())
+ return;
+ for (ReadyQueue::iterator I = QSrc->begin(),
+ E = QSrc->end(); I != E; ++I) {
+ (*I)->NodeQueueId &= ~QSrc->getID();
+ QDst->push(*I);
+ }
+ QSrc->clear();
+}
+
+SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
+ SUnit *SU = 0;
+ IsTopNode = true;
+ NextInstKind = IDOther;
+
+ // check if we might want to switch current clause type
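+  // Switching to an ALU clause is allowed when the current clause is an
+  // Other clause, has exceeded its instruction limit, or has nothing left
+  // to issue; switching away from an ALU clause is only allowed once it has
+  // exceeded its limit and FETCH or Other work is available.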
+ bool AllowSwitchToAlu = (CurInstKind == IDOther) ||
+ (CurEmitted > InstKindLimit[CurInstKind]) ||
+ (Available[CurInstKind]->empty());
+ bool AllowSwitchFromAlu = (CurEmitted > InstKindLimit[CurInstKind]) &&
+ (!Available[IDFetch]->empty() || !Available[IDOther]->empty());
+
+ if ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
+ (!AllowSwitchFromAlu && CurInstKind == IDAlu)) {
+ // try to pick ALU
+ SU = pickAlu();
+ if (SU) {
+ if (CurEmitted > InstKindLimit[IDAlu])
+ CurEmitted = 0;
+ NextInstKind = IDAlu;
+ }
+ }
+
+ if (!SU) {
+ // try to pick FETCH
+ SU = pickOther(IDFetch);
+ if (SU)
+ NextInstKind = IDFetch;
+ }
+
+ // try to pick other
+ if (!SU) {
+ SU = pickOther(IDOther);
+ if (SU)
+ NextInstKind = IDOther;
+ }
+
+ DEBUG(
+ if (SU) {
+ dbgs() << "picked node: ";
+ SU->dump(DAG);
+ } else {
+ dbgs() << "NO NODE ";
+ for (int i = 0; i < IDLast; ++i) {
+ Available[i]->dump();
+ Pending[i]->dump();
+ }
+ for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
+ const SUnit &S = DAG->SUnits[i];
+ if (!S.isScheduled)
+ S.dump(DAG);
+ }
+ }
+ );
+
+ return SU;
+}
+
+void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+
+ DEBUG(dbgs() << "scheduled: ");
+ DEBUG(SU->dump(DAG));
+
+ if (NextInstKind != CurInstKind) {
+ DEBUG(dbgs() << "Instruction Type Switch\n");
+ if (NextInstKind != IDAlu)
+ OccupedSlotsMask = 15;
+ CurEmitted = 0;
+ CurInstKind = NextInstKind;
+ }
+
+ if (CurInstKind == IDAlu) {
+ switch (getAluKind(SU)) {
+ case AluT_XYZW:
+ CurEmitted += 4;
+ break;
+ case AluDiscarded:
+ break;
+ default: {
+ ++CurEmitted;
+ for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
+ E = SU->getInstr()->operands_end(); It != E; ++It) {
+ MachineOperand &MO = *It;
+ if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+ ++CurEmitted;
+ }
+ }
+ }
+ } else {
+ ++CurEmitted;
+ }
+
+
+ DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
+
+ if (CurInstKind != IDFetch) {
+ MoveUnits(Pending[IDFetch], Available[IDFetch]);
+ }
+ MoveUnits(Pending[IDOther], Available[IDOther]);
+}
+
+void R600SchedStrategy::releaseTopNode(SUnit *SU) {
+ int IK = getInstKind(SU);
+
+ DEBUG(dbgs() << IK << " <= ");
+ DEBUG(SU->dump(DAG));
+
+ Pending[IK]->push(SU);
+}
+
+void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
+}
+
+bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
+ const TargetRegisterClass *RC) const {
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ return RC->contains(Reg);
+ } else {
+ return MRI->getRegClass(Reg) == RC;
+ }
+}
+
+R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
+ MachineInstr *MI = SU->getInstr();
+
+ switch (MI->getOpcode()) {
+ case AMDGPU::INTERP_PAIR_XY:
+ case AMDGPU::INTERP_PAIR_ZW:
+ case AMDGPU::INTERP_VEC_LOAD:
+ return AluT_XYZW;
+ case AMDGPU::COPY:
+ if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {
+ // %vregX = COPY Tn_X is likely to be discarded in favor of an
+      // assignment of Tn_X to %vregX, don't consider it in scheduling
+ return AluDiscarded;
+ }
+ else if (MI->getOperand(1).isUndef()) {
+      // MI will become a KILL, don't consider it in scheduling
+ return AluDiscarded;
+ }
+ default:
+ break;
+ }
+
+ // Does the instruction take a whole IG ?
+ if(TII->isVector(*MI) ||
+ TII->isCubeOp(MI->getOpcode()) ||
+ TII->isReductionOp(MI->getOpcode()))
+ return AluT_XYZW;
+
+ // Is the result already assigned to a channel ?
+ unsigned DestSubReg = MI->getOperand(0).getSubReg();
+ switch (DestSubReg) {
+ case AMDGPU::sub0:
+ return AluT_X;
+ case AMDGPU::sub1:
+ return AluT_Y;
+ case AMDGPU::sub2:
+ return AluT_Z;
+ case AMDGPU::sub3:
+ return AluT_W;
+ default:
+ break;
+ }
+
+  // Is the result already a member of an X/Y/Z/W class ?
+ unsigned DestReg = MI->getOperand(0).getReg();
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
+ regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
+ return AluT_X;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
+ return AluT_Y;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
+ return AluT_Z;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
+ return AluT_W;
+ if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
+ return AluT_XYZW;
+
+ return AluAny;
+
+}
+
+int R600SchedStrategy::getInstKind(SUnit* SU) {
+ int Opcode = SU->getInstr()->getOpcode();
+
+ if (TII->isALUInstr(Opcode)) {
+ return IDAlu;
+ }
+
+ switch (Opcode) {
+ case AMDGPU::COPY:
+ case AMDGPU::CONST_COPY:
+ case AMDGPU::INTERP_PAIR_XY:
+ case AMDGPU::INTERP_PAIR_ZW:
+ case AMDGPU::INTERP_VEC_LOAD:
+ case AMDGPU::DOT4_eg_pseudo:
+ case AMDGPU::DOT4_r600_pseudo:
+ return IDAlu;
+ case AMDGPU::TEX_VTX_CONSTBUF:
+ case AMDGPU::TEX_VTX_TEXBUF:
+ case AMDGPU::TEX_LD:
+ case AMDGPU::TEX_GET_TEXTURE_RESINFO:
+ case AMDGPU::TEX_GET_GRADIENTS_H:
+ case AMDGPU::TEX_GET_GRADIENTS_V:
+ case AMDGPU::TEX_SET_GRADIENTS_H:
+ case AMDGPU::TEX_SET_GRADIENTS_V:
+ case AMDGPU::TEX_SAMPLE:
+ case AMDGPU::TEX_SAMPLE_C:
+ case AMDGPU::TEX_SAMPLE_L:
+ case AMDGPU::TEX_SAMPLE_C_L:
+ case AMDGPU::TEX_SAMPLE_LB:
+ case AMDGPU::TEX_SAMPLE_C_LB:
+ case AMDGPU::TEX_SAMPLE_G:
+ case AMDGPU::TEX_SAMPLE_C_G:
+ case AMDGPU::TXD:
+ case AMDGPU::TXD_SHADOW:
+ return IDFetch;
+ default:
+ DEBUG(
+ dbgs() << "other inst: ";
+ SU->dump(DAG);
+ );
+ return IDOther;
+ }
+}
+
+SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) {
+ if (Q.empty())
+ return NULL;
+  for (std::multiset<SUnit *, CompareSUnit>::iterator It = Q.begin(), E = Q.end();
+ It != E; ++It) {
+ SUnit *SU = *It;
+ InstructionsGroupCandidate.push_back(SU->getInstr());
+ if (TII->canBundle(InstructionsGroupCandidate)) {
+ InstructionsGroupCandidate.pop_back();
+ Q.erase(It);
+ return SU;
+ } else {
+ InstructionsGroupCandidate.pop_back();
+ }
+ }
+ return NULL;
+}
+
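+// Move everything pending in the ALU queue into the per-slot buckets
+// (AvailableAlus) that pickAlu() chooses from.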
+void R600SchedStrategy::LoadAlu() {
+ ReadyQueue *QSrc = Pending[IDAlu];
+ for (ReadyQueue::iterator I = QSrc->begin(),
+ E = QSrc->end(); I != E; ++I) {
+ (*I)->NodeQueueId &= ~QSrc->getID();
+ AluKind AK = getAluKind(*I);
+ AvailableAlus[AK].insert(*I);
+ }
+ QSrc->clear();
+}
+
+void R600SchedStrategy::PrepareNextSlot() {
+ DEBUG(dbgs() << "New Slot\n");
+ assert (OccupedSlotsMask && "Slot wasn't filled");
+ OccupedSlotsMask = 0;
+ InstructionsGroupCandidate.clear();
+ LoadAlu();
+}
+
+void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
+ unsigned DestReg = MI->getOperand(0).getReg();
+ // PressureRegister crashes if an operand is def and used in the same inst
+  // and we try to constrain its regclass
+ for (MachineInstr::mop_iterator It = MI->operands_begin(),
+ E = MI->operands_end(); It != E; ++It) {
+ MachineOperand &MO = *It;
+ if (MO.isReg() && !MO.isDef() &&
+ MO.getReg() == MI->getOperand(0).getReg())
+ return;
+ }
+ // Constrains the regclass of DestReg to assign it to Slot
+ switch (Slot) {
+ case 0:
+ MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
+ break;
+ case 1:
+ MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
+ break;
+ case 2:
+ MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
+ break;
+ case 3:
+ MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
+ break;
+ }
+}
+
+SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
+ static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
+ SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
+ SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
+ if (!UnslotedSU) {
+ return SlotedSU;
+ } else if (!SlotedSU) {
+ AssignSlot(UnslotedSU->getInstr(), Slot);
+ return UnslotedSU;
+ } else {
+ //Determine which one to pick (the lesser one)
+ if (CompareSUnit()(SlotedSU, UnslotedSU)) {
+ AvailableAlus[AluAny].insert(UnslotedSU);
+ return SlotedSU;
+ } else {
+ AvailableAlus[IndexToID[Slot]].insert(SlotedSU);
+ AssignSlot(UnslotedSU->getInstr(), Slot);
+ return UnslotedSU;
+ }
+ }
+}
+
+bool R600SchedStrategy::isAvailablesAluEmpty() const {
+ return Pending[IDAlu]->empty() && AvailableAlus[AluAny].empty() &&
+ AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
+ AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
+ AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty();
+}
+
+SUnit* R600SchedStrategy::pickAlu() {
+ while (!isAvailablesAluEmpty()) {
+ if (!OccupedSlotsMask) {
+ // Flush physical reg copies (RA will discard them)
+ if (!AvailableAlus[AluDiscarded].empty()) {
+ OccupedSlotsMask = 15;
+ return PopInst(AvailableAlus[AluDiscarded]);
+ }
+ // If there is a T_XYZW alu available, use it
+ if (!AvailableAlus[AluT_XYZW].empty()) {
+ OccupedSlotsMask = 15;
+ return PopInst(AvailableAlus[AluT_XYZW]);
+ }
+ }
+ for (unsigned Chan = 0; Chan < 4; ++Chan) {
+ bool isOccupied = OccupedSlotsMask & (1 << Chan);
+ if (!isOccupied) {
+ SUnit *SU = AttemptFillSlot(Chan);
+ if (SU) {
+ OccupedSlotsMask |= (1 << Chan);
+ InstructionsGroupCandidate.push_back(SU->getInstr());
+ return SU;
+ }
+ }
+ }
+ PrepareNextSlot();
+ }
+ return NULL;
+}
+
+SUnit* R600SchedStrategy::pickOther(int QID) {
+ SUnit *SU = 0;
+ ReadyQueue *AQ = Available[QID];
+
+ if (AQ->empty()) {
+ MoveUnits(Pending[QID], AQ);
+ }
+ if (!AQ->empty()) {
+ SU = *AQ->begin();
+ AQ->remove(AQ->begin());
+ }
+ return SU;
+}
+
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
new file mode 100644
index 0000000..3d0367f
--- /dev/null
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -0,0 +1,120 @@
+//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600MACHINESCHEDULER_H_
+#define R600MACHINESCHEDULER_H_
+
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/PriorityQueue.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+class CompareSUnit {
+public:
+ bool operator()(const SUnit *S1, const SUnit *S2) {
+ return S1->getDepth() > S2->getDepth();
+ }
+};
+
+class R600SchedStrategy : public MachineSchedStrategy {
+
+ const ScheduleDAGMI *DAG;
+ const R600InstrInfo *TII;
+ const R600RegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+ enum InstQueue {
+ QAlu = 1,
+ QFetch = 2,
+ QOther = 4
+ };
+
+ enum InstKind {
+ IDAlu,
+ IDFetch,
+ IDOther,
+ IDLast
+ };
+
+ enum AluKind {
+ AluAny,
+ AluT_X,
+ AluT_Y,
+ AluT_Z,
+ AluT_W,
+ AluT_XYZW,
+ AluDiscarded, // LLVM Instructions that are going to be eliminated
+ AluLast
+ };
+
+ ReadyQueue *Available[IDLast], *Pending[IDLast];
+ std::multiset<SUnit *, CompareSUnit> AvailableAlus[AluLast];
+
+ InstKind CurInstKind;
+ int CurEmitted;
+ InstKind NextInstKind;
+
+ int InstKindLimit[IDLast];
+
+ int OccupedSlotsMask;
+
+public:
+ R600SchedStrategy() :
+ DAG(0), TII(0), TRI(0), MRI(0) {
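+    // Each queue gets a distinct ID bit; the Pending queues reuse the same
+    // bits shifted left so an SUnit's NodeQueueId identifies the queue that
+    // currently holds it.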
+ Available[IDAlu] = new ReadyQueue(QAlu, "AAlu");
+ Available[IDFetch] = new ReadyQueue(QFetch, "AFetch");
+ Available[IDOther] = new ReadyQueue(QOther, "AOther");
+ Pending[IDAlu] = new ReadyQueue(QAlu<<4, "PAlu");
+ Pending[IDFetch] = new ReadyQueue(QFetch<<4, "PFetch");
+ Pending[IDOther] = new ReadyQueue(QOther<<4, "POther");
+ }
+
+ virtual ~R600SchedStrategy() {
+ for (unsigned I = 0; I < IDLast; ++I) {
+ delete Available[I];
+ delete Pending[I];
+ }
+ }
+
+ virtual void initialize(ScheduleDAGMI *dag);
+ virtual SUnit *pickNode(bool &IsTopNode);
+ virtual void schedNode(SUnit *SU, bool IsTopNode);
+ virtual void releaseTopNode(SUnit *SU);
+ virtual void releaseBottomNode(SUnit *SU);
+
+private:
+ std::vector<MachineInstr *> InstructionsGroupCandidate;
+
+ int getInstKind(SUnit *SU);
+ bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
+ AluKind getAluKind(SUnit *SU) const;
+ void LoadAlu();
+ bool isAvailablesAluEmpty() const;
+ SUnit *AttemptFillSlot (unsigned Slot);
+ void PrepareNextSlot();
+ SUnit *PopInst(std::multiset<SUnit *, CompareSUnit> &Q);
+
+ void AssignSlot(MachineInstr *MI, unsigned Slot);
+ SUnit* pickAlu();
+ SUnit* pickOther(int QID);
+ void MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst);
+};
+
+} // namespace llvm
+
+#endif /* R600MACHINESCHEDULER_H_ */
diff --git a/lib/Target/R600/SIAssignInterpRegs.cpp b/lib/Target/R600/SIAssignInterpRegs.cpp
deleted file mode 100644
index 832e44d..0000000
--- a/lib/Target/R600/SIAssignInterpRegs.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//===-- SIAssignInterpRegs.cpp - Assign interpolation registers -----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief This pass maps the pseudo interpolation registers to the correct physical
-/// registers.
-//
-/// Prior to executing a fragment shader, the GPU loads interpolation
-/// parameters into physical registers. The specific physical register that each
-/// interpolation parameter ends up in depends on the type of the interpolation
-/// parameter as well as how many interpolation parameters are used by the
-/// shader.
-//
-//===----------------------------------------------------------------------===//
-
-
-
-#include "AMDGPU.h"
-#include "AMDIL.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
-using namespace llvm;
-
-namespace {
-
-class SIAssignInterpRegsPass : public MachineFunctionPass {
-
-private:
- static char ID;
- TargetMachine &TM;
-
- void addLiveIn(MachineFunction * MF, MachineRegisterInfo & MRI,
- unsigned physReg, unsigned virtReg);
-
-public:
- SIAssignInterpRegsPass(TargetMachine &tm) :
- MachineFunctionPass(ID), TM(tm) { }
-
- virtual bool runOnMachineFunction(MachineFunction &MF);
-
- const char *getPassName() const { return "SI Assign intrpolation registers"; }
-};
-
-} // End anonymous namespace
-
-char SIAssignInterpRegsPass::ID = 0;
-
-#define INTERP_VALUES 16
-#define REQUIRED_VALUE_MAX_INDEX 7
-
-struct InterpInfo {
- bool Enabled;
- unsigned Regs[3];
- unsigned RegCount;
-};
-
-
-FunctionPass *llvm::createSIAssignInterpRegsPass(TargetMachine &tm) {
- return new SIAssignInterpRegsPass(tm);
-}
-
-bool SIAssignInterpRegsPass::runOnMachineFunction(MachineFunction &MF) {
-
- struct InterpInfo InterpUse[INTERP_VALUES] = {
- {false, {AMDGPU::PERSP_SAMPLE_I, AMDGPU::PERSP_SAMPLE_J}, 2},
- {false, {AMDGPU::PERSP_CENTER_I, AMDGPU::PERSP_CENTER_J}, 2},
- {false, {AMDGPU::PERSP_CENTROID_I, AMDGPU::PERSP_CENTROID_J}, 2},
- {false, {AMDGPU::PERSP_I_W, AMDGPU::PERSP_J_W, AMDGPU::PERSP_1_W}, 3},
- {false, {AMDGPU::LINEAR_SAMPLE_I, AMDGPU::LINEAR_SAMPLE_J}, 2},
- {false, {AMDGPU::LINEAR_CENTER_I, AMDGPU::LINEAR_CENTER_J}, 2},
- {false, {AMDGPU::LINEAR_CENTROID_I, AMDGPU::LINEAR_CENTROID_J}, 2},
- {false, {AMDGPU::LINE_STIPPLE_TEX_COORD}, 1},
- {false, {AMDGPU::POS_X_FLOAT}, 1},
- {false, {AMDGPU::POS_Y_FLOAT}, 1},
- {false, {AMDGPU::POS_Z_FLOAT}, 1},
- {false, {AMDGPU::POS_W_FLOAT}, 1},
- {false, {AMDGPU::FRONT_FACE}, 1},
- {false, {AMDGPU::ANCILLARY}, 1},
- {false, {AMDGPU::SAMPLE_COVERAGE}, 1},
- {false, {AMDGPU::POS_FIXED_PT}, 1}
- };
-
- SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
- // This pass is only needed for pixel shaders.
- if (MFI->ShaderType != ShaderType::PIXEL) {
- return false;
- }
- MachineRegisterInfo &MRI = MF.getRegInfo();
- bool ForceEnable = true;
-
- // First pass, mark the interpolation values that are used.
- for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
- for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
- RegIdx++) {
- InterpUse[InterpIdx].Enabled = InterpUse[InterpIdx].Enabled ||
- !MRI.use_empty(InterpUse[InterpIdx].Regs[RegIdx]);
- if (InterpUse[InterpIdx].Enabled &&
- InterpIdx <= REQUIRED_VALUE_MAX_INDEX) {
- ForceEnable = false;
- }
- }
- }
-
- // At least one interpolation mode must be enabled or else the GPU will hang.
- if (ForceEnable) {
- InterpUse[0].Enabled = true;
- }
-
- unsigned UsedVgprs = 0;
-
- // Second pass, replace with VGPRs.
- for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
- if (!InterpUse[InterpIdx].Enabled) {
- continue;
- }
- MFI->SPIPSInputAddr |= (1 << InterpIdx);
-
- for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
- RegIdx++, UsedVgprs++) {
- unsigned NewReg = AMDGPU::VReg_32RegClass.getRegister(UsedVgprs);
- unsigned VirtReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
- MRI.replaceRegWith(InterpUse[InterpIdx].Regs[RegIdx], VirtReg);
- addLiveIn(&MF, MRI, NewReg, VirtReg);
- }
- }
-
- return false;
-}
-
-void SIAssignInterpRegsPass::addLiveIn(MachineFunction * MF,
- MachineRegisterInfo & MRI,
- unsigned physReg, unsigned virtReg) {
- const TargetInstrInfo * TII = TM.getInstrInfo();
- if (!MRI.isLiveIn(physReg)) {
- MRI.addLiveIn(physReg, virtReg);
- MF->front().addLiveIn(physReg);
- BuildMI(MF->front(), MF->front().begin(), DebugLoc(),
- TII->get(TargetOpcode::COPY), virtReg)
- .addReg(physReg);
- } else {
- MRI.replaceRegWith(virtReg, MRI.getLiveInVirtReg(physReg));
- }
-}
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 0a0fbd9..93f8c38 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -14,10 +14,13 @@
#include "SIISelLowering.h"
#include "AMDIL.h"
+#include "AMDGPU.h"
#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -28,30 +31,41 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
AMDGPUTargetLowering(TM),
TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())),
TRI(TM.getRegisterInfo()) {
- addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
- addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
- addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
- addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
+
addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
+ addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
+
+ addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass);
+ addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
+ addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
+
+ addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
+ addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass);
+
addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass);
+ addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+
addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+
addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+
addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
computeRegisterProperties();
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+
setOperationAction(ISD::ADD, MVT::i64, Legal);
setOperationAction(ISD::ADD, MVT::i32, Legal);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
-
- // We need to custom lower loads from the USER_SGPR address space, so we can
- // add the SGPRs as livein registers.
- setOperationAction(ISD::LOAD, MVT::i32, Custom);
- setOperationAction(ISD::LOAD, MVT::i64, Custom);
-
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -59,6 +73,137 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
+
+ setSchedulingPreference(Sched::Source);
+}
+
+SDValue SITargetLowering::LowerFormalArguments(
+ SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ FunctionType *FType = MF.getFunction()->getFunctionType();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ assert(CallConv == CallingConv::C);
+
+ SmallVector<ISD::InputArg, 16> Splits;
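+  // Bit i is set when the i-th incoming argument is an unused PS input;
+  // such arguments get no register and are materialized as empty SDValues
+  // below.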
+ uint32_t Skipped = 0;
+
+ for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
+ const ISD::InputArg &Arg = Ins[i];
+
+ // First check if it's a PS input addr
+ if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) {
+
+ assert((PSInputNum <= 15) && "Too many PS inputs!");
+
+ if (!Arg.Used) {
+        // We can safely skip PS inputs
+ Skipped |= 1 << i;
+ ++PSInputNum;
+ continue;
+ }
+
+ Info->PSInputAddr |= 1 << PSInputNum++;
+ }
+
+    // Second, split vertices into their elements
+ if (Arg.VT.isVector()) {
+ ISD::InputArg NewArg = Arg;
+ NewArg.Flags.setSplit();
+ NewArg.VT = Arg.VT.getVectorElementType();
+
+ // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+ // three or five element vertex only needs three or five registers,
+      // NOT four or eight.
+ Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+ unsigned NumElements = ParamType->getVectorNumElements();
+
+ for (unsigned j = 0; j != NumElements; ++j) {
+ Splits.push_back(NewArg);
+ NewArg.PartOffset += NewArg.VT.getStoreSize();
+ }
+
+ } else {
+ Splits.push_back(Arg);
+ }
+ }
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext());
+
+ // At least one interpolation mode must be enabled or else the GPU will hang.
+ if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
+ Info->PSInputAddr |= 1;
+ CCInfo.AllocateReg(AMDGPU::VGPR0);
+ CCInfo.AllocateReg(AMDGPU::VGPR1);
+ }
+
+ AnalyzeFormalArguments(CCInfo, Splits);
+
+ for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+
+ if (Skipped & (1 << i)) {
+ InVals.push_back(SDValue());
+ continue;
+ }
+
+ CCValAssign &VA = ArgLocs[ArgIdx++];
+ assert(VA.isRegLoc() && "Parameter must be in a register!");
+
+ unsigned Reg = VA.getLocReg();
+ MVT VT = VA.getLocVT();
+
+ if (VT == MVT::i64) {
+ // For now assume it is a pointer
+ Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
+ &AMDGPU::SReg_64RegClass);
+ Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
+ InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+ continue;
+ }
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+
+ Reg = MF.addLiveIn(Reg, RC);
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+
+ const ISD::InputArg &Arg = Ins[i];
+ if (Arg.VT.isVector()) {
+
+ // Build a vector from the registers
+ Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+ unsigned NumElements = ParamType->getVectorNumElements();
+
+ SmallVector<SDValue, 4> Regs;
+ Regs.push_back(Val);
+ for (unsigned j = 1; j != NumElements; ++j) {
+ Reg = ArgLocs[ArgIdx++].getLocReg();
+ Reg = MF.addLiveIn(Reg, RC);
+ Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+ }
+
+ // Fill up the missing vector elements
+ NumElements = Arg.VT.getVectorNumElements() - NumElements;
+ for (unsigned j = 0; j != NumElements; ++j)
+ Regs.push_back(DAG.getUNDEF(VT));
+
+ InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
+ Regs.data(), Regs.size()));
+ continue;
+ }
+
+ InVals.push_back(Val);
+ }
+ return Chain;
}
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
@@ -70,15 +215,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case AMDGPU::BRANCH: return BB;
- case AMDGPU::SHADER_TYPE:
- BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType =
- MI->getOperand(0).getImm();
- MI->eraseFromParent();
- break;
-
- case AMDGPU::SI_INTERP:
- LowerSI_INTERP(MI, *BB, I, MRI);
- break;
case AMDGPU::SI_WQM:
LowerSI_WQM(MI, *BB, I, MRI);
break;
@@ -94,41 +230,14 @@ void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
MI->eraseFromParent();
}
-void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
- MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
- unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
- unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
- MachineOperand dst = MI->getOperand(0);
- MachineOperand iReg = MI->getOperand(1);
- MachineOperand jReg = MI->getOperand(2);
- MachineOperand attr_chan = MI->getOperand(3);
- MachineOperand attr = MI->getOperand(4);
- MachineOperand params = MI->getOperand(5);
-
- BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
- .addOperand(params);
-
- BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp)
- .addOperand(iReg)
- .addOperand(attr_chan)
- .addOperand(attr)
- .addReg(M0);
-
- BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32))
- .addOperand(dst)
- .addReg(tmp)
- .addOperand(jReg)
- .addOperand(attr_chan)
- .addOperand(attr)
- .addReg(M0);
-
- MI->eraseFromParent();
-}
-
EVT SITargetLowering::getSetCCResultType(EVT VT) const {
return MVT::i1;
}
+MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
+ return MVT::i32;
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//
@@ -137,20 +246,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
- case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
- case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntrinsicID =
- cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- EVT VT = Op.getValueType();
- switch (IntrinsicID) {
- case AMDGPUIntrinsic::SI_vs_load_buffer_index:
- return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
- AMDGPU::VGPR0, VT);
- default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
- }
- break;
- }
}
return SDValue();
}
@@ -249,47 +345,6 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
-SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- LoadSDNode *Ptr = dyn_cast<LoadSDNode>(Op);
-
- assert(Ptr);
-
- unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace();
-
- // We only need to lower USER_SGPR address space loads
- if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) {
- return SDValue();
- }
-
- // Loads from the USER_SGPR address space can only have constant value
- // pointers.
- ConstantSDNode *BasePtr = dyn_cast<ConstantSDNode>(Ptr->getBasePtr());
- assert(BasePtr);
-
- unsigned TypeDwordWidth = VT.getSizeInBits() / 32;
- const TargetRegisterClass * dstClass;
- switch (TypeDwordWidth) {
- default:
- assert(!"USER_SGPR value size not implemented");
- return SDValue();
- case 1:
- dstClass = &AMDGPU::SReg_32RegClass;
- break;
- case 2:
- dstClass = &AMDGPU::SReg_64RegClass;
- break;
- }
- uint64_t Index = BasePtr->getZExtValue();
- assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned");
- unsigned SGPRIndex = Index / TypeDwordWidth;
- unsigned Reg = dstClass->getRegister(SGPRIndex);
-
- DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg,
- VT));
- return SDValue();
-}
-
SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 737162f..d656225 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -24,14 +24,9 @@ class SITargetLowering : public AMDGPUTargetLowering {
const SIInstrInfo * TII;
const TargetRegisterInfo * TRI;
- void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB,
- MachineBasicBlock::iterator I, unsigned Opocde) const;
- void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
- MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
- SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -43,9 +38,17 @@ class SITargetLowering : public AMDGPUTargetLowering {
public:
SITargetLowering(TargetMachine &tm);
+
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
MachineBasicBlock * BB) const;
virtual EVT getSetCCResultType(EVT VT) const;
+ virtual MVT getScalarShiftAmountTy(EVT VT) const;
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 24fc929..98bd3db 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -88,6 +88,9 @@ private:
MachineBasicBlock::iterator I,
const Counters &Counts);
+ /// \brief Do we need def2def checks?
+ bool unorderedDefines(MachineInstr &MI);
+
/// \brief Resolve all operand dependencies to counter requirements
Counters handleOperands(MachineInstr &MI);
@@ -125,7 +128,7 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
// Only consider stores or EXP for EXP_CNT
Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
- (MI.getOpcode() == AMDGPU::EXP || !MI.getDesc().mayStore()));
+ (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
  // LGKM may use larger values
if (TSFlags & SIInstrFlags::LGKM_CNT) {
@@ -311,8 +314,10 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
RegInterval Interval = getRegInterval(Op);
for (unsigned j = Interval.first; j < Interval.second; ++j) {
- if (Op.isDef())
+ if (Op.isDef()) {
increaseCounters(Result, UsedRegs[j]);
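+        // A def must also wait for earlier outstanding defs of the same
+        // register (def-to-def dependency).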
+ increaseCounters(Result, DefinedRegs[j]);
+ }
if (Op.isUse())
increaseCounters(Result, DefinedRegs[j]);
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index fe417d6..3891ddb 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -129,12 +129,12 @@ class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
list<dag> pattern> : Enc32<outs, ins, asm, pattern> {
bits<7> SDST;
- bits<6> SBASE;
+ bits<7> SBASE;
bits<8> OFFSET;
let Inst{7-0} = OFFSET;
let Inst{8} = imm;
- let Inst{14-9} = SBASE;
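+  // SBASE is an aligned SGPR pair, so only the register number divided by
+  // two is encoded (the low bit is dropped).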
+ let Inst{14-9} = SBASE{6-1};
let Inst{21-15} = SDST;
let Inst{26-22} = op;
let Inst{31-27} = 0x18; //encoding
@@ -292,7 +292,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
bits<1> ADDR64;
bits<1> LDS;
bits<8> VADDR;
- bits<5> SRSRC;
+ bits<7> SRSRC;
bits<1> SLC;
bits<1> TFE;
bits<8> SOFFSET;
@@ -307,7 +307,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{31-26} = 0x38; //encoding
let Inst{39-32} = VADDR;
let Inst{47-40} = VDATA;
- let Inst{52-48} = SRSRC;
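+  // SRSRC is a 128-bit resource descriptor held in four aligned SGPRs, so
+  // only the register number divided by four is encoded.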
+ let Inst{52-48} = SRSRC{6-2};
let Inst{54} = SLC;
let Inst{55} = TFE;
let Inst{63-56} = SOFFSET;
@@ -330,7 +330,7 @@ class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
bits<4> DFMT;
bits<3> NFMT;
bits<8> VADDR;
- bits<5> SRSRC;
+ bits<7> SRSRC;
bits<1> SLC;
bits<1> TFE;
bits<8> SOFFSET;
@@ -346,7 +346,7 @@ class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = VADDR;
let Inst{47-40} = VDATA;
- let Inst{52-48} = SRSRC;
+ let Inst{52-48} = SRSRC{6-2};
let Inst{54} = SLC;
let Inst{55} = TFE;
let Inst{63-56} = SOFFSET;
@@ -370,8 +370,8 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
bits<1> LWE;
bits<1> SLC;
bits<8> VADDR;
- bits<5> SRSRC;
- bits<5> SSAMP;
+ bits<7> SRSRC;
+ bits<7> SSAMP;
let Inst{11-8} = DMASK;
let Inst{12} = UNORM;
@@ -385,8 +385,8 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{31-26} = 0x3c;
let Inst{39-32} = VADDR;
let Inst{47-40} = VDATA;
- let Inst{52-48} = SRSRC;
- let Inst{57-53} = SSAMP;
+ let Inst{52-48} = SRSRC{6-2};
+ let Inst{57-53} = SSAMP{6-2};
let VM_CNT = 1;
let EXP_CNT = 1;
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index d9dbd6a..de2373b 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -33,36 +33,108 @@ const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const {
void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const {
+
// If we are trying to copy to or from SCC, there is a bug somewhere else in
// the backend. While it may be theoretically possible to do this, it should
// never be necessary.
assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
- if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
- assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
- AMDGPU::SReg_64RegClass.contains(SrcReg));
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub0))
- .addReg(RI.getSubReg(SrcReg, AMDGPU::sub0), getKillRegState(KillSrc))
- .addReg(DestReg, RegState::Define | RegState::Implicit);
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub1))
- .addReg(RI.getSubReg(SrcReg, AMDGPU::sub1), getKillRegState(KillSrc));
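+  // Zero-terminated lists of 32-bit sub-register indices, used below to
+  // expand copies of wide register classes into per-dword moves.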
+ const int16_t Sub0_15[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
+ };
+
+ const int16_t Sub0_7[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
+ };
+
+ const int16_t Sub0_3[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
+ };
+
+ const int16_t Sub0_1[] = {
+ AMDGPU::sub0, AMDGPU::sub1, 0
+ };
+
+ unsigned Opcode;
+ const int16_t *SubIndices;
+
+ if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
+ assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+
} else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+
+ } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
+ assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
+ Opcode = AMDGPU::S_MOV_B32;
+ SubIndices = Sub0_3;
+
+ } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
+ assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
+ Opcode = AMDGPU::S_MOV_B32;
+ SubIndices = Sub0_7;
+
+ } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
+ assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
+ Opcode = AMDGPU::S_MOV_B32;
+ SubIndices = Sub0_15;
+
} else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
- AMDGPU::SReg_32RegClass.contains(SrcReg));
+ AMDGPU::SReg_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+
+ } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
+ assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_64RegClass.contains(SrcReg));
+ Opcode = AMDGPU::V_MOV_B32_e32;
+ SubIndices = Sub0_1;
+
+ } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
+ assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_128RegClass.contains(SrcReg));
+ Opcode = AMDGPU::V_MOV_B32_e32;
+ SubIndices = Sub0_3;
+
+ } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
+ assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_256RegClass.contains(SrcReg));
+ Opcode = AMDGPU::V_MOV_B32_e32;
+ SubIndices = Sub0_7;
+
+ } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
+ assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_512RegClass.contains(SrcReg));
+ Opcode = AMDGPU::V_MOV_B32_e32;
+ SubIndices = Sub0_15;
+
} else {
- assert(AMDGPU::SReg_32RegClass.contains(DestReg));
- assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ llvm_unreachable("Can't copy register!");
+ }
+
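+  // Expand the copy one 32-bit sub-register at a time; every move except
+  // the last also implicitly defines the full destination register so its
+  // liveness is modelled correctly.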
+ while (unsigned SubIdx = *SubIndices++) {
+ MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
+ get(Opcode), RI.getSubReg(DestReg, SubIdx));
+
+ Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));
+
+ if (*SubIndices)
+ Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
}
}
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index d6c3f06..2f10c38 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -53,16 +53,6 @@ def SIOperand {
int VCC = 0x6A;
}
-class GPR4Align <RegisterClass rc> : Operand <vAny> {
- let EncoderMethod = "GPR4AlignEncode";
- let MIOperandInfo = (ops rc:$reg);
-}
-
-class GPR2Align <RegisterClass rc> : Operand <iPTR> {
- let EncoderMethod = "GPR2AlignEncode";
- let MIOperandInfo = (ops rc:$reg);
-}
-
include "SIInstrFormats.td"
//===----------------------------------------------------------------------===//
@@ -125,16 +115,17 @@ class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK <
opName#" $dst, $src0", pattern
>;
-multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass dstClass> {
+multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass,
+ RegisterClass dstClass> {
def _IMM : SMRD <
op, 1, (outs dstClass:$dst),
- (ins GPR2Align<SReg_64>:$sbase, i32imm:$offset),
+ (ins baseClass:$sbase, i32imm:$offset),
asm#" $dst, $sbase, $offset", []
>;
def _SGPR : SMRD <
op, 0, (outs dstClass:$dst),
- (ins GPR2Align<SReg_64>:$sbase, SReg_32:$soff),
+ (ins baseClass:$sbase, SReg_32:$soff),
asm#" $dst, $sbase, $soff", []
>;
}
@@ -276,7 +267,7 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
(outs),
(ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
- GPR4Align<SReg_128>:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
+ SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
#" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
[]> {
@@ -288,7 +279,7 @@ class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF
op,
(outs regClass:$dst),
(ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
- i1imm:$lds, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, i1imm:$slc,
+ i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc,
i1imm:$tfe, SSrc_32:$soffset),
asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, "
#"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset",
@@ -301,7 +292,7 @@ class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF
op,
(outs regClass:$dst),
(ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
- i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc,
+ i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc,
i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
#" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
@@ -315,7 +306,7 @@ class MIMG_Load_Helper <bits<7> op, string asm> : MIMG <
(outs VReg_128:$vdata),
(ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_32:$vaddr,
- GPR4Align<SReg_256>:$srsrc, GPR4Align<SReg_128>:$ssamp),
+ SReg_256:$srsrc, SReg_128:$ssamp),
asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
#" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
[]> {
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index af116f0..05b04a9 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -403,9 +403,9 @@ def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT
//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
-//def BUFFER_LOAD_DWORD : MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>;
-//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>;
-//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>;
+def BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
+def BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
+def BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>;
@@ -458,17 +458,31 @@ def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORM
let mayLoad = 1 in {
-defm S_LOAD_DWORD : SMRD_Helper <0x00000000, "S_LOAD_DWORD", SReg_32>;
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SReg_32>;
+defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
-//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>;
-defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128>;
-defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256>;
-//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>;
-//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>;
-//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>;
-//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>;
-//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>;
-//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>;
+defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
+ 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SReg_32
+>;
+
+defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
+ 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64
+>;
+
+defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
+ 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128
+>;
+
+defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
+ 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256
+>;
+
+defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
+ 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
+>;
} // mayLoad = 1
@@ -840,7 +854,9 @@ defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>;
defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>;
defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>;
-defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>;
+defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
+ [(set VReg_32:$dst, (shl VSrc_32:$src0, (i32 VReg_32:$src1)))]
+>;
defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>;
let isCommutable = 1 in {
@@ -1044,13 +1060,6 @@ def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
let isCodeGenOnly = 1, isPseudo = 1 in {
-def SET_M0 : InstSI <
- (outs SReg_32:$dst),
- (ins i32imm:$src0),
- "SET_M0 $dst, $src0",
- [(set SReg_32:$dst, (int_SI_set_M0 imm:$src0))]
->;
-
def LOAD_CONST : AMDGPUShaderInst <
(outs GPRF32:$dst),
(ins i32imm:$src),
@@ -1060,13 +1069,6 @@ def LOAD_CONST : AMDGPUShaderInst <
let usesCustomInserter = 1 in {
-def SI_INTERP : InstSI <
- (outs VReg_32:$dst),
- (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
- "SI_INTERP $dst, $i, $j, $attr_chan, $attr, $params",
- []
->;
-
def SI_WQM : InstSI <
(outs),
(ins),
@@ -1147,6 +1149,31 @@ def SI_KILL : InstSI <
} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
// Uses = [EXEC], Defs = [EXEC]
+let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
+
+def SI_INDIRECT_SRC : InstSI <
+ (outs VReg_32:$dst, SReg_64:$temp),
+ (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
+ "SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off",
+ []
+>;
+
+class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
+ (outs rc:$dst, SReg_64:$temp),
+ (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val),
+ "SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val",
+ []
+> {
+ let Constraints = "$src = $dst";
+}
+
+def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
+def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
+def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
+def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
+
+} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
+
} // end IsCodeGenOnly, isPseudo
def : Pat<
@@ -1255,22 +1282,83 @@ defm : SamplePatterns<VReg_128, v4i32>;
defm : SamplePatterns<VReg_256, v8i32>;
defm : SamplePatterns<VReg_512, v16i32>;
-def : Extract_Element <f32, v4f32, VReg_128, 0, sub0>;
-def : Extract_Element <f32, v4f32, VReg_128, 1, sub1>;
-def : Extract_Element <f32, v4f32, VReg_128, 2, sub2>;
-def : Extract_Element <f32, v4f32, VReg_128, 3, sub3>;
+/********** ============================================ **********/
+/********** Extraction, Insertion, Building and Casting **********/
+/********** ============================================ **********/
+
+foreach Index = 0-1 in {
+ def Extract_Element_v2i32_#Index : Extract_Element <
+ i32, v2i32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v2i32_#Index : Insert_Element <
+ i32, v2i32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v2f32_#Index : Extract_Element <
+ f32, v2f32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v2f32_#Index : Insert_Element <
+ f32, v2f32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+foreach Index = 0-3 in {
+ def Extract_Element_v4i32_#Index : Extract_Element <
+ i32, v4i32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v4i32_#Index : Insert_Element <
+ i32, v4i32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+ >;
-def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sub0>;
-def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sub1>;
-def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sub2>;
-def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sub3>;
+ def Extract_Element_v4f32_#Index : Extract_Element <
+ f32, v4f32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v4f32_#Index : Insert_Element <
+ f32, v4f32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+foreach Index = 0-7 in {
+ def Extract_Element_v8i32_#Index : Extract_Element <
+ i32, v8i32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v8i32_#Index : Insert_Element <
+ i32, v8i32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v8f32_#Index : Extract_Element <
+ f32, v8f32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v8f32_#Index : Insert_Element <
+ f32, v8f32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+foreach Index = 0-15 in {
+ def Extract_Element_v16i32_#Index : Extract_Element <
+ i32, v16i32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v16i32_#Index : Insert_Element <
+ i32, v16i32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v16f32_#Index : Extract_Element <
+ f32, v16f32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v16f32_#Index : Insert_Element <
+ f32, v16f32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
def : Vector1_Build <v1i32, VReg_32, i32, VReg_32>;
def : Vector2_Build <v2i32, VReg_64, i32, VReg_32>;
-def : Vector_Build <v4f32, VReg_128, f32, VReg_32>;
-def : Vector_Build <v4i32, VReg_128, i32, VReg_32>;
+def : Vector2_Build <v2f32, VReg_64, f32, VReg_32>;
+def : Vector4_Build <v4i32, VReg_128, i32, VReg_32>;
+def : Vector4_Build <v4f32, VReg_128, f32, VReg_32>;
def : Vector8_Build <v8i32, VReg_256, i32, VReg_32>;
+def : Vector8_Build <v8f32, VReg_256, f32, VReg_32>;
def : Vector16_Build <v16i32, VReg_512, i32, VReg_32>;
+def : Vector16_Build <v16f32, VReg_512, f32, VReg_32>;
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <i32, f32, VReg_32>;
@@ -1305,11 +1393,6 @@ def : Pat <
/********** ================== **********/
def : Pat <
- (i1 imm:$imm),
- (S_MOV_B64 imm:$imm)
->;
-
-def : Pat <
(i32 imm:$imm),
(V_MOV_B32_e32 imm:$imm)
>;
@@ -1320,13 +1403,8 @@ def : Pat <
>;
def : Pat <
- (i32 imm:$imm),
- (S_MOV_B32 imm:$imm)
->;
-
-def : Pat <
- (f32 fpimm:$imm),
- (S_MOV_B32 fpimm:$imm)
+ (i1 imm:$imm),
+ (S_MOV_B64 imm:$imm)
>;
def : Pat <
@@ -1347,58 +1425,16 @@ def : Pat <
/********** ===================== **********/
def : Pat <
- (int_SI_fs_interp_constant imm:$attr_chan, imm:$attr, SReg_32:$params),
- (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr,
- (S_MOV_B32 SReg_32:$params))
->;
-
-def : Pat <
- (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params),
- (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan,
- imm:$attr, SReg_32:$params)
+ (int_SI_fs_constant imm:$attr_chan, imm:$attr, M0Reg:$params),
+ (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, M0Reg:$params)
>;
def : Pat <
- (int_SI_fs_interp_linear_centroid imm:$attr_chan, imm:$attr, SReg_32:$params),
- (SI_INTERP (f32 LINEAR_CENTROID_I), (f32 LINEAR_CENTROID_J), imm:$attr_chan,
- imm:$attr, SReg_32:$params)
->;
-
-def : Pat <
- (int_SI_fs_interp_persp_center imm:$attr_chan, imm:$attr, SReg_32:$params),
- (SI_INTERP (f32 PERSP_CENTER_I), (f32 PERSP_CENTER_J), imm:$attr_chan,
- imm:$attr, SReg_32:$params)
->;
-
-def : Pat <
- (int_SI_fs_interp_persp_centroid imm:$attr_chan, imm:$attr, SReg_32:$params),
- (SI_INTERP (f32 PERSP_CENTROID_I), (f32 PERSP_CENTROID_J), imm:$attr_chan,
- imm:$attr, SReg_32:$params)
->;
-
-def : Pat <
- (int_SI_fs_read_face),
- (f32 FRONT_FACE)
->;
-
-def : Pat <
- (int_SI_fs_read_pos 0),
- (f32 POS_X_FLOAT)
->;
-
-def : Pat <
- (int_SI_fs_read_pos 1),
- (f32 POS_Y_FLOAT)
->;
-
-def : Pat <
- (int_SI_fs_read_pos 2),
- (f32 POS_Z_FLOAT)
->;
-
-def : Pat <
- (int_SI_fs_read_pos 3),
- (f32 POS_W_FLOAT)
+ (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, VReg_64:$ij),
+ (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG VReg_64:$ij, sub0),
+ imm:$attr_chan, imm:$attr, M0Reg:$params),
+ (EXTRACT_SUBREG VReg_64:$ij, sub1),
+ imm:$attr_chan, imm:$attr, M0Reg:$params)
>;
/********** ================== **********/
@@ -1455,6 +1491,24 @@ def : Pat <
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0)
>;
+// 1. Offset as 8bit DWORD immediate
+def : Pat <
+ (int_SI_load_const SReg_128:$sbase, IMM8bitDWORD:$offset),
+ (S_BUFFER_LOAD_DWORD_IMM SReg_128:$sbase, IMM8bitDWORD:$offset)
+>;
+
+// 2. Offset loaded in a 32-bit SGPR
+def : Pat <
+ (int_SI_load_const SReg_128:$sbase, imm:$offset),
+ (S_BUFFER_LOAD_DWORD_SGPR SReg_128:$sbase, (S_MOV_B32 imm:$offset))
+>;
+
+// 3. Offset in a 32-bit VGPR
+def : Pat <
+ (int_SI_load_const SReg_128:$sbase, VReg_32:$voff),
+ (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, VReg_32:$voff, SReg_128:$sbase, 0, 0, 0)
+>;
+
/********** ================== **********/
/********** VOP3 Patterns **********/
/********** ================== **********/
@@ -1489,7 +1543,51 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+
+/********** ====================== **********/
+/**********   Indirect addressing   **********/
+/********** ====================== **********/
+
+multiclass SI_INDIRECT_Pattern <RegisterClass rc, ValueType vt,
+ SI_INDIRECT_DST IndDst> {
+ // 1. Extract with offset
+ def : Pat<
+ (vector_extract (vt rc:$vec),
+ (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
+ ),
+ (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off))
+ >;
+
+ // 2. Extract without offset
+ def : Pat<
+ (vector_extract (vt rc:$vec),
+ (i64 (zext (i32 VReg_32:$idx)))
+ ),
+ (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0))
+ >;
+
+ // 3. Insert with offset
+ def : Pat<
+ (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
+ (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
+ ),
+ (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off, VReg_32:$val))
+ >;
+
+ // 4. Insert without offset
+ def : Pat<
+ (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
+ (i64 (zext (i32 VReg_32:$idx)))
+ ),
+ (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0, VReg_32:$val))
+ >;
+}
+
+defm : SI_INDIRECT_Pattern <VReg_64, v2f32, SI_INDIRECT_DST_V2>;
+defm : SI_INDIRECT_Pattern <VReg_128, v4f32, SI_INDIRECT_DST_V4>;
+defm : SI_INDIRECT_Pattern <VReg_256, v8f32, SI_INDIRECT_DST_V8>;
+defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>;
} // End isSI predicate
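
The SI_INDIRECT_Pattern multiclass above matches vector_extract / vector_insert with a run-time VGPR index (optionally plus a constant offset) onto the SI_INDIRECT_SRC / SI_INDIRECT_DST_* pseudos, which the SILowerControlFlow changes further down expand into V_MOVRELS_B32 / V_MOVRELD_B32 sequences. A rough host-side sketch of the semantics being selected (plain C++, illustrative names only, not part of the patch):

#include <cassert>
#include <cstddef>

// Per-lane meaning of the SI_INDIRECT pseudos: a dynamic index plus an
// optional constant offset into a vector held in consecutive VGPRs.
static float indirect_src(const float *vec, size_t n, unsigned idx, unsigned off) {
  assert(idx + off < n && "index must stay inside the vector");
  return vec[idx + off];            // expanded to V_MOVRELS_B32 with M0 = idx
}

static void indirect_dst(float *vec, size_t n, unsigned idx, unsigned off, float val) {
  assert(idx + off < n);
  vec[idx + off] = val;             // expanded to V_MOVRELD_B32 with M0 = idx
}

int main() {
  float v[4] = {0.f, 1.f, 2.f, 3.f};       // the VReg_128 / v4f32 case
  float x = indirect_src(v, 4, 2, 1);      // pattern 1: extract with offset -> v[3]
  indirect_dst(v, 4, 2, 0, x);             // pattern 4: insert without offset -> v[2] = 3
  return v[2] == 3.f ? 0 : 1;
}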
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 611b9c4..33bb815 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -16,13 +16,11 @@ let TargetPrefix = "SI", isTarget = 1 in {
def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
- /* XXX: We may need a seperate intrinsic here for loading integer values */
- def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>;
- def int_SI_vs_load_buffer_index : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
- def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ;
+ def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrReadMem]>;
+ def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ;
def int_SI_wqm : Intrinsic <[], [], []>;
- class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v8i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrReadMem]>;
+ class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrReadMem]>;
def int_SI_sample : Sample;
def int_SI_sampleb : Sample;
@@ -30,17 +28,8 @@ let TargetPrefix = "SI", isTarget = 1 in {
/* Interpolation Intrinsics */
- def int_SI_set_M0 : Intrinsic <[llvm_i32_ty], [llvm_i32_ty]>;
- class Interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>;
-
- def int_SI_fs_interp_linear_center : Interp;
- def int_SI_fs_interp_linear_centroid : Interp;
- def int_SI_fs_interp_persp_center : Interp;
- def int_SI_fs_interp_persp_centroid : Interp;
- def int_SI_fs_interp_constant : Interp;
-
- def int_SI_fs_read_face : Intrinsic <[llvm_float_ty], [], [IntrNoMem]>;
- def int_SI_fs_read_pos : Intrinsic <[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>;
+ def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrReadMem]>;
/* Control flow Intrinsics */
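
The per-mode interpolation intrinsics are folded into just int_SI_fs_constant and int_SI_fs_interp: the (i, j) pair now travels explicitly as a v2i32 operand and M0 carries the interpolation parameters. The pattern earlier in this patch lowers int_SI_fs_interp to a V_INTERP_P1_F32 / V_INTERP_P2_F32 pair, i.e. a two-step plane-equation evaluation. A minimal numeric sketch of that evaluation (assuming the usual SI form attr = P0 + i*P10 + j*P20; values and names are made up):

#include <cstdio>

// One attribute channel's plane equation: P0 is the value at the provoking
// vertex, P10 and P20 are the deltas along the two barycentric directions.
struct PlaneEq { float p0, p10, p20; };

// V_INTERP_P1_F32: tmp = p10 * i + p0
static float interp_p1(const PlaneEq &pe, float i) { return pe.p10 * i + pe.p0; }

// V_INTERP_P2_F32: result = p20 * j + tmp
static float interp_p2(const PlaneEq &pe, float j, float tmp) { return pe.p20 * j + tmp; }

int main() {
  PlaneEq pe = {1.0f, 0.5f, 0.25f};   // made-up attribute channel
  float i = 0.3f, j = 0.6f;           // the ij pair passed as the v2i32 operand
  printf("%f\n", interp_p2(pe, j, interp_p1(pe, i)));
  return 0;
}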
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index b215aa2..9a027e7 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -66,6 +66,7 @@ private:
static const unsigned SkipThreshold = 12;
static char ID;
+ const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
@@ -84,9 +85,14 @@ private:
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
+ void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
+ void IndirectSrc(MachineInstr &MI);
+ void IndirectDst(MachineInstr &MI);
+
public:
SILowerControlFlowPass(TargetMachine &tm) :
- MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
+ MachineFunctionPass(ID), TRI(tm.getRegisterInfo()),
+ TII(tm.getInstrInfo()) { }
virtual bool runOnMachineFunction(MachineFunction &MF);
@@ -302,6 +308,104 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
MI.eraseFromParent();
}
+void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator I = MI;
+
+ unsigned Save = MI.getOperand(1).getReg();
+ unsigned Idx = MI.getOperand(3).getReg();
+
+ if (AMDGPU::SReg_32RegClass.contains(Idx)) {
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(Idx);
+ MBB.insert(I, MovRel);
+ MI.eraseFromParent();
+ return;
+ }
+
+ assert(AMDGPU::SReg_64RegClass.contains(Save));
+ assert(AMDGPU::VReg_32RegClass.contains(Idx));
+
+ // Save the EXEC mask
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+ .addReg(AMDGPU::EXEC);
+
+ // Read the next variant into VCC (lower 32 bits) <- also loop target
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32_e32), AMDGPU::VCC)
+ .addReg(Idx);
+
+ // Move index from VCC into M0
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(AMDGPU::VCC);
+
+ // Compare the just read M0 value to all possible Idx values
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
+ .addReg(AMDGPU::M0)
+ .addReg(Idx);
+
+ // Update EXEC, save the original EXEC value to VCC
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
+
+ // Do the actual move
+ MBB.insert(I, MovRel);
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addImm(-7)
+ .addReg(AMDGPU::EXEC);
+
+ // Restore EXEC
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(Save);
+
+ MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Vec = MI.getOperand(2).getReg();
+ unsigned Off = MI.getOperand(4).getImm();
+
+ MachineInstr *MovRel =
+ BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+ .addReg(TRI->getSubReg(Vec, AMDGPU::sub0) + Off)
+ .addReg(AMDGPU::M0, RegState::Implicit)
+ .addReg(Vec, RegState::Implicit);
+
+ LoadM0(MI, MovRel);
+}
+
+void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned Off = MI.getOperand(4).getImm();
+ unsigned Val = MI.getOperand(5).getReg();
+
+ MachineInstr *MovRel =
+ BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
+ .addReg(TRI->getSubReg(Dst, AMDGPU::sub0) + Off, RegState::Define)
+ .addReg(Val)
+ .addReg(AMDGPU::M0, RegState::Implicit)
+ .addReg(Dst, RegState::Implicit);
+
+ LoadM0(MI, MovRel);
+}
+
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
bool HaveKill = false;
@@ -363,6 +467,17 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::S_BRANCH:
Branch(MI);
break;
+
+ case AMDGPU::SI_INDIRECT_SRC:
+ IndirectSrc(MI);
+ break;
+
+ case AMDGPU::SI_INDIRECT_DST_V2:
+ case AMDGPU::SI_INDIRECT_DST_V4:
+ case AMDGPU::SI_INDIRECT_DST_V8:
+ case AMDGPU::SI_INDIRECT_DST_V16:
+ IndirectDst(MI);
+ break;
}
}
}
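
LoadM0 above is the interesting part of the SILowerControlFlow changes: M0 can hold only one index, so when the index lives in a VGPR (and may differ between lanes) the pass emits a small "waterfall" loop — read the first active lane's index, run the MOVREL for every lane that happens to share it, mask those lanes out of EXEC, and branch back until EXEC is empty, then restore the saved EXEC mask. A host-side C++ simulation of that control flow (wavefront shrunk to 8 lanes; __builtin_ctzll is a GCC/Clang builtin; everything here is illustrative, not compiler code):

#include <cstdint>
#include <cstdio>

int main() {
  const int W = 8;                               // pretend wavefront (real hardware: 64 lanes)
  uint32_t idx[W] = {2, 0, 2, 1, 0, 3, 1, 2};    // per-lane, possibly divergent indices
  float vec[4]    = {10.f, 11.f, 12.f, 13.f};    // the indirectly addressed register vector
  float dst[W]    = {};
  uint64_t exec   = (1ull << W) - 1;             // all lanes active
  uint64_t save   = exec;                        // S_MOV_B64 Save, EXEC

  while (exec) {                                 // loop target
    int first   = __builtin_ctzll(exec);         // V_READFIRSTLANE_B32: first active lane
    uint32_t m0 = idx[first];                    // S_MOV_B32 M0, VCC
    uint64_t match = 0;
    for (int l = 0; l < W; ++l)                  // V_CMP_EQ_U32: active lanes sharing this index
      if (((exec >> l) & 1) && idx[l] == m0)
        match |= 1ull << l;
    for (int l = 0; l < W; ++l)                  // the MOVREL, executed only for matching lanes
      if ((match >> l) & 1)
        dst[l] = vec[m0];
    exec &= ~match;                              // net effect of S_AND_SAVEEXEC_B64 + S_XOR_B64
  }                                              // S_CBRANCH_EXECNZ back to the read
  exec = save;                                   // S_MOV_B64 EXEC, Save
  for (int l = 0; l < W; ++l)
    printf("%g ", dst[l]);
  printf("\n");
  return 0;
}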
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index 7e59b42..1a4e4cb 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -10,11 +10,25 @@
#include "SIMachineFunctionInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
using namespace llvm;
+const char *SIMachineFunctionInfo::ShaderTypeAttribute = "ShaderType";
+
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: MachineFunctionInfo(),
- SPIPSInputAddr(0),
- ShaderType(0)
- { }
+ ShaderType(0),
+ PSInputAddr(0) {
+
+ AttributeSet Set = MF.getFunction()->getAttributes();
+ Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
+ ShaderTypeAttribute);
+
+ if (A.isStringAttribute()) {
+ StringRef Str = A.getValueAsString();
+ if (Str.getAsInteger(0, ShaderType))
+ llvm_unreachable("Can't parse shader type!");
+ }
+}
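
The shader type is now communicated through a string function attribute ("ShaderType"); note that StringRef::getAsInteger follows LLVM's convention of returning true on failure, which is why the non-numeric case lands in llvm_unreachable. A small standalone sketch of the same parse (strtoul standing in for getAsInteger with radix 0, i.e. auto-detected base; the numeric shader-type values themselves are defined elsewhere in the backend):

#include <cerrno>
#include <cstdio>
#include <cstdlib>

// Stand-in for StringRef::getAsInteger(0, Out): radix 0 auto-detects the base
// and, per the LLVM convention, the function returns true on *failure*.
static bool getAsInteger(const char *s, unsigned &out) {
  char *end = nullptr;
  errno = 0;
  unsigned long v = strtoul(s, &end, 0);
  if (end == s || *end != '\0' || errno == ERANGE)
    return true;                              // failure
  out = static_cast<unsigned>(v);
  return false;                               // success
}

int main() {
  // A frontend attaches something like "ShaderType"="1" to the function;
  // the constructor above parses it exactly like this.
  unsigned ShaderType = 0;
  if (getAsInteger("1", ShaderType))
    return 1;                                 // the pass would hit llvm_unreachable here
  printf("ShaderType = %u\n", ShaderType);
  return 0;
}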
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 47271f5..91a809b 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -23,9 +23,11 @@ namespace llvm {
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public MachineFunctionInfo {
public:
+ static const char *ShaderTypeAttribute;
+
SIMachineFunctionInfo(const MachineFunction &MF);
- unsigned SPIPSInputAddr;
unsigned ShaderType;
+ unsigned PSInputAddr;
};
} // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 9e04e24..4f14931 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -34,32 +34,6 @@ foreach Index = 0-255 in {
}
}
-// virtual Interpolation registers
-def PERSP_SAMPLE_I : SIReg <"PERSP_SAMPLE_I">;
-def PERSP_SAMPLE_J : SIReg <"PERSP_SAMPLE_J">;
-def PERSP_CENTER_I : SIReg <"PERSP_CENTER_I">;
-def PERSP_CENTER_J : SIReg <"PERSP_CENTER_J">;
-def PERSP_CENTROID_I : SIReg <"PERSP_CENTROID_I">;
-def PERSP_CENTROID_J : SIReg <"PERP_CENTROID_J">;
-def PERSP_I_W : SIReg <"PERSP_I_W">;
-def PERSP_J_W : SIReg <"PERSP_J_W">;
-def PERSP_1_W : SIReg <"PERSP_1_W">;
-def LINEAR_SAMPLE_I : SIReg <"LINEAR_SAMPLE_I">;
-def LINEAR_SAMPLE_J : SIReg <"LINEAR_SAMPLE_J">;
-def LINEAR_CENTER_I : SIReg <"LINEAR_CENTER_I">;
-def LINEAR_CENTER_J : SIReg <"LINEAR_CENTER_J">;
-def LINEAR_CENTROID_I : SIReg <"LINEAR_CENTROID_I">;
-def LINEAR_CENTROID_J : SIReg <"LINEAR_CENTROID_J">;
-def LINE_STIPPLE_TEX_COORD : SIReg <"LINE_STIPPLE_TEX_COORD">;
-def POS_X_FLOAT : SIReg <"POS_X_FLOAT">;
-def POS_Y_FLOAT : SIReg <"POS_Y_FLOAT">;
-def POS_Z_FLOAT : SIReg <"POS_Z_FLOAT">;
-def POS_W_FLOAT : SIReg <"POS_W_FLOAT">;
-def FRONT_FACE : SIReg <"FRONT_FACE">;
-def ANCILLARY : SIReg <"ANCILLARY">;
-def SAMPLE_COVERAGE : SIReg <"SAMPLE_COVERAGE">;
-def POS_FIXED_PT : SIReg <"POS_FIXED_PT">;
-
//===----------------------------------------------------------------------===//
// Groupings using register classes and tuples
//===----------------------------------------------------------------------===//
@@ -177,22 +151,22 @@ def SReg_64 : RegisterClass<"AMDGPU", [i64, i1], 64,
(add SGPR_64, VCCReg, EXECReg)
>;
-def SReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v16i8], 128, (add SGPR_128)>;
-def SReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add SGPR_256)>;
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>;
-def SReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add SGPR_512)>;
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8], 512, (add SGPR_512)>;
// Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_32 : RegisterClass<"AMDGPU", [f32, i32, v1i32], 32, (add VGPR_32)>;
+def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
-def VReg_64 : RegisterClass<"AMDGPU", [i64, v2i32], 64, (add VGPR_64)>;
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
-def VReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
-def VReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add VGPR_256)>;
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 256, (add VGPR_256)>;
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>;
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
//===----------------------------------------------------------------------===//
// [SV]Src_* register classes, which can have either an immediate or a register
@@ -200,28 +174,9 @@ def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>;
def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
-def SSrc_64 : RegisterClass<"AMDGPU", [i64, i1], 64, (add SReg_64)>;
-
-def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
- (add VReg_32, SReg_32,
- PERSP_SAMPLE_I, PERSP_SAMPLE_J,
- PERSP_CENTER_I, PERSP_CENTER_J,
- PERSP_CENTROID_I, PERSP_CENTROID_J,
- PERSP_I_W, PERSP_J_W, PERSP_1_W,
- LINEAR_SAMPLE_I, LINEAR_SAMPLE_J,
- LINEAR_CENTER_I, LINEAR_CENTER_J,
- LINEAR_CENTROID_I, LINEAR_CENTROID_J,
- LINE_STIPPLE_TEX_COORD,
- POS_X_FLOAT,
- POS_Y_FLOAT,
- POS_Z_FLOAT,
- POS_W_FLOAT,
- FRONT_FACE,
- ANCILLARY,
- SAMPLE_COVERAGE,
- POS_FIXED_PT
- )
->;
+def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>;
+
+def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
-def VSrc_64 : RegisterClass<"AMDGPU", [i64], 64, (add VReg_64, SReg_64)>;
+def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
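
Finally, the SGPR-tuple classes are retyped from integer vectors to byte vectors (v16i8 / v32i8 / v64i8) so that the resource descriptors taken by the reworked intrinsics above can be carried as opaque blobs whose bit width matches the register tuple. A size-only sketch of that correspondence (type names are illustrative, not LLVM or hardware names):

#include <array>
#include <cstdint>

// v16i8 <-> 128-bit SReg_128, v32i8 <-> 256-bit SReg_256, v64i8 <-> 512-bit SReg_512.
using Desc128 = std::array<uint8_t, 16>;   // e.g. the v16i8 operand of int_SI_load_const
using Desc256 = std::array<uint8_t, 32>;   // e.g. the v32i8 operand of the Sample intrinsics
using Desc512 = std::array<uint8_t, 64>;   // matches SReg_512 / v64i8

static_assert(sizeof(Desc128) * 8 == 128, "fits one SReg_128 tuple");
static_assert(sizeof(Desc256) * 8 == 256, "fits one SReg_256 tuple");
static_assert(sizeof(Desc512) * 8 == 512, "fits one SReg_512 tuple");

int main() { return 0; }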