author     Stephen Hines <srhines@google.com>   2014-05-29 02:49:00 -0700
committer  Stephen Hines <srhines@google.com>   2014-05-29 02:49:00 -0700
commit     dce4a407a24b04eebc6a376f8e62b41aaa7b071f (patch)
tree       dcebc53f2b182f145a2e659393bf9a0472cedf23 /lib/Target/R600
parent     220b921aed042f9e520c26cffd8282a94c66c3d5 (diff)
Update LLVM for 3.5 rebase (r209712).
Change-Id: I149556c940fb7dc92d075273c87ff584f400941f
Diffstat (limited to 'lib/Target/R600')
-rw-r--r--  lib/Target/R600/AMDGPU.h | 11
-rw-r--r--  lib/Target/R600/AMDGPU.td | 11
-rw-r--r--  lib/Target/R600/AMDGPUAsmPrinter.cpp | 25
-rw-r--r--  lib/Target/R600/AMDGPUAsmPrinter.h | 13
-rw-r--r--  lib/Target/R600/AMDGPUCallingConv.td | 2
-rw-r--r--  lib/Target/R600/AMDGPUConvertToISA.cpp | 4
-rw-r--r--  lib/Target/R600/AMDGPUFrameLowering.cpp | 2
-rw-r--r--  lib/Target/R600/AMDGPUFrameLowering.h | 13
-rw-r--r--  lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 291
-rw-r--r--  lib/Target/R600/AMDGPUISelLowering.cpp | 744
-rw-r--r--  lib/Target/R600/AMDGPUISelLowering.h | 91
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.cpp | 22
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.h | 73
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.td | 15
-rw-r--r--  lib/Target/R600/AMDGPUInstructions.td | 49
-rw-r--r--  lib/Target/R600/AMDGPUIntrinsics.td | 4
-rw-r--r--  lib/Target/R600/AMDGPUMCInstLower.cpp | 28
-rw-r--r--  lib/Target/R600/AMDGPUMCInstLower.h | 16
-rw-r--r--  lib/Target/R600/AMDGPURegisterInfo.cpp | 8
-rw-r--r--  lib/Target/R600/AMDGPURegisterInfo.h | 14
-rw-r--r--  lib/Target/R600/AMDGPUSubtarget.cpp | 13
-rw-r--r--  lib/Target/R600/AMDGPUSubtarget.h | 15
-rw-r--r--  lib/Target/R600/AMDGPUTargetMachine.cpp | 23
-rw-r--r--  lib/Target/R600/AMDGPUTargetMachine.h | 27
-rw-r--r--  lib/Target/R600/AMDGPUTargetTransformInfo.cpp | 26
-rw-r--r--  lib/Target/R600/AMDILCFGStructurizer.cpp | 58
-rw-r--r--  lib/Target/R600/AMDILISelLowering.cpp | 92
-rw-r--r--  lib/Target/R600/AMDILIntrinsicInfo.cpp | 2
-rw-r--r--  lib/Target/R600/AMDILIntrinsicInfo.h | 12
-rw-r--r--  lib/Target/R600/AMDILIntrinsics.td | 4
-rw-r--r--  lib/Target/R600/CMakeLists.txt | 1
-rw-r--r--  lib/Target/R600/CaymanInstructions.td | 7
-rw-r--r--  lib/Target/R600/EvergreenInstructions.td | 18
-rw-r--r--  lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp | 118
-rw-r--r--  lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h | 13
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp | 26
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 4
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h | 2
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 4
-rw-r--r--  lib/Target/R600/MCTargetDesc/LLVMBuild.txt | 4
-rw-r--r--  lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 10
-rw-r--r--  lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp | 10
-rw-r--r--  lib/Target/R600/Processors.td | 2
-rw-r--r--  lib/Target/R600/R600ClauseMergePass.cpp | 7
-rw-r--r--  lib/Target/R600/R600ControlFlowFinalizer.cpp | 15
-rw-r--r--  lib/Target/R600/R600EmitClauseMarkers.cpp | 6
-rw-r--r--  lib/Target/R600/R600ExpandSpecialInstrs.cpp | 6
-rw-r--r--  lib/Target/R600/R600ISelLowering.cpp | 150
-rw-r--r--  lib/Target/R600/R600ISelLowering.h | 32
-rw-r--r--  lib/Target/R600/R600InstrInfo.cpp | 8
-rw-r--r--  lib/Target/R600/R600InstrInfo.h | 76
-rw-r--r--  lib/Target/R600/R600Instructions.td | 6
-rw-r--r--  lib/Target/R600/R600MachineFunctionInfo.h | 2
-rw-r--r--  lib/Target/R600/R600MachineScheduler.cpp | 14
-rw-r--r--  lib/Target/R600/R600MachineScheduler.h | 15
-rw-r--r--  lib/Target/R600/R600OptimizeVectorRegisters.cpp | 11
-rw-r--r--  lib/Target/R600/R600Packetizer.cpp | 24
-rw-r--r--  lib/Target/R600/R600RegisterInfo.h | 15
-rw-r--r--  lib/Target/R600/R600TextureIntrinsicsReplacer.cpp | 8
-rw-r--r--  lib/Target/R600/SIAnnotateControlFlow.cpp | 28
-rw-r--r--  lib/Target/R600/SIFixSGPRCopies.cpp | 23
-rw-r--r--  lib/Target/R600/SIISelLowering.cpp | 307
-rw-r--r--  lib/Target/R600/SIISelLowering.h | 36
-rw-r--r--  lib/Target/R600/SIInsertWaits.cpp | 8
-rw-r--r--  lib/Target/R600/SIInstrFormats.td | 23
-rw-r--r--  lib/Target/R600/SIInstrInfo.cpp | 379
-rw-r--r--  lib/Target/R600/SIInstrInfo.h | 66
-rw-r--r--  lib/Target/R600/SIInstrInfo.td | 146
-rw-r--r--  lib/Target/R600/SIInstructions.td | 1109
-rw-r--r--  lib/Target/R600/SILowerControlFlow.cpp | 10
-rw-r--r--  lib/Target/R600/SILowerI1Copies.cpp | 148
-rw-r--r--  lib/Target/R600/SIMachineFunctionInfo.cpp | 57
-rw-r--r--  lib/Target/R600/SIMachineFunctionInfo.h | 9
-rw-r--r--  lib/Target/R600/SIRegisterInfo.cpp | 18
-rw-r--r--  lib/Target/R600/SIRegisterInfo.h | 20
-rw-r--r--  lib/Target/R600/SIRegisterInfo.td | 14
-rw-r--r--  lib/Target/R600/SITypeRewriter.cpp | 24
77 files changed, 3148 insertions, 1599 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 3e1848b..949fdfb 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -37,11 +37,15 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
// SI Passes
FunctionPass *createSITypeRewriter();
FunctionPass *createSIAnnotateControlFlowPass();
+FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIInsertWaits(TargetMachine &tm);
+void initializeSILowerI1CopiesPass(PassRegistry &);
+extern char &SILowerI1CopiesID;
+
// Passes common to R600 and SI
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
@@ -76,8 +80,8 @@ enum AddressSpaces {
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
CONSTANT_ADDRESS = 2, ///< Address space for constant memory
LOCAL_ADDRESS = 3, ///< Address space for local memory.
- REGION_ADDRESS = 4, ///< Address space for region memory.
- ADDRESS_NONE = 5, ///< Address space for unknown memory.
+ FLAT_ADDRESS = 4, ///< Address space for flat memory.
+ REGION_ADDRESS = 5, ///< Address space for region memory.
PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0)
PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1)
@@ -102,7 +106,8 @@ enum AddressSpaces {
CONSTANT_BUFFER_13 = 21,
CONSTANT_BUFFER_14 = 22,
CONSTANT_BUFFER_15 = 23,
- LAST_ADDRESS = 24
+ ADDRESS_NONE = 24, ///< Address space for unknown memory.
+ LAST_ADDRESS = ADDRESS_NONE
};
} // namespace AMDGPUAS
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index d1e2cf5..2edc115 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -120,6 +120,17 @@ def AMDGPU : Target {
let InstructionSet = AMDGPUInstrInfo;
}
+//===----------------------------------------------------------------------===//
+// Predicate helper class
+//===----------------------------------------------------------------------===//
+
+class PredicateControl {
+ Predicate SubtargetPredicate;
+ list<Predicate> OtherPredicates = [];
+ list<Predicate> Predicates = !listconcat([SubtargetPredicate],
+ OtherPredicates);
+}
+
// Include AMDGPU TD files
include "R600Schedule.td"
include "SISchedule.td"
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index b166c45..170f479 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -64,7 +64,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
- findNumUsedRegistersSI(MF, KernelInfo.NumSGPR, KernelInfo.NumVGPR);
+ getSIProgramInfo(KernelInfo, MF);
EmitProgramInfoSI(MF, KernelInfo);
} else {
EmitProgramInfoR600(MF);
@@ -84,8 +84,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SectionKind::getReadOnly());
OutStreamer.SwitchSection(CommentSection);
- if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
OutStreamer.emitRawComment(" Kernel info:", false);
+ OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
+ false);
OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
false);
OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
@@ -184,9 +186,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
}
}
-void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF,
- unsigned &NumSGPR,
- unsigned &NumVGPR) const {
+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
+ MachineFunction &MF) const {
+ uint64_t CodeSize = 0;
unsigned MaxSGPR = 0;
unsigned MaxVGPR = 0;
bool VCCUsed = false;
@@ -200,6 +202,9 @@ void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF,
I != E; ++I) {
MachineInstr &MI = *I;
+ // TODO: CodeSize should account for multiple functions.
+ CodeSize += MI.getDesc().Size;
+
unsigned numOperands = MI.getNumOperands();
for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
MachineOperand &MO = MI.getOperand(op_idx);
@@ -274,13 +279,9 @@ void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF,
if (VCCUsed)
MaxSGPR += 2;
- NumSGPR = MaxSGPR;
- NumVGPR = MaxVGPR;
-}
-
-void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &Out,
- MachineFunction &MF) const {
- findNumUsedRegistersSI(MF, Out.NumSGPR, Out.NumVGPR);
+ ProgInfo.CodeLen = CodeSize;
+ ProgInfo.NumSGPR = MaxSGPR;
+ ProgInfo.NumVGPR = MaxVGPR;
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index a2b8337..71adc9a 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -24,7 +24,12 @@ namespace llvm {
class AMDGPUAsmPrinter : public AsmPrinter {
private:
struct SIProgramInfo {
- SIProgramInfo() : NumSGPR(0), NumVGPR(0) {}
+ SIProgramInfo() :
+ CodeLen(0),
+ NumSGPR(0),
+ NumVGPR(0) {}
+
+ uint64_t CodeLen;
unsigned NumSGPR;
unsigned NumVGPR;
};
@@ -42,14 +47,14 @@ private:
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "AMDGPU Assembly Printer";
}
/// Implemented in AMDGPUMCInstLower.cpp
- virtual void EmitInstruction(const MachineInstr *MI);
+ void EmitInstruction(const MachineInstr *MI) override;
protected:
bool DisasmEnabled;
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index 65cdb24..5f8ad8c 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -20,7 +20,7 @@ def CC_SI : CallingConv<[
CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
- SGPR16
+ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21
]>>>,
CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp
index 50297d1..91aeee2 100644
--- a/lib/Target/R600/AMDGPUConvertToISA.cpp
+++ b/lib/Target/R600/AMDGPUConvertToISA.cpp
@@ -31,9 +31,9 @@ public:
AMDGPUConvertToISAPass(TargetMachine &tm) :
MachineFunctionPass(ID), TM(tm) { }
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const {return "AMDGPU Convert to ISA";}
+ const char *getPassName() const override {return "AMDGPU Convert to ISA";}
};
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
index 0325a00..e7e90d3 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -97,7 +97,7 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
const TargetFrameLowering::SpillSlot *
AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
NumEntries = 0;
- return 0;
+ return nullptr;
}
void
AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const {
diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h
index cf5742e..d18ede5 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.h
+++ b/lib/Target/R600/AMDGPUFrameLowering.h
@@ -33,12 +33,13 @@ public:
/// \returns The number of 32-bit sub-registers that are used when storing
/// values to the stack.
- virtual unsigned getStackWidth(const MachineFunction &MF) const;
- virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
- virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const;
- virtual void emitPrologue(MachineFunction &MF) const;
- virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
- virtual bool hasFP(const MachineFunction &MF) const;
+ unsigned getStackWidth(const MachineFunction &MF) const;
+ int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+ const SpillSlot *
+ getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
+ void emitPrologue(MachineFunction &MF) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ bool hasFP(const MachineFunction &MF) const override;
};
} // namespace llvm
#endif // AMDILFRAME_LOWERING_H
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index e8c5f5b..f1f0bfa 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -16,15 +16,11 @@
#include "AMDGPURegisterInfo.h"
#include "R600InstrInfo.h"
#include "SIISelLowering.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/Support/Compiler.h"
-#include <list>
-#include <queue>
+#include "llvm/IR/Function.h"
using namespace llvm;
@@ -43,11 +39,12 @@ public:
AMDGPUDAGToDAGISel(TargetMachine &TM);
virtual ~AMDGPUDAGToDAGISel();
- SDNode *Select(SDNode *N);
- virtual const char *getPassName() const;
- virtual void PostprocessISelDAG();
+ SDNode *Select(SDNode *N) override;
+ const char *getPassName() const override;
+ void PostprocessISelDAG() override;
private:
+ bool isInlineImmediate(SDNode *N) const;
inline SDValue getSmallIPtrImm(unsigned Imm);
bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
const R600InstrInfo *TII);
@@ -58,11 +55,9 @@ private:
bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
- SDValue SimplifyI24(SDValue &Op);
- bool SelectI24(SDValue Addr, SDValue &Op);
- bool SelectU24(SDValue Addr, SDValue &Op);
static bool checkType(const Value *ptr, unsigned int addrspace);
+ static bool checkPrivateAddress(const MachineMemOperand *Op);
static bool isGlobalStore(const StoreSDNode *N);
static bool isPrivateStore(const StoreSDNode *N);
@@ -77,10 +72,15 @@ private:
bool isLocalLoad(const LoadSDNode *N) const;
bool isRegionLoad(const LoadSDNode *N) const;
+ /// \returns True if the current basic block being selected is at control
+ /// flow depth 0. Meaning that the current block dominates the
+ // exit block.
+ bool isCFDepth0() const;
+
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
- bool SelectGlobalValueVariableOffset(SDValue Addr,
- SDValue &BaseReg, SDValue& Offset);
+ bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
+ SDValue& Offset);
bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
@@ -91,8 +91,7 @@ private:
/// \brief This pass converts a legalized DAG into a AMDGPU-specific
// DAG, ready for instruction scheduling.
-FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM
- ) {
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) {
return new AMDGPUDAGToDAGISel(TM);
}
@@ -103,32 +102,39 @@ AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
}
+bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const {
+ const SITargetLowering *TL
+ = static_cast<const SITargetLowering *>(getTargetLowering());
+ return TL->analyzeImmediate(N) == 0;
+}
+
/// \brief Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
unsigned OpNo) const {
- if (!N->isMachineOpcode()) {
- return NULL;
- }
+ if (!N->isMachineOpcode())
+ return nullptr;
+
switch (N->getMachineOpcode()) {
default: {
const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode());
unsigned OpIdx = Desc.getNumDefs() + OpNo;
if (OpIdx >= Desc.getNumOperands())
- return NULL;
+ return nullptr;
int RegClass = Desc.OpInfo[OpIdx].RegClass;
- if (RegClass == -1) {
- return NULL;
- }
+ if (RegClass == -1)
+ return nullptr;
+
return TM.getRegisterInfo()->getRegClass(RegClass);
}
case AMDGPU::REG_SEQUENCE: {
- const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(
- cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
- unsigned SubRegIdx =
- dyn_cast<ConstantSDNode>(N->getOperand(OpNo + 1))->getZExtValue();
+ unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(RCID);
+
+ SDValue SubRegOp = N->getOperand(OpNo + 1);
+ unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx);
}
}
@@ -139,7 +145,7 @@ SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
}
bool AMDGPUDAGToDAGISel::SelectADDRParam(
- SDValue Addr, SDValue& R1, SDValue& R2) {
+ SDValue Addr, SDValue& R1, SDValue& R2) {
if (Addr.getOpcode() == ISD::FrameIndex) {
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
@@ -196,15 +202,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return NULL; // Already selected.
+ return nullptr; // Already selected.
}
+
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
switch (Opc) {
default: break;
// We are selecting i64 ADD here instead of custom lower it during
// DAG legalization, so we can fold some i64 ADDs used for address
// calculation into the LOAD and STORE instructions.
case ISD::ADD: {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (N->getValueType(0) != MVT::i64 ||
ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
@@ -232,12 +239,13 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
AddLoArgs.push_back(SDValue(Lo0, 0));
AddLoArgs.push_back(SDValue(Lo1, 0));
- SDNode *AddLo = CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL,
- VTList, AddLoArgs);
+ SDNode *AddLo = CurDAG->getMachineNode(
+ isCFDepth0() ? AMDGPU::S_ADD_I32 : AMDGPU::V_ADD_I32_e32,
+ DL, VTList, AddLoArgs);
SDValue Carry = SDValue(AddLo, 1);
- SDNode *AddHi = CurDAG->getMachineNode(AMDGPU::S_ADDC_U32, DL,
- MVT::i32, SDValue(Hi0, 0),
- SDValue(Hi1, 0), Carry);
+ SDNode *AddHi = CurDAG->getMachineNode(
+ isCFDepth0() ? AMDGPU::S_ADDC_U32 : AMDGPU::V_ADDC_U32_e32,
+ DL, MVT::i32, SDValue(Hi0, 0), SDValue(Hi1, 0), Carry);
SDValue Args[5] = {
CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
@@ -246,11 +254,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
SDValue(AddHi,0),
Sub1,
};
- return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5);
+ return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
}
case ISD::BUILD_VECTOR: {
unsigned RegClassID;
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
const AMDGPURegisterInfo *TRI =
static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
const SIRegisterInfo *SIRI =
@@ -316,7 +323,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
// 16 = Max Num Vector Elements
// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
// 1 = Vector Register Class
- SDValue RegSeqArgs[16 * 2 + 1];
+ SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(N->getNumOperands() * 2 + 1);
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
bool IsRegSeq = true;
@@ -333,11 +340,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
if (!IsRegSeq)
break;
return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
- RegSeqArgs, 2 * N->getNumOperands() + 1);
+ RegSeqArgs);
}
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
break;
}
@@ -346,7 +352,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
} else if (N->getValueType(0) == MVT::i64) {
- RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
+ RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32);
SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
} else {
@@ -357,8 +363,37 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
SDLoc(N), N->getValueType(0), Ops);
}
- case AMDGPUISD::REGISTER_LOAD: {
+
+ case ISD::Constant:
+ case ISD::ConstantFP: {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+ if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+ N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
+ break;
+
+ uint64_t Imm;
+ if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
+ Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
+ else {
+ ConstantSDNode *C = cast<ConstantSDNode>(N);
+ Imm = C->getZExtValue();
+ }
+
+ SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32,
+ CurDAG->getConstant(Imm & 0xFFFFFFFF, MVT::i32));
+ SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32,
+ CurDAG->getConstant(Imm >> 32, MVT::i32));
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+ SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+ SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32)
+ };
+
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N),
+ N->getValueType(0), Ops);
+ }
+
+ case AMDGPUISD::REGISTER_LOAD: {
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
@@ -375,7 +410,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
Ops);
}
case AMDGPUISD::REGISTER_STORE: {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
@@ -391,42 +425,95 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
CurDAG->getVTList(MVT::Other),
Ops);
}
+
+ case AMDGPUISD::BFE_I32:
+ case AMDGPUISD::BFE_U32: {
+ if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ break;
+
+ // There is a scalar version available, but unlike the vector version which
+ // has a separate operand for the offset and width, the scalar version packs
+ // the width and offset into a single operand. Try to move to the scalar
+ // version if the offsets are constant, so that we can try to keep extended
+ // loads of kernel arguments in SGPRs.
+
+ // TODO: Technically we could try to pattern match scalar bitshifts of
+ // dynamic values, but it's probably not useful.
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!Offset)
+ break;
+
+ ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!Width)
+ break;
+
+ bool Signed = Opc == AMDGPUISD::BFE_I32;
+
+ // Transformation function, pack the offset and width of a BFE into
+ // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+ // source, bits [5:0] contain the offset and bits [22:16] the width.
+
+ uint32_t OffsetVal = Offset->getZExtValue();
+ uint32_t WidthVal = Width->getZExtValue();
+
+ uint32_t PackedVal = OffsetVal | WidthVal << 16;
+
+ SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32);
+ return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+ SDLoc(N),
+ MVT::i32,
+ N->getOperand(0),
+ PackedOffsetWidth);
+
+ }
}
return SelectCode(N);
}
-bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
- if (!ptr) {
+bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) {
+ assert(AS != 0 && "Use checkPrivateAddress instead.");
+ if (!Ptr)
return false;
- }
- Type *ptrType = ptr->getType();
- return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace;
+
+ return Ptr->getType()->getPointerAddressSpace() == AS;
+}
+
+bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) {
+ if (Op->getPseudoValue())
+ return true;
+
+ if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType()))
+ return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+
+ return false;
}
bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
- return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
}
bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
- return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
- && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
- && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS));
+ const Value *MemVal = N->getMemOperand()->getValue();
+ return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::REGION_ADDRESS));
}
bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
- return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
}
bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
- return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
}
bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const {
- if (CbId == -1) {
- return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS);
- }
- return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_BUFFER_0 + CbId);
+ const Value *MemVal = N->getMemOperand()->getValue();
+ if (CbId == -1)
+ return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS);
+
+ return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId);
}
bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
@@ -437,27 +524,26 @@ bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
return true;
}
}
- return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
}
bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const {
- return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS);
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS);
}
bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const {
- return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
}
bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const {
- return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
}
bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
MachineMemOperand *MMO = N->getMemOperand();
- if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
+ if (checkPrivateAddress(N->getMemOperand())) {
if (MMO) {
- const Value *V = MMO->getValue();
- const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
+ const PseudoSourceValue *PSV = MMO->getPseudoValue();
if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
return true;
}
@@ -467,24 +553,34 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
}
bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
- if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
+ if (checkPrivateAddress(N->getMemOperand())) {
// Check to make sure we are not a constant pool load or a constant load
// that is marked as a private load
if (isCPLoad(N) || isConstantLoad(N, -1)) {
return false;
}
}
- if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
- && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
- && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)
- && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)
- && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS)
- && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) {
+
+ const Value *MemVal = N->getMemOperand()->getValue();
+ if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){
return true;
}
return false;
}
+bool AMDGPUDAGToDAGISel::isCFDepth0() const {
+ // FIXME: Figure out a way to use DominatorTree analysis here.
+ const BasicBlock *CurBlock = FuncInfo->MBB->getBasicBlock();
+ const Function *Fn = FuncInfo->Fn;
+ return &Fn->front() == CurBlock || &Fn->back() == CurBlock;
+}
+
+
const char *AMDGPUDAGToDAGISel::getPassName() const {
return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
@@ -499,7 +595,7 @@ const char *AMDGPUDAGToDAGISel::getPassName() const {
//===----------------------------------------------------------------------===//
bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
- SDValue& IntPtr) {
+ SDValue& IntPtr) {
if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true);
return true;
@@ -509,7 +605,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
SDValue& BaseReg, SDValue &Offset) {
- if (!dyn_cast<ConstantSDNode>(Addr)) {
+ if (!isa<ConstantSDNode>(Addr)) {
BaseReg = Addr;
Offset = CurDAG->getIntPtrConstant(0, true);
return true;
@@ -519,7 +615,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
SDValue &Offset) {
- ConstantSDNode * IMMOffset;
+ ConstantSDNode *IMMOffset;
if (Addr.getOpcode() == ISD::ADD
&& (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
@@ -563,52 +659,9 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
return true;
}
-SDValue AMDGPUDAGToDAGISel::SimplifyI24(SDValue &Op) {
- APInt Demanded = APInt(32, 0x00FFFFFF);
- APInt KnownZero, KnownOne;
- TargetLowering::TargetLoweringOpt TLO(*CurDAG, true, true);
- const TargetLowering *TLI = getTargetLowering();
- if (TLI->SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) {
- CurDAG->ReplaceAllUsesWith(Op, TLO.New);
- CurDAG->RepositionNode(Op.getNode(), TLO.New.getNode());
- return SimplifyI24(TLO.New);
- } else {
- return Op;
- }
-}
-
-bool AMDGPUDAGToDAGISel::SelectI24(SDValue Op, SDValue &I24) {
-
- assert(Op.getValueType() == MVT::i32);
-
- if (CurDAG->ComputeNumSignBits(Op) == 9) {
- I24 = SimplifyI24(Op);
- return true;
- }
- return false;
-}
-
-bool AMDGPUDAGToDAGISel::SelectU24(SDValue Op, SDValue &U24) {
- APInt KnownZero;
- APInt KnownOne;
- CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne);
-
- assert (Op.getValueType() == MVT::i32);
-
- // ANY_EXTEND and EXTLOAD operations can only be done on types smaller than
- // i32. These smaller types are legal to use with the i24 instructions.
- if ((KnownZero & APInt(KnownZero.getBitWidth(), 0xFF000000)) == 0xFF000000 ||
- Op.getOpcode() == ISD::ANY_EXTEND ||
- ISD::isEXTLoad(Op.getNode())) {
- U24 = SimplifyI24(Op);
- return true;
- }
- return false;
-}
-
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
- (*(const AMDGPUTargetLowering*)getTargetLowering());
+ *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
bool IsModified = false;
do {
IsModified = false;
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 183725c..6c443ea 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -28,8 +28,50 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
using namespace llvm;
+
+namespace {
+
+/// Diagnostic information for unimplemented or unsupported feature reporting.
+class DiagnosticInfoUnsupported : public DiagnosticInfo {
+private:
+ const Twine &Description;
+ const Function &Fn;
+
+ static int KindID;
+
+ static int getKindID() {
+ if (KindID == 0)
+ KindID = llvm::getNextAvailablePluginDiagnosticKind();
+ return KindID;
+ }
+
+public:
+ DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
+ DiagnosticSeverity Severity = DS_Error)
+ : DiagnosticInfo(getKindID(), Severity),
+ Description(Desc),
+ Fn(Fn) { }
+
+ const Function &getFunction() const { return Fn; }
+ const Twine &getDescription() const { return Description; }
+
+ void print(DiagnosticPrinter &DP) const override {
+ DP << "unsupported " << getDescription() << " in " << Fn.getName();
+ }
+
+ static bool classof(const DiagnosticInfo *DI) {
+ return DI->getKind() == getKindID();
+ }
+};
+
+int DiagnosticInfoUnsupported::KindID = 0;
+}
+
+
static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
@@ -88,6 +130,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::f64, Promote);
AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
+ setOperationAction(ISD::STORE, MVT::v2f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);
+
// Custom lowering of vector stores is required for local address space
// stores.
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
@@ -103,6 +148,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
// handle 64-bit stores.
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
@@ -126,6 +173,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
+ setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);
+
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
@@ -152,15 +202,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+
setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::MUL, MVT::i64, Expand);
+ setOperationAction(ISD::SUB, MVT::i64, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);
@@ -168,10 +222,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
static const MVT::SimpleValueType IntTypes[] = {
MVT::v2i32, MVT::v4i32
};
- const size_t NumIntTypes = array_lengthof(IntTypes);
- for (unsigned int x = 0; x < NumIntTypes; ++x) {
- MVT::SimpleValueType VT = IntTypes[x];
+ for (MVT VT : IntTypes) {
//Expand the following operations for the current type by default
setOperationAction(ISD::ADD, VT, Expand);
setOperationAction(ISD::AND, VT, Expand);
@@ -195,12 +247,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
static const MVT::SimpleValueType FloatTypes[] = {
MVT::v2f32, MVT::v4f32
};
- const size_t NumFloatTypes = array_lengthof(FloatTypes);
- for (unsigned int x = 0; x < NumFloatTypes; ++x) {
- MVT::SimpleValueType VT = FloatTypes[x];
+ for (MVT VT : FloatTypes) {
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FADD, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
@@ -208,25 +259,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FMUL, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FSQRT, VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSUB, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
}
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SELECT_CC);
}
//===----------------------------------------------------------------------===//
@@ -325,6 +364,25 @@ SDValue AMDGPUTargetLowering::LowerReturn(
// Target specific lowering
//===---------------------------------------------------------------------===//
+SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SDValue Callee = CLI.Callee;
+ SelectionDAG &DAG = CLI.DAG;
+
+ const Function &Fn = *DAG.getMachineFunction().getFunction();
+
+ StringRef FuncName("<unknown>");
+
+ if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
+ FuncName = G->getSymbol();
+ else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ FuncName = G->getGlobal()->getName();
+
+ DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
+ DAG.getContext()->diagnose(NoCalls);
+ return SDValue();
+}
+
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
const {
switch (Op.getOpcode()) {
@@ -361,12 +419,111 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
// ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
// nothing here and let the illegal result integer be handled normally.
return;
+ case ISD::UDIV: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
+ N->getOperand(0), N->getOperand(1));
+ Results.push_back(UDIVREM);
+ break;
+ }
+ case ISD::UREM: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
+ N->getOperand(0), N->getOperand(1));
+ Results.push_back(UDIVREM.getValue(1));
+ break;
+ }
+ case ISD::UDIVREM: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+ SDValue one = DAG.getConstant(1, HalfVT);
+ SDValue zero = DAG.getConstant(0, HalfVT);
+
+ //HiLo split
+ SDValue LHS = N->getOperand(0);
+ SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
+ SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
+
+ SDValue RHS = N->getOperand(1);
+ SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
+ SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
+
+ // Get Speculative values
+ SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
+ SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
+
+ SDValue REM_Hi = zero;
+ SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
+
+ SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
+ SDValue DIV_Lo = zero;
+
+ const unsigned halfBitWidth = HalfVT.getSizeInBits();
+
+ for (unsigned i = 0; i < halfBitWidth; ++i) {
+ SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
+ // Get Value of high bit
+ SDValue HBit;
+ if (halfBitWidth == 32 && Subtarget->hasBFE()) {
+ HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
+ } else {
+ HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
+ HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
+ }
+
+ SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
+ DAG.getConstant(halfBitWidth - 1, HalfVT));
+ REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
+ REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
+
+ REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
+ REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
+
+
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+
+ SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
+ SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
+
+ DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
+
+ // Update REM
+
+ SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
+
+ REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
+ REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
+ REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
+ }
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+ SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+ Results.push_back(DIV);
+ Results.push_back(REM);
+ break;
+ }
default:
return;
}
}
+// FIXME: This implements accesses to initialized globals in the constant
+// address space by copying them to private and accessing that. It does not
+// properly handle illegal types or vectors. The private vector loads are not
+// scalarized, and the illegal scalars hit an assertion. This technique will not
+// work well with large initializers, and this should eventually be
+// removed. Initialized globals should be placed into a data section that the
+// runtime will load into a buffer before the kernel is executed. Uses of the
+// global need to be replaced with a pointer loaded from an implicit kernel
+// argument into this buffer holding the copy of the data, which will remove the
+// need for any of this.
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
const GlobalValue *GV,
const SDValue &InitPtr,
@@ -380,29 +537,60 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
TD->getPrefTypeAlignment(CI->getType()));
- } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
+ }
+
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
EVT VT = EVT::getEVT(CFP->getType());
PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
TD->getPrefTypeAlignment(CFP->getType()));
- } else if (Init->getType()->isAggregateType()) {
+ }
+
+ Type *InitTy = Init->getType();
+ if (StructType *ST = dyn_cast<StructType>(InitTy)) {
+ const StructLayout *SL = TD->getStructLayout(ST);
+
EVT PtrVT = InitPtr.getValueType();
- unsigned NumElements = Init->getType()->getArrayNumElements();
+ SmallVector<SDValue, 8> Chains;
+
+ for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
+ SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT);
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
+
+ Constant *Elt = Init->getAggregateElement(I);
+ Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
+ }
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ }
+
+ if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
+ EVT PtrVT = InitPtr.getValueType();
+
+ unsigned NumElements;
+ if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
+ NumElements = AT->getNumElements();
+ else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
+ NumElements = VT->getNumElements();
+ else
+ llvm_unreachable("Unexpected type");
+
+ unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
SmallVector<SDValue, 8> Chains;
for (unsigned i = 0; i < NumElements; ++i) {
- SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize(
- Init->getType()->getArrayElementType()), PtrVT);
+ SDValue Offset = DAG.getConstant(i * EltSize, PtrVT);
SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
- Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i),
- GV, Ptr, Chain, DAG));
+
+ Constant *Elt = Init->getAggregateElement(i);
+ Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
}
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
- Chains.size());
- } else {
- Init->dump();
- llvm_unreachable("Unhandled constant initializer");
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
+
+ Init->dump();
+ llvm_unreachable("Unhandled constant initializer");
}
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
@@ -440,7 +628,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
unsigned Size = TD->getTypeAllocSize(EltType);
unsigned Alignment = TD->getPrefTypeAlignment(EltType);
- const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
+ const GlobalVariable *Var = cast<GlobalVariable>(GV);
const Constant *Init = Var->getInitializer();
int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
SDValue InitPtr = DAG.getFrameIndex(FI,
@@ -461,7 +649,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
Ops.push_back((*I)->getOperand(i));
}
- DAG.UpdateNodeOperands(*I, &Ops[0], Ops.size());
+ DAG.UpdateNodeOperands(*I, Ops);
}
return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op),
getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
@@ -469,44 +657,28 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
}
}
-void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Args,
- unsigned Start,
- unsigned Count) const {
- EVT VT = Op.getValueType();
- for (unsigned i = Start, e = Start + Count; i != e; ++i) {
- Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
- VT.getVectorElementType(),
- Op, DAG.getConstant(i, MVT::i32)));
- }
-}
-
SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
- ExtractVectorElements(A, DAG, Args, 0,
- A.getValueType().getVectorNumElements());
- ExtractVectorElements(B, DAG, Args, 0,
- B.getValueType().getVectorNumElements());
+ DAG.ExtractVectorElements(A, Args);
+ DAG.ExtractVectorElements(B, Args);
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
- &Args[0], Args.size());
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
}
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;
- EVT VT = Op.getValueType();
unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- ExtractVectorElements(Op.getOperand(0), DAG, Args, Start,
- VT.getVectorNumElements());
+ EVT VT = Op.getValueType();
+ DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
+ VT.getVectorNumElements());
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
- &Args[0], Args.size());
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
}
SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
@@ -560,6 +732,22 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
Op.getOperand(2));
+ case AMDGPUIntrinsic::AMDGPU_umul24:
+ return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+
+ case AMDGPUIntrinsic::AMDGPU_imul24:
+ return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+
+ case AMDGPUIntrinsic::AMDGPU_umad24:
+ return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case AMDGPUIntrinsic::AMDGPU_imad24:
+ return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
case AMDGPUIntrinsic::AMDGPU_bfe_i32:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
Op.getOperand(1),
@@ -590,8 +778,7 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
///IABS(a) = SMAX(sub(0, a), a)
SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
- SelectionDAG &DAG) const {
-
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
@@ -603,7 +790,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
/// Linear Interpolation
/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
@@ -617,16 +804,16 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
}
/// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
+SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
+ SelectionDAG &DAG) const {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- SDValue True = Op.getOperand(2);
- SDValue False = Op.getOperand(3);
- SDValue CC = Op.getOperand(4);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue True = N->getOperand(2);
+ SDValue False = N->getOperand(3);
+ SDValue CC = N->getOperand(4);
if (VT != MVT::f32 ||
!((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
@@ -654,10 +841,8 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
case ISD::SETOLT:
case ISD::SETLE:
case ISD::SETLT: {
- if (LHS == True)
- return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
- else
- return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
+ unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX;
+ return DAG.getNode(Opc, DL, VT, LHS, RHS);
}
case ISD::SETGT:
case ISD::SETGE:
@@ -665,15 +850,13 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
case ISD::SETOGE:
case ISD::SETUGT:
case ISD::SETOGT: {
- if (LHS == True)
- return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
- else
- return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
+ unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN;
+ return DAG.getNode(Opc, DL, VT, LHS, RHS);
}
case ISD::SETCC_INVALID:
llvm_unreachable("Invalid setcc condcode!");
}
- return Op;
+ return SDValue();
}
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
@@ -695,8 +878,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
Load->getAlignment()));
}
- return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(),
- Loads.data(), Loads.size());
+ return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads);
}
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
@@ -713,32 +895,46 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
}
SDLoc DL(Op);
- const SDValue &Value = Store->getValue();
+ SDValue Value = Store->getValue();
EVT VT = Value.getValueType();
- const SDValue &Ptr = Store->getBasePtr();
+ EVT ElemVT = VT.getVectorElementType();
+ SDValue Ptr = Store->getBasePtr();
EVT MemEltVT = MemVT.getVectorElementType();
unsigned MemEltBits = MemEltVT.getSizeInBits();
unsigned MemNumElements = MemVT.getVectorNumElements();
- EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
- SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, PackedVT);
+ unsigned PackedSize = MemVT.getStoreSizeInBits();
+ SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32);
+
+ assert(Value.getValueType().getScalarSizeInBits() >= 32);
SDValue PackedValue;
for (unsigned i = 0; i < MemNumElements; ++i) {
- EVT ElemVT = VT.getVectorElementType();
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
DAG.getConstant(i, MVT::i32));
- Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
- Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
- SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
- Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
+ Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
+ Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg
+
+ SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32);
+ Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);
+
if (i == 0) {
PackedValue = Elt;
} else {
- PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
+ PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
}
}
+
+ if (PackedSize < 32) {
+ EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
+ return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
+ Store->getMemOperand()->getPointerInfo(),
+ PackedVT,
+ Store->isNonTemporal(), Store->isVolatile(),
+ Store->getAlignment());
+ }
+
return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
- MachinePointerInfo(Store->getMemOperand()->getValue()),
+ Store->getMemOperand()->getPointerInfo(),
Store->isVolatile(), Store->isNonTemporal(),
Store->getAlignment());
}
@@ -766,7 +962,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
Store->getAlignment()));
}
- return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
+ return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
}
SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
@@ -788,9 +984,24 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
}
+ if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
+ assert(VT == MVT::i1 && "Only i1 non-extloads expected");
+ // FIXME: Copied from PPC
+ // First, load into 32 bits, then truncate to 1 bit.
+
+ SDValue Chain = Load->getChain();
+ SDValue BasePtr = Load->getBasePtr();
+ MachineMemOperand *MMO = Load->getMemOperand();
+
+ SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+ BasePtr, MVT::i8, MMO);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
+ }
+
// Lower loads constant address space global variable loads
if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
- isa<GlobalVariable>(GetUnderlyingObject(Load->getPointerInfo().V))) {
+ isa<GlobalVariable>(
+ GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
@@ -887,15 +1098,13 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
}
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Num = Op.getOperand(0);
SDValue Den = Op.getOperand(1);
- SmallVector<SDValue, 8> Results;
-
// RCP = URECIP(Den) = 2^32 / Den + e
// e is rounding error.
SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
@@ -985,10 +1194,11 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
// Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
Remainder_A_Den, Rem, ISD::SETEQ);
- SDValue Ops[2];
- Ops[0] = Div;
- Ops[1] = Rem;
- return DAG.getMergeValues(Ops, 2, DL);
+ SDValue Ops[2] = {
+ Div,
+ Rem
+ };
+ return DAG.getMergeValues(Ops, DL);
}
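
The URECIP comment above is the usual reciprocal-based unsigned division: RCP approximates 2^32 / Den, the quotient estimate is the high half of Num * RCP, and the leftover rounding error e is removed by small fix-ups (the select chains in this function). A generic sketch of the idea in plain C++, illustrating the approach rather than the exact correction sequence emitted here:

#include <cassert>
#include <cstdint>

static void refUDivRem32(uint32_t Num, uint32_t Den, uint32_t &Div,
                         uint32_t &Rem) {
  assert(Den != 0 && "division by zero is not handled");
  uint64_t RCP = (uint64_t(1) << 32) / Den;               // RCP ~ 2^32 / Den
  uint32_t Quot = uint32_t((uint64_t(Num) * RCP) >> 32);  // quotient estimate
  uint32_t R = Num - Quot * Den;                          // Quot <= Num / Den, so no wrap
  while (R >= Den) {                                      // remove the rounding error e
    ++Quot;
    R -= Den;
  }
  Div = Quot;
  Rem = R;
}
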
SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
@@ -1029,81 +1239,197 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
- unsigned SrcBits = ExtraVT.getScalarType().getSizeInBits();
- unsigned DestBits = ScalarVT.getSizeInBits();
- unsigned BitsDiff = DestBits - SrcBits;
-
- if (!Subtarget->hasBFE())
- return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+ if (!VT.isVector())
+ return SDValue();
SDValue Src = Op.getOperand(0);
- if (VT.isVector()) {
- SDLoc DL(Op);
- // Need to scalarize this, and revisit each of the scalars later.
- // TODO: Don't scalarize on Evergreen?
- unsigned NElts = VT.getVectorNumElements();
- SmallVector<SDValue, 8> Args;
- ExtractVectorElements(Src, DAG, Args, 0, NElts);
+ SDLoc DL(Op);
- SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
- for (unsigned I = 0; I < NElts; ++I)
- Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
+ // TODO: Don't scalarize on Evergreen?
+ unsigned NElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> Args;
+ DAG.ExtractVectorElements(Src, Args, 0, NElts);
- return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size());
- }
+ SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
+ for (unsigned I = 0; I < NElts; ++I)
+ Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
- if (SrcBits == 32) {
- SDLoc DL(Op);
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
+}
- // If the source is 32-bits, this is really half of a 2-register pair, and
- // we need to discard the unused half of the pair.
- SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
- return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, TruncSrc);
- }
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
- unsigned NElts = VT.isVector() ? VT.getVectorNumElements() : 1;
+static bool isU24(SDValue Op, SelectionDAG &DAG) {
+ APInt KnownZero, KnownOne;
+ EVT VT = Op.getValueType();
+ DAG.computeKnownBits(Op, KnownZero, KnownOne);
- // TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
- // might not be worth the effort, and will need to expand to shifts when
- // fixing SGPR copies.
- if (SrcBits < 32 && DestBits <= 32) {
- SDLoc DL(Op);
- MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
-
- if (DestBits != 32)
- Src = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Src);
-
- // FIXME: This should use TargetConstant, but that hits assertions for
- // Evergreen.
- SDValue Ext = DAG.getNode(AMDGPUISD::BFE_I32, DL, ExtVT,
- Op.getOperand(0), // Operand
- DAG.getConstant(0, ExtVT), // Offset
- DAG.getConstant(SrcBits, ExtVT)); // Width
-
- // Truncate to the original type if necessary.
- if (ScalarVT == MVT::i32)
- return Ext;
- return DAG.getNode(ISD::TRUNCATE, DL, VT, Ext);
- }
+ return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
+}
- // For small types, extend to 32-bits first.
- if (SrcBits < 32) {
- SDLoc DL(Op);
- MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
+static bool isI24(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
- SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, Src);
- SDValue Ext32 = DAG.getNode(AMDGPUISD::BFE_I32,
- DL,
- ExtVT,
- TruncSrc, // Operand
- DAG.getConstant(0, ExtVT), // Offset
- DAG.getConstant(SrcBits, ExtVT)); // Width
+ // In order for this to be a signed 24-bit value, bit 23 must
+ // be a sign bit.
+ return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
+ // as unsigned 24-bit values.
+ (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
+}
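
isU24 and isI24 ask the DAG whether a 32-bit operand really fits in 24 bits: unsigned means the top 8 bits are known zero, signed means bits 31..23 are all copies of the sign bit (at least 9 known sign bits). The same criterion on concrete values, as a plain C++ illustration rather than SDValue queries:

#include <cstdint>

static bool fitsU24(uint32_t V) { return (V >> 24) == 0; }
static bool fitsI24(int32_t V) {
  return V >= -(1 << 23) && V < (1 << 23); // bit 23 acts as the sign bit
}
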
+
+static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
+
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = Op.getValueType();
+
+ APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
+ if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+}
- return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Ext32);
+template <typename IntTy>
+static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
+ uint32_t Offset, uint32_t Width) {
+ if (Width + Offset < 32) {
+ IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width);
+ return DAG.getConstant(Result, MVT::i32);
}
- // For everything else, use the standard bitshift expansion.
- return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+ return DAG.getConstant(Src0 >> Offset, MVT::i32);
+}
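
constantFoldBFE pulls Width bits starting at Offset out of a constant by shifting the field up to bit 31 and shifting it back down, so the right shift supplies the sign or zero extension for free. The same trick on a concrete int32_t (assumes two's complement and an arithmetic right shift, exactly like the template above):

#include <cstdint>

static int32_t bfeI32(int32_t Src, uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32)
    return (Src << (32 - Offset - Width)) >> (32 - Width);
  return Src >> Offset;
}
// e.g. bfeI32(0x00000F80, 7, 5) == -1: the 5-bit field is 0b11111, sign-extended.
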
+
+SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ switch(N->getOpcode()) {
+ default: break;
+ case ISD::MUL: {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Mul;
+
+ // FIXME: Add support for 24-bit multiply with 64-bit output on SI.
+ if (VT.isVector() || VT.getSizeInBits() > 32)
+ break;
+
+ if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
+ N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+ Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
+ } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
+ N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+ Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
+ } else {
+ break;
+ }
+
+ // We need to use sext even for MUL_U24, because MUL_U24 is used
+ // for signed multiply of 8 and 16-bit types.
+ SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT);
+
+ return Reg;
+ }
+ case AMDGPUISD::MUL_I24:
+ case AMDGPUISD::MUL_U24: {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ simplifyI24(N0, DCI);
+ simplifyI24(N1, DCI);
+ return SDValue();
+ }
+ case ISD::SELECT_CC: {
+ return CombineMinMax(N, DAG);
+ }
+ case AMDGPUISD::BFE_I32:
+ case AMDGPUISD::BFE_U32: {
+ assert(!N->getValueType(0).isVector() &&
+ "Vector handling of BFE not implemented");
+ ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!Width)
+ break;
+
+ uint32_t WidthVal = Width->getZExtValue() & 0x1f;
+ if (WidthVal == 0)
+ return DAG.getConstant(0, MVT::i32);
+
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!Offset)
+ break;
+
+ SDValue BitsFrom = N->getOperand(0);
+ uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
+
+ bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
+
+ if (OffsetVal == 0) {
+ // This is already sign / zero extended, so try to fold away extra BFEs.
+ unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
+
+ unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
+ if (OpSignBits >= SignBits)
+ return BitsFrom;
+
+ EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
+ if (Signed) {
+ // This is a sign_extend_inreg. Replace it to take advantage of existing
+ // DAG Combines. If not eliminated, we will match back to BFE during
+ // selection.
+
+ // TODO: The sext_inreg of extended types ends, although we could
+ // handle them in a single BFE.
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
+ DAG.getValueType(SmallVT));
+ }
+
+ return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
+ }
+
+ if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ if (Signed) {
+ return constantFoldBFE<int32_t>(DAG,
+ Val->getSExtValue(),
+ OffsetVal,
+ WidthVal);
+ }
+
+ return constantFoldBFE<uint32_t>(DAG,
+ Val->getZExtValue(),
+ OffsetVal,
+ WidthVal);
+ }
+
+ APInt Demanded = APInt::getBitsSet(32,
+ OffsetVal,
+ OffsetVal + WidthVal);
+
+ if ((OffsetVal + WidthVal) >= 32) {
+ SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
+ return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
+ BitsFrom, ShiftVal);
+ }
+
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
+ TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+
+ break;
+ }
+ }
+ return SDValue();
}
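
Two of the BFE simplifications above, spelled out for concrete constant operands: once Offset + Width reaches 32 the extract degenerates to a single shift, and with Offset == 0 it is just a sign or zero extension of the low Width bits. A plain C++ illustration of those two cases (hypothetical helpers, not part of the patch):

#include <cstdint>

static uint32_t bfeU32_8_24(uint32_t X) { return X >> 8; }    // Offset + Width == 32: plain lshr
static int32_t bfeI32_0_16(int32_t X) { return (int16_t)X; }  // Offset == 0: sext of the low 16 bits
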
//===----------------------------------------------------------------------===//
@@ -1181,7 +1507,7 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
- default: return 0;
+ default: return nullptr;
// AMDIL DAG nodes
NODE_NAME_CASE(CALL);
NODE_NAME_CASE(UMUL);
@@ -1202,6 +1528,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BFE_I32)
NODE_NAME_CASE(BFI)
NODE_NAME_CASE(BFM)
+ NODE_NAME_CASE(MUL_U24)
+ NODE_NAME_CASE(MUL_I24)
+ NODE_NAME_CASE(MAD_U24)
+ NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(EXPORT)
@@ -1219,22 +1549,22 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
-static void computeMaskedBitsForMinMax(const SDValue Op0,
- const SDValue Op1,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth) {
+static void computeKnownBitsForMinMax(const SDValue Op0,
+ const SDValue Op1,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) {
APInt Op0Zero, Op0One;
APInt Op1Zero, Op1One;
- DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth);
- DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth);
+ DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
+ DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
KnownZero = Op0Zero & Op1Zero;
KnownOne = Op0One & Op1One;
}
-void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
+void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
@@ -1242,8 +1572,14 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
unsigned Depth) const {
KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+
+ APInt KnownZero2;
+ APInt KnownOne2;
unsigned Opc = Op.getOpcode();
+
switch (Opc) {
+ default:
+ break;
case ISD::INTRINSIC_WO_CHAIN: {
// FIXME: The intrinsic should just use the node.
switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
@@ -1251,8 +1587,8 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
case AMDGPUIntrinsic::AMDGPU_umax:
case AMDGPUIntrinsic::AMDGPU_imin:
case AMDGPUIntrinsic::AMDGPU_umin:
- computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
- KnownZero, KnownOne, DAG, Depth);
+ computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
+ KnownZero, KnownOne, DAG, Depth);
break;
default:
break;
@@ -1264,10 +1600,62 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
case AMDGPUISD::UMAX:
case AMDGPUISD::SMIN:
case AMDGPUISD::UMIN:
- computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
- KnownZero, KnownOne, DAG, Depth);
+ computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
+ KnownZero, KnownOne, DAG, Depth);
break;
- default:
+
+ case AMDGPUISD::BFE_I32:
+ case AMDGPUISD::BFE_U32: {
+ ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ if (!CWidth)
+ return;
+
+ unsigned BitWidth = 32;
+ uint32_t Width = CWidth->getZExtValue() & 0x1f;
+ if (Width == 0) {
+ KnownZero = APInt::getAllOnesValue(BitWidth);
+ KnownOne = APInt::getNullValue(BitWidth);
+ return;
+ }
+
+ // FIXME: This could do a lot more. If offset is 0, should be the same as
+ // sign_extend_inreg implementation, but that involves duplicating it.
+ if (Opc == AMDGPUISD::BFE_I32)
+ KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
+ else
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
+
break;
}
+ }
+}
+
+unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
+ SDValue Op,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ switch (Op.getOpcode()) {
+ case AMDGPUISD::BFE_I32: {
+ ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ if (!Width)
+ return 1;
+
+ unsigned SignBits = 32 - Width->getZExtValue() + 1;
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!Offset || !Offset->isNullValue())
+ return SignBits;
+
+ // TODO: Could probably figure something out with non-0 offsets.
+ unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ return std::max(SignBits, Op0SignBits);
+ }
+
+ case AMDGPUISD::BFE_U32: {
+ ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
+ }
+
+ default:
+ return 1;
+ }
}
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index a019616..d5d821d 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -29,9 +29,6 @@ protected:
const AMDGPUSubtarget *Subtarget;
private:
- void ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Args,
- unsigned Start, unsigned Count) const;
SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV,
const SDValue &InitPtr,
SDValue Chain,
@@ -44,7 +41,7 @@ private:
/// of the same bitwidth.
SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
/// \brief Split a vector store into multiple scalar stores.
- /// \returns The resulting chain.
+ /// \returns The resulting chain.
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
@@ -83,62 +80,67 @@ protected:
public:
AMDGPUTargetLowering(TargetMachine &TM);
- virtual bool isFAbsFree(EVT VT) const override;
- virtual bool isFNegFree(EVT VT) const override;
- virtual bool isTruncateFree(EVT Src, EVT Dest) const override;
- virtual bool isTruncateFree(Type *Src, Type *Dest) const override;
-
- virtual bool isZExtFree(Type *Src, Type *Dest) const override;
- virtual bool isZExtFree(EVT Src, EVT Dest) const override;
-
- virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
-
- virtual MVT getVectorIdxTy() const override;
- virtual bool isLoadBitCastBeneficial(EVT, EVT) const override;
- virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const;
- virtual SDValue LowerCall(CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const {
- CLI.Callee.dump();
- llvm_unreachable("Undefined function");
- }
+ bool isFAbsFree(EVT VT) const override;
+ bool isFNegFree(EVT VT) const override;
+ bool isTruncateFree(EVT Src, EVT Dest) const override;
+ bool isTruncateFree(Type *Src, Type *Dest) const override;
+
+ bool isZExtFree(Type *Src, Type *Dest) const override;
+ bool isZExtFree(EVT Src, EVT Dest) const override;
+
+ bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
- virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
- virtual void ReplaceNodeResults(SDNode * N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) const override;
+ MVT getVectorIdxTy() const override;
+ bool isLoadBitCastBeneficial(EVT, EVT) const override;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc DL, SelectionDAG &DAG) const override;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ void ReplaceNodeResults(SDNode * N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
- virtual const char* getTargetNodeName(unsigned Opcode) const;
+ SDValue CombineMinMax(SDNode *N, SelectionDAG &DAG) const;
+ const char* getTargetNodeName(unsigned Opcode) const override;
- virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const {
+ virtual SDNode *PostISelFolding(MachineSDNode *N,
+ SelectionDAG &DAG) const {
return N;
}
/// \brief Determine which of the bits specified in \p Mask are known to be
/// either zero or one and return them in the \p KnownZero and \p KnownOne
/// bitsets.
- virtual void computeMaskedBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth = 0) const override;
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ virtual unsigned ComputeNumSignBitsForTargetNode(
+ SDValue Op,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
// Functions defined in AMDILISelLowering.cpp
public:
- virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
- const CallInst &I, unsigned Intrinsic) const;
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I, unsigned Intrinsic) const override;
/// We want to mark f32/f64 floating point values as legal.
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
/// We don't want to shrink f64/f32 constants.
- bool ShouldShrinkFPConstant(EVT VT) const;
+ bool ShouldShrinkFPConstant(EVT VT) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
private:
void InitAMDILLowering();
@@ -158,7 +160,6 @@ private:
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
};
namespace AMDGPUISD {
@@ -188,6 +189,10 @@ enum {
BFE_I32, // Extract range of bits with sign extension to 32-bits.
BFI, // (src0 & src1) | (~src0 & src2)
BFM, // Insert a range of bits into a 32-bit word.
+ MUL_U24,
+ MUL_I24,
+ MAD_U24,
+ MAD_I24,
TEXTURE_FETCH,
EXPORT,
CONST_ADDRESS,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index e32dd9f..1c3361a 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -20,14 +20,13 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
#define GET_INSTRINFO_CTOR_DTOR
#define GET_INSTRINFO_NAMED_OPS
#define GET_INSTRMAP_INFO
#include "AMDGPUGenInstrInfo.inc"
-using namespace llvm;
-
-
// Pin the vtable to this file.
void AMDGPUInstrInfo::anchor() {}
@@ -85,7 +84,7 @@ AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI,
LiveVariables *LV) const {
// TODO: Implement this function
- return NULL;
+ return nullptr;
}
bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
MachineBasicBlock &MBB) const {
@@ -176,7 +175,7 @@ AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
const SmallVectorImpl<unsigned> &Ops,
int FrameIndex) const {
// TODO: Implement this function
- return 0;
+ return nullptr;
}
MachineInstr*
AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
@@ -184,7 +183,7 @@ AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
const SmallVectorImpl<unsigned> &Ops,
MachineInstr *LoadMI) const {
// TODO: Implement this function
- return 0;
+ return nullptr;
}
bool
AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
@@ -356,3 +355,14 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
}
}
+
+// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
+// header files, so we need to wrap it in a function that takes unsigned
+// instead.
+namespace llvm {
+namespace AMDGPU {
+int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+ return getMCOpcode(Opcode);
+}
+}
+}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index 426910c..74baf6b 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -52,14 +52,15 @@ public:
virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &DstReg, unsigned &SubIdx) const;
+ unsigned &DstReg, unsigned &SubIdx) const override;
- unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+ unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const;
+ int &FrameIndex) const override;
bool hasLoadFromStackSlot(const MachineInstr *MI,
const MachineMemOperand *&MMO,
- int &FrameIndex) const;
+ int &FrameIndex) const override;
unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI,
int &FrameIndex) const;
@@ -70,7 +71,7 @@ public:
MachineInstr *
convertToThreeAddress(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const;
+ LiveVariables *LV) const override;
virtual void copyPhysReg(MachineBasicBlock &MBB,
@@ -78,61 +79,62 @@ public:
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const = 0;
- virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+ bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
- virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
- virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
protected:
MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
- int FrameIndex) const;
+ int FrameIndex) const override;
MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
- MachineInstr *LoadMI) const;
+ MachineInstr *LoadMI) const override;
/// \returns the smallest register index that will be accessed by an indirect
/// read or write or -1 if indirect addressing is not used by this program.
- virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
+ int getIndirectIndexBegin(const MachineFunction &MF) const;
/// \returns the largest register index that will be accessed by an indirect
/// read or write or -1 if indirect addressing is not used by this program.
- virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
+ int getIndirectIndexEnd(const MachineFunction &MF) const;
public:
bool canFoldMemoryOperand(const MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops) const;
+ const SmallVectorImpl<unsigned> &Ops) const override;
bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
- unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
- SmallVectorImpl<MachineInstr *> &NewMIs) const;
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr *> &NewMIs) const override;
bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
- SmallVectorImpl<SDNode *> &NewNodes) const;
+ SmallVectorImpl<SDNode *> &NewNodes) const override;
unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
- bool UnfoldLoad, bool UnfoldStore,
- unsigned *LoadRegIndex = 0) const;
+ bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex = nullptr) const override;
bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
int64_t Offset1, int64_t Offset2,
- unsigned NumLoads) const;
+ unsigned NumLoads) const override;
- bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+ bool
+ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
void insertNoop(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const;
- bool isPredicated(const MachineInstr *MI) const;
+ MachineBasicBlock::iterator MI) const override;
+ bool isPredicated(const MachineInstr *MI) const override;
bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
- const SmallVectorImpl<MachineOperand> &Pred2) const;
+ const SmallVectorImpl<MachineOperand> &Pred2) const override;
bool DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const;
- bool isPredicable(MachineInstr *MI) const;
- bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+ std::vector<MachineOperand> &Pred) const override;
+ bool isPredicable(MachineInstr *MI) const override;
+ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
// Helper functions that check the opcode for status information
bool isLoadInst(llvm::MachineInstr *MI) const;
@@ -186,8 +188,7 @@ public:
/// \brief Convert the AMDIL MachineInstr to a supported ISA
/// MachineInstr
- virtual void convertToISA(MachineInstr & MI, MachineFunction &MF,
- DebugLoc DL) const;
+ void convertToISA(MachineInstr & MI, MachineFunction &MF, DebugLoc DL) const;
/// \brief Build a MOV instruction.
virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index 69d8059..f96dbb4 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -92,3 +92,18 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
+// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored when
+// performing the multiply. The result is a 32-bit value.
+def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
+ [SDNPCommutative]
+>;
+def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
+ [SDNPCommutative]
+>;
+
+def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp,
+ []
+>;
+def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
+ []
+>;
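
A hypothetical reference model of what the unsigned forms of these nodes compute, following the comment above: the top 8 bits of each multiplicand are ignored and the result is the low 32 bits of the product, plus the addend for the mad form. Plain C++, for illustration only:

#include <cstdint>

static uint32_t refMulU24(uint32_t A, uint32_t B) {
  return (A & 0xffffffu) * (B & 0xffffffu); // low 32 bits of the 24 x 24 product
}
static uint32_t refMadU24(uint32_t A, uint32_t B, uint32_t C) {
  return refMulU24(A, B) + C;
}
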
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 505fc81..80bdf5b 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -37,6 +37,18 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
+def u32imm : Operand<i32> {
+ let PrintMethod = "printU32ImmOperand";
+}
+
+def u16imm : Operand<i16> {
+ let PrintMethod = "printU16ImmOperand";
+}
+
+def u8imm : Operand<i8> {
+ let PrintMethod = "printU8ImmOperand";
+}
+
//===----------------------------------------------------------------------===//
// PatLeafs for floating-point comparisons
//===----------------------------------------------------------------------===//
@@ -253,9 +265,6 @@ def FP_ONE : PatLeaf <
[{return N->isExactlyValue(1.0);}]
>;
-def U24 : ComplexPattern<i32, 1, "SelectU24", [], []>;
-def I24 : ComplexPattern<i32, 1, "SelectI24", [], []>;
-
let isCodeGenOnly = 1, isPseudo = 1 in {
let usesCustomInserter = 1 in {
@@ -414,6 +423,40 @@ class UMUL24Pattern <Instruction UMUL24> : Pat <
>;
*/
+class IMad24Pat<Instruction Inst> : Pat <
+ (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2),
+ (Inst $src0, $src1, $src2)
+>;
+
+class UMad24Pat<Instruction Inst> : Pat <
+ (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2),
+ (Inst $src0, $src1, $src2)
+>;
+
+multiclass Expand24IBitOps<Instruction MulInst, Instruction AddInst> {
+ def _expand_imad24 : Pat <
+ (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2),
+ (AddInst (MulInst $src0, $src1), $src2)
+ >;
+
+ def _expand_imul24 : Pat <
+ (AMDGPUmul_i24 i32:$src0, i32:$src1),
+ (MulInst $src0, $src1)
+ >;
+}
+
+multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> {
+ def _expand_umad24 : Pat <
+ (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2),
+ (AddInst (MulInst $src0, $src1), $src2)
+ >;
+
+ def _expand_umul24 : Pat <
+ (AMDGPUmul_u24 i32:$src0, i32:$src1),
+ (MulInst $src0, $src1)
+ >;
+}
+
include "R600Instructions.td"
include "R700Instructions.td"
include "EvergreenInstructions.td"
diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
index c6521d0..9ad5e72 100644
--- a/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/lib/Target/R600/AMDGPUIntrinsics.td
@@ -49,6 +49,10 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index 2c9909f..b759495 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUAsmPrinter.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "R600InstrInfo.h"
+#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Constants.h"
@@ -31,16 +32,30 @@
using namespace llvm;
-AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx):
- Ctx(ctx)
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st):
+ Ctx(ctx), ST(st)
{ }
+enum AMDGPUMCInstLower::SISubtarget
+AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned) const {
+ return AMDGPUMCInstLower::SI;
+}
+
+unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const {
+
+ int MCOpcode = AMDGPU::getMCOpcode(MIOpcode,
+ AMDGPUSubtargetToSISubtarget(ST.getGeneration()));
+ if (MCOpcode == -1)
+ MCOpcode = MIOpcode;
+
+ return MCOpcode;
+}
+
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
- OutMI.setOpcode(MI->getOpcode());
- for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ OutMI.setOpcode(getMCOpcode(MI->getOpcode()));
+ for (const MachineOperand &MO : MI->explicit_operands()) {
MCOperand MCOp;
switch (MO.getType()) {
default:
@@ -67,7 +82,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
}
void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- AMDGPUMCInstLower MCInstLowering(OutContext);
+ AMDGPUMCInstLower MCInstLowering(OutContext,
+ MF->getTarget().getSubtarget<AMDGPUSubtarget>());
#ifdef _DEBUG
StringRef Err;
diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h
index d7d538e..2b7f1e3 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.h
+++ b/lib/Target/R600/AMDGPUMCInstLower.h
@@ -13,16 +13,30 @@
namespace llvm {
+class AMDGPUSubtarget;
class MCInst;
class MCContext;
class MachineInstr;
class AMDGPUMCInstLower {
+ // This must be kept in sync with the SISubtarget class in SIInstrInfo.td
+ enum SISubtarget {
+ SI = 0
+ };
+
MCContext &Ctx;
+ const AMDGPUSubtarget &ST;
+
+ /// Convert a member of the AMDGPUSubtarget::Generation enum to the
+ /// SISubtarget enum.
+ enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) const;
+
+ /// Get the MC opcode for this MachineInstr.
+ unsigned getMCOpcode(unsigned MIOpcode) const;
public:
- AMDGPUMCInstLower(MCContext &ctx);
+ AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST);
/// \brief Lower a MachineInstr to an MCInst
void lower(const MachineInstr *MI, MCInst &OutMI) const;
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 8fbec4e..19927fa 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -27,10 +27,10 @@ AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm)
// they are not supported at this time.
//===----------------------------------------------------------------------===//
-const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister;
+const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister;
-const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
- const {
+const MCPhysReg*
+AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return &CalleeSavedReg;
}
@@ -54,7 +54,7 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
AMDGPU::sub15
};
- assert (Channel < array_lengthof(SubRegs));
+ assert(Channel < array_lengthof(SubRegs));
return SubRegs[Channel];
}
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index 688e1a0..a7cba0d 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -30,11 +30,11 @@ class TargetInstrInfo;
struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
TargetMachine &TM;
- static const uint16_t CalleeSavedReg;
+ static const MCPhysReg CalleeSavedReg;
AMDGPURegisterInfo(TargetMachine &tm);
- virtual BitVector getReservedRegs(const MachineFunction &MF) const {
+ BitVector getReservedRegs(const MachineFunction &MF) const override {
assert(!"Unimplemented"); return BitVector();
}
@@ -43,11 +43,11 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
/// \returns The ISA reg class that is equivalent to \p RC.
virtual const TargetRegisterClass * getISARegClass(
const TargetRegisterClass * RC) const {
- assert(!"Unimplemented"); return NULL;
+ assert(!"Unimplemented"); return nullptr;
}
virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const {
- assert(!"Unimplemented"); return NULL;
+ assert(!"Unimplemented"); return nullptr;
}
virtual unsigned getHWRegIndex(unsigned Reg) const {
@@ -58,11 +58,11 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
unsigned getSubRegFromChannel(unsigned Channel) const;
- const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
+ const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
- RegScavenger *RS) const;
- unsigned getFrameRegister(const MachineFunction &MF) const;
+ RegScavenger *RS) const override;
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
unsigned getIndirectSubReg(unsigned IndirectIndex) const;
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index e77ab5e..f3b9932 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -16,6 +16,8 @@
using namespace llvm;
+#define DEBUG_TYPE "amdgpu-subtarget"
+
#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
@@ -28,9 +30,6 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
// Default card
StringRef GPU = CPU;
Is64bit = false;
- DefaultSize[0] = 64;
- DefaultSize[1] = 1;
- DefaultSize[2] = 1;
HasVertexCache = false;
TexVTXClauseSize = 0;
Gen = AMDGPUSubtarget::R600;
@@ -106,14 +105,6 @@ bool
AMDGPUSubtarget::isTargetELF() const {
return false;
}
-size_t
-AMDGPUSubtarget::getDefaultSize(uint32_t dim) const {
- if (dim > 2) {
- return 1;
- } else {
- return DefaultSize[dim];
- }
-}
std::string
AMDGPUSubtarget::getDeviceName() const {
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 8874d14..1b041d6 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -38,7 +38,6 @@ public:
};
private:
- size_t DefaultSize[3];
std::string DevName;
bool Is64bit;
bool Is32on64bit;
@@ -60,7 +59,7 @@ public:
AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
- virtual void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
bool is64bit() const;
bool hasVertexCache() const;
@@ -77,20 +76,28 @@ public:
return hasBFE();
}
+ bool hasMulU24() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasMulI24() const {
+ return (getGeneration() >= SOUTHERN_ISLANDS ||
+ hasCaymanISA());
+ }
+
bool IsIRStructurizerEnabled() const;
bool isIfCvtEnabled() const;
unsigned getWavefrontSize() const;
unsigned getStackEntrySize() const;
bool hasCFAluBug() const;
- virtual bool enableMachineScheduler() const {
+ bool enableMachineScheduler() const override {
return getGeneration() <= NORTHERN_ISLANDS;
}
// Helper functions to simplify if statements
bool isTargetELF() const;
std::string getDeviceName() const;
- virtual size_t getDefaultSize(uint32_t dim) const;
bool dumpCode() const { return DumpCode; }
bool r600ALUEncoding() const { return R600ALUInst; }
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index b11fce3..174fdca 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -42,7 +42,7 @@ extern "C" void LLVMInitializeR600Target() {
}
static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMILive(C, new R600SchedStrategy());
+ return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
}
static MachineSchedRegistry
@@ -54,7 +54,7 @@ static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
if (ST.is64bit()) {
// 32-bit private, local, and region pointers. 64-bit global and constant.
- Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64";
+ Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
}
Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
@@ -103,20 +103,20 @@ public:
return getTM<AMDGPUTargetMachine>();
}
- virtual ScheduleDAGInstrs *
- createMachineScheduler(MachineSchedContext *C) const {
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
return createR600MachineScheduler(C);
- return 0;
+ return nullptr;
}
- virtual bool addPreISel();
- virtual bool addInstSelector();
- virtual bool addPreRegAlloc();
- virtual bool addPostRegAlloc();
- virtual bool addPreSched2();
- virtual bool addPreEmitPass();
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ bool addPreRegAlloc() override;
+ bool addPostRegAlloc() override;
+ bool addPreSched2() override;
+ bool addPreEmitPass() override;
};
} // End of anonymous namespace
@@ -154,6 +154,7 @@ AMDGPUPassConfig::addPreISel() {
bool AMDGPUPassConfig::addInstSelector() {
addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
+ addPass(createSILowerI1CopiesPass());
return false;
}
diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h
index f942614..1287e13 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.h
+++ b/lib/Target/R600/AMDGPUTargetMachine.h
@@ -20,7 +20,6 @@
#include "AMDGPUSubtarget.h"
#include "AMDILIntrinsicInfo.h"
#include "R600ISelLowering.h"
-#include "llvm/ADT/OwningPtr.h"
#include "llvm/IR/DataLayout.h"
namespace llvm {
@@ -31,8 +30,8 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
const DataLayout Layout;
AMDGPUFrameLowering FrameLowering;
AMDGPUIntrinsicInfo IntrinsicInfo;
- OwningPtr<AMDGPUInstrInfo> InstrInfo;
- OwningPtr<AMDGPUTargetLowering> TLInfo;
+ std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
+ std::unique_ptr<AMDGPUTargetLowering> TLInfo;
const InstrItineraryData *InstrItins;
public:
@@ -40,30 +39,32 @@ public:
StringRef CPU, TargetOptions Options, Reloc::Model RM,
CodeModel::Model CM, CodeGenOpt::Level OL);
~AMDGPUTargetMachine();
- virtual const AMDGPUFrameLowering *getFrameLowering() const {
+ const AMDGPUFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
- virtual const AMDGPUIntrinsicInfo *getIntrinsicInfo() const {
+ const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
return &IntrinsicInfo;
}
- virtual const AMDGPUInstrInfo *getInstrInfo() const {
+ const AMDGPUInstrInfo *getInstrInfo() const override {
return InstrInfo.get();
}
- virtual const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; }
- virtual const AMDGPURegisterInfo *getRegisterInfo() const {
+ const AMDGPUSubtarget *getSubtargetImpl() const override {
+ return &Subtarget;
+ }
+ const AMDGPURegisterInfo *getRegisterInfo() const override {
return &InstrInfo->getRegisterInfo();
}
- virtual AMDGPUTargetLowering *getTargetLowering() const {
+ AMDGPUTargetLowering *getTargetLowering() const override {
return TLInfo.get();
}
- virtual const InstrItineraryData *getInstrItineraryData() const {
+ const InstrItineraryData *getInstrItineraryData() const override {
return InstrItins;
}
- virtual const DataLayout *getDataLayout() const { return &Layout; }
- virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+ const DataLayout *getDataLayout() const override { return &Layout; }
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
/// \brief Register R600 analysis passes with a pass manager.
- virtual void addAnalysisPasses(PassManagerBase &PM);
+ void addAnalysisPasses(PassManagerBase &PM) override;
};
} // End namespace llvm
diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
index 51225eb..ea78f43 100644
--- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
@@ -15,7 +15,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "AMDGPUtti"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -26,6 +25,8 @@
#include "llvm/Target/TargetLowering.h"
using namespace llvm;
+#define DEBUG_TYPE "AMDGPUtti"
+
// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
@@ -45,7 +46,7 @@ class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo {
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
public:
- AMDGPUTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+ AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
llvm_unreachable("This pass cannot be directly constructed");
}
@@ -55,9 +56,9 @@ public:
initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
}
- virtual void initializePass() override { pushTTIStack(this); }
+ void initializePass() override { pushTTIStack(this); }
- virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
TargetTransformInfo::getAnalysisUsage(AU);
}
@@ -65,15 +66,16 @@ public:
static char ID;
/// Provide necessary pointer adjustments for the two base classes.
- virtual void *getAdjustedAnalysisPointer(const void *ID) override {
+ void *getAdjustedAnalysisPointer(const void *ID) override {
if (ID == &TargetTransformInfo::ID)
return (TargetTransformInfo *)this;
return this;
}
- virtual bool hasBranchDivergence() const override;
+ bool hasBranchDivergence() const override;
- virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
+ void getUnrollingPreferences(Loop *L,
+ UnrollingPreferences &UP) const override;
/// @}
};
@@ -109,11 +111,11 @@ void AMDGPUTTI::getUnrollingPreferences(Loop *L,
// require us to use indirect addressing, which is slow and prone to
// compiler bugs. If this loop does an address calculation on an
// alloca ptr, then we want to use a higher than normal loop unroll
- // threshold. This will give SROA a better chance to eliminate these
- // allocas.
- //
- // Don't use the maximum allowed value here as it will make some
- // programs way too big.
+ // threshold. This will give SROA a better chance to eliminate these
+ // allocas.
+ //
+ // Don't use the maximum allowed value here as it will make some
+ // programs way too big.
UP.Threshold = 500;
}
}
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
index 21ca560..f3a0391 100644
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -8,8 +8,6 @@
/// \file
//==-----------------------------------------------------------------------===//
-#define DEBUG_TYPE "structcfg"
-
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "R600InstrInfo.h"
@@ -34,6 +32,8 @@
using namespace llvm;
+#define DEBUG_TYPE "structcfg"
+
#define DEFAULT_VEC_SLOTS 8
// TODO: move-begin.
@@ -135,15 +135,15 @@ public:
static char ID;
AMDGPUCFGStructurizer() :
- MachineFunctionPass(ID), TII(NULL), TRI(NULL) {
+ MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {
initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
}
- const char *getPassName() const {
+ const char *getPassName() const override {
return "AMDGPU Control Flow Graph structurizer Pass";
}
- void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addPreserved<MachineFunctionAnalysis>();
AU.addRequired<MachineFunctionAnalysis>();
AU.addRequired<MachineDominatorTree>();
@@ -159,7 +159,7 @@ public:
/// sure all loops have an exit block
bool prepare();
- bool runOnMachineFunction(MachineFunction &MF) {
+ bool runOnMachineFunction(MachineFunction &MF) override {
TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
TRI = &TII->getRegisterInfo();
DEBUG(MF.dump(););
@@ -168,7 +168,7 @@ public:
MLI = &getAnalysis<MachineLoopInfo>();
DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
MDT = &getAnalysis<MachineDominatorTree>();
- DEBUG(MDT->print(dbgs(), (const llvm::Module*)0););
+ DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr););
PDT = &getAnalysis<MachinePostDominatorTree>();
DEBUG(PDT->print(dbgs()););
prepare();
@@ -334,7 +334,7 @@ protected:
MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
void retireBlock(MachineBasicBlock *MBB);
- void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = NULL);
+ void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr);
MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
  /// This is a work-around solution for findNearestCommonDominator not available
@@ -361,7 +361,7 @@ MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
const {
LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
if (It == LLInfoMap.end())
- return NULL;
+ return nullptr;
return (*It).second;
}
@@ -632,7 +632,7 @@ MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
MachineInstr *MI = &*It;
if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
return MI;
- return NULL;
+ return nullptr;
}
MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
@@ -648,7 +648,7 @@ MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
break;
}
}
- return NULL;
+ return nullptr;
}
MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
@@ -658,7 +658,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
if (instr->getOpcode() == AMDGPU::RETURN)
return instr;
}
- return NULL;
+ return nullptr;
}
MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
@@ -668,7 +668,7 @@ MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
if (MI->getOpcode() == AMDGPU::CONTINUE)
return MI;
}
- return NULL;
+ return nullptr;
}
bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
@@ -819,7 +819,7 @@ bool AMDGPUCFGStructurizer::run() {
SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
It;
- MachineBasicBlock *SccBeginMBB = NULL;
+ MachineBasicBlock *SccBeginMBB = nullptr;
int SccNumBlk = 0; // The number of active blocks, init to a
// maximum possible number.
int SccNumIter; // Number of iteration in this SCC.
@@ -874,7 +874,7 @@ bool AMDGPUCFGStructurizer::run() {
}
if (ContNextScc)
- SccBeginMBB = NULL;
+ SccBeginMBB = nullptr;
} //while, "one iteration" over the function.
MachineBasicBlock *EntryMBB =
@@ -933,7 +933,7 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
MachineBasicBlock *MBB;
for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
++It, ++SccNum) {
- std::vector<MachineBasicBlock *> &SccNext = *It;
+ const std::vector<MachineBasicBlock *> &SccNext = *It;
for (std::vector<MachineBasicBlock *>::const_iterator
blockIter = SccNext.begin(), blockEnd = SccNext.end();
blockIter != blockEnd; ++blockIter) {
@@ -1026,7 +1026,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
} else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
// Triangle pattern, false is empty
LandBlk = FalseMBB;
- FalseMBB = NULL;
+ FalseMBB = nullptr;
} else if (FalseMBB->succ_size() == 1
&& *FalseMBB->succ_begin() == TrueMBB) {
// Triangle pattern, true is empty
@@ -1034,7 +1034,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
std::swap(TrueMBB, FalseMBB);
reversePredicateSetter(MBB->end());
LandBlk = FalseMBB;
- FalseMBB = NULL;
+ FalseMBB = nullptr;
} else if (FalseMBB->succ_size() == 1
&& isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
LandBlk = *FalseMBB->succ_begin();
@@ -1075,13 +1075,11 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
int AMDGPUCFGStructurizer::loopendPatternMatch() {
std::vector<MachineLoop *> NestedLoops;
- for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end();
- It != E; ++It) {
- df_iterator<MachineLoop *> LpIt = df_begin(*It),
- LpE = df_end(*It);
- for (; LpIt != LpE; ++LpIt)
- NestedLoops.push_back(*LpIt);
- }
+ for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); It != E;
+ ++It)
+ for (MachineLoop *ML : depth_first(*It))
+ NestedLoops.push_back(ML);
+
if (NestedLoops.size() == 0)
return 0;
@@ -1244,7 +1242,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
DEBUG(
dbgs() << " not working\n";
);
- DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : NULL;
+ DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
} // walk down the postDomTree
return Num;
@@ -1723,11 +1721,11 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
if (!LoopHeader || !LoopLatch)
- return NULL;
+ return nullptr;
MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
// Is LoopRep an infinite loop ?
if (!BranchMI || !isUncondBranch(BranchMI))
- return NULL;
+ return nullptr;
MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
FuncRep->push_back(DummyExitBlk); //insert to function
@@ -1860,7 +1858,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
if (!Node1 || !Node2)
- return NULL;
+ return nullptr;
Node1 = Node1->getIDom();
while (Node1) {
@@ -1869,7 +1867,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
Node1 = Node1->getIDom();
}
- return NULL;
+ return nullptr;
}
MachineBasicBlock *
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
index 0761ff4..7cea803 100644
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ b/lib/Target/R600/AMDILISelLowering.cpp
@@ -39,61 +39,55 @@ using namespace llvm;
// TargetLowering Class Implementation Begins
//===----------------------------------------------------------------------===//
void AMDGPUTargetLowering::InitAMDILLowering() {
- static const int types[] = {
- (int)MVT::i8,
- (int)MVT::i16,
- (int)MVT::i32,
- (int)MVT::f32,
- (int)MVT::f64,
- (int)MVT::i64,
- (int)MVT::v2i8,
- (int)MVT::v4i8,
- (int)MVT::v2i16,
- (int)MVT::v4i16,
- (int)MVT::v4f32,
- (int)MVT::v4i32,
- (int)MVT::v2f32,
- (int)MVT::v2i32,
- (int)MVT::v2f64,
- (int)MVT::v2i64
+ static const MVT::SimpleValueType types[] = {
+ MVT::i8,
+ MVT::i16,
+ MVT::i32,
+ MVT::f32,
+ MVT::f64,
+ MVT::i64,
+ MVT::v2i8,
+ MVT::v4i8,
+ MVT::v2i16,
+ MVT::v4i16,
+ MVT::v4f32,
+ MVT::v4i32,
+ MVT::v2f32,
+ MVT::v2i32,
+ MVT::v2f64,
+ MVT::v2i64
};
- static const int IntTypes[] = {
- (int)MVT::i8,
- (int)MVT::i16,
- (int)MVT::i32,
- (int)MVT::i64
+ static const MVT::SimpleValueType IntTypes[] = {
+ MVT::i8,
+ MVT::i16,
+ MVT::i32,
+ MVT::i64
};
- static const int FloatTypes[] = {
- (int)MVT::f32,
- (int)MVT::f64
+ static const MVT::SimpleValueType FloatTypes[] = {
+ MVT::f32,
+ MVT::f64
};
- static const int VectorTypes[] = {
- (int)MVT::v2i8,
- (int)MVT::v4i8,
- (int)MVT::v2i16,
- (int)MVT::v4i16,
- (int)MVT::v4f32,
- (int)MVT::v4i32,
- (int)MVT::v2f32,
- (int)MVT::v2i32,
- (int)MVT::v2f64,
- (int)MVT::v2i64
+ static const MVT::SimpleValueType VectorTypes[] = {
+ MVT::v2i8,
+ MVT::v4i8,
+ MVT::v2i16,
+ MVT::v4i16,
+ MVT::v4f32,
+ MVT::v4i32,
+ MVT::v2f32,
+ MVT::v2i32,
+ MVT::v2f64,
+ MVT::v2i64
};
- const size_t NumTypes = array_lengthof(types);
- const size_t NumFloatTypes = array_lengthof(FloatTypes);
- const size_t NumIntTypes = array_lengthof(IntTypes);
- const size_t NumVectorTypes = array_lengthof(VectorTypes);
const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
// These are the current register classes that are
// supported
- for (unsigned int x = 0; x < NumTypes; ++x) {
- MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
-
+ for (MVT VT : types) {
setOperationAction(ISD::SUBE, VT, Expand);
setOperationAction(ISD::SUBC, VT, Expand);
setOperationAction(ISD::ADDE, VT, Expand);
@@ -109,9 +103,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
setOperationAction(ISD::SDIV, VT, Custom);
}
}
- for (unsigned int x = 0; x < NumFloatTypes; ++x) {
- MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
-
+ for (MVT VT : FloatTypes) {
// IL does not have these operations for floating point types
setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
setOperationAction(ISD::SETOLT, VT, Expand);
@@ -124,9 +116,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
setOperationAction(ISD::SETULE, VT, Expand);
}
- for (unsigned int x = 0; x < NumIntTypes; ++x) {
- MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
-
+ for (MVT VT : IntTypes) {
// GPU also does not have divrem function for signed or unsigned
setOperationAction(ISD::SDIVREM, VT, Expand);
@@ -142,9 +132,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
setOperationAction(ISD::CTLZ, VT, Expand);
}
- for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) {
- MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii];
-
+ for (MVT VT : VectorTypes) {
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
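
The AMDILISelLowering hunks above replace index-based loops over int-cast type tables with range-based for loops over MVT::SimpleValueType arrays, dropping the casts and the separate array_lengthof counters. As a rough stand-alone illustration of that idiom (plain C++ with a hypothetical SimpleVT enum, not the LLVM types):

// Minimal sketch: iterate a static table of enum values directly,
// with a hypothetical SimpleVT standing in for MVT::SimpleValueType.
#include <cstdio>

enum class SimpleVT { i8, i16, i32, i64 };

static const SimpleVT IntTypes[] = { SimpleVT::i8, SimpleVT::i16,
                                     SimpleVT::i32, SimpleVT::i64 };

int main() {
  for (SimpleVT VT : IntTypes)   // range-based for: no index, no cast
    std::printf("type %d\n", static_cast<int>(VT));
  return 0;
}
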
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDILIntrinsicInfo.cpp
index 762ee39..fab4a3b 100644
--- a/lib/Target/R600/AMDILIntrinsicInfo.cpp
+++ b/lib/Target/R600/AMDILIntrinsicInfo.cpp
@@ -38,7 +38,7 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
};
if (IntrID < Intrinsic::num_intrinsics) {
- return 0;
+ return nullptr;
}
assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics
&& "Invalid intrinsic ID");
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDILIntrinsicInfo.h
index 35559e2..924275a 100644
--- a/lib/Target/R600/AMDILIntrinsicInfo.h
+++ b/lib/Target/R600/AMDILIntrinsicInfo.h
@@ -34,13 +34,13 @@ enum ID {
class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
public:
AMDGPUIntrinsicInfo(TargetMachine *tm);
- std::string getName(unsigned int IntrId, Type **Tys = 0,
- unsigned int numTys = 0) const;
- unsigned int lookupName(const char *Name, unsigned int Len) const;
- bool isOverloaded(unsigned int IID) const;
+ std::string getName(unsigned int IntrId, Type **Tys = nullptr,
+ unsigned int numTys = 0) const override;
+ unsigned int lookupName(const char *Name, unsigned int Len) const override;
+ bool isOverloaded(unsigned int IID) const override;
Function *getDeclaration(Module *M, unsigned int ID,
- Type **Tys = 0,
- unsigned int numTys = 0) const;
+ Type **Tys = nullptr,
+ unsigned int numTys = 0) const override;
};
} // end namespace llvm
diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td
index 658deb5..4a3e02e 100644
--- a/lib/Target/R600/AMDILIntrinsics.td
+++ b/lib/Target/R600/AMDILIntrinsics.td
@@ -92,10 +92,6 @@ let TargetPrefix = "AMDIL", isTarget = 1 in {
BinaryIntInt;
def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">,
BinaryIntInt;
- def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">,
- BinaryIntInt;
- def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">,
- BinaryIntInt;
def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">,
BinaryIntInt;
def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">,
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 93a5117..3c6fa5a 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -45,6 +45,7 @@ add_llvm_target(R600CodeGen
SIInstrInfo.cpp
SIISelLowering.cpp
SILowerControlFlow.cpp
+ SILowerI1Copies.cpp
SIMachineFunctionInfo.cpp
SIRegisterInfo.cpp
SITypeRewriter.cpp
diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td
index acd7bde..2630345 100644
--- a/lib/Target/R600/CaymanInstructions.td
+++ b/lib/Target/R600/CaymanInstructions.td
@@ -21,12 +21,14 @@ def isCayman : Predicate<"Subtarget.hasCaymanISA()">;
let Predicates = [isCayman] in {
def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24",
- [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))], VecALU
+ [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU
>;
def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24",
- [(set i32:$dst, (mul I24:$src0, I24:$src1))], VecALU
+ [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU
>;
+def : IMad24Pat<MULADD_INT24_cm>;
+
let isVector = 1 in {
def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
@@ -47,6 +49,7 @@ def COS_cm : COS_Common<0x8E>;
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
+defm : Expand24UBitOps<MULLO_UINT_cm, ADD_INT>;
// RECIP_UINT emulation for Cayman
// The multiplication scales from [0,1] to the unsigned integer range
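
The Cayman patterns above now select MUL_INT24/MULADD_INT24 from explicit AMDGPUmul_i24/AMDGPUmad_i24 nodes rather than matching a generic mul over I24 fragments. As a hedged sketch of the arithmetic these 24-bit nodes are generally understood to model (only the low 24 bits of each operand participate, sign-extended for the signed form; illustrative only, not LLVM code):

// 24-bit multiply semantics, purely illustrative.
#include <cassert>
#include <cstdint>

uint32_t mulU24(uint32_t A, uint32_t B) {
  return (A & 0xffffff) * (B & 0xffffff);        // low 32 bits of the product
}

int32_t mulI24(int32_t A, int32_t B) {
  auto SExt24 = [](int32_t V) {
    return static_cast<int32_t>(static_cast<uint32_t>(V) << 8) >> 8;
  };
  return static_cast<int32_t>(static_cast<int64_t>(SExt24(A)) * SExt24(B));
}

int main() {
  assert(mulU24(0x1000003, 5) == 15);  // bits above 23 are ignored
  assert(mulI24(0xffffff, 2) == -2);   // 0xffffff is -1 as a signed 24-bit value
}
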
diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td
index 6430ca6..2065441 100644
--- a/lib/Target/R600/EvergreenInstructions.td
+++ b/lib/Target/R600/EvergreenInstructions.td
@@ -75,6 +75,8 @@ def COS_eg : COS_Common<0x8E>;
def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
+defm : Expand24IBitOps<MULLO_INT_eg, ADD_INT>;
+
//===----------------------------------------------------------------------===//
// Memory read/write instructions
//===----------------------------------------------------------------------===//
@@ -273,7 +275,7 @@ def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
VecALU
>;
-def BFE_INT_eg : R600_3OP <0x4, "BFE_INT",
+def BFE_INT_eg : R600_3OP <0x5, "BFE_INT",
[(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))],
VecALU
>;
@@ -286,6 +288,13 @@ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
VecALU
>;
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+ (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>;
+def : Pat<(i32 (sext_inreg i32:$src, i8)),
+ (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>;
+def : Pat<(i32 (sext_inreg i32:$src, i16)),
+ (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>;
+
defm : BFIPatterns <BFI_INT_eg>;
def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
@@ -294,8 +303,11 @@ def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
>;
def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
- [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))], VecALU
+ [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU
>;
+
+def : UMad24Pat<MULADD_UINT24_eg>;
+
def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
@@ -309,7 +321,7 @@ def CNDGE_eg : CNDGE_Common<0x1B>;
def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
- [(set i32:$dst, (mul U24:$src0, U24:$src1))], VecALU
+ [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU
>;
def DOT4_eg : DOT4_Common<0xBE>;
defm CUBE_eg : CUBE_Common<0xC0>;
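
The new Evergreen patterns map sign_extend_inreg from i1/i8/i16 onto BFE_INT with offset 0 and the field width as operands, i.e. a signed bitfield extract of the low bits. A hedged, stand-alone sketch of the equivalent scalar arithmetic (relies on two's-complement narrowing and arithmetic right shift, as on the targets at hand):

// Illustrative sketch of what (BFE_INT src, 0, Width) yields for
// sign_extend_inreg.
#include <cassert>
#include <cstdint>

int32_t signExtendInReg(uint32_t Src, unsigned Width) {
  assert(Width >= 1 && Width < 32);
  unsigned Shift = 32 - Width;
  // Move the field to the top, then shift back down so its sign bit
  // replicates into the upper bits.
  return static_cast<int32_t>(Src << Shift) >> Shift;
}

int main() {
  assert(signExtendInReg(0x1, 1) == -1);       // i1:  1    -> -1
  assert(signExtendInReg(0x80, 8) == -128);    // i8:  0x80 -> -128
  assert(signExtendInReg(0x7fff, 16) == 0x7fff);
}
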
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 7105879..11ae091 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -12,6 +12,8 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
@@ -23,6 +25,21 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
printAnnotation(OS, Annot);
}
+void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatHex(MI->getOperand(OpNo).getImm() & 0xff);
+}
+
+void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff);
+}
+
+void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
+}
+
void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
switch (reg) {
case AMDGPU::VCC:
@@ -41,43 +58,78 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
break;
}
- // It's seems there's no way to use SIRegisterInfo here, and dealing with the
- // giant enum of all the different shifted sets of registers is pretty
- // unmanagable, so parse the name and reformat it to be prettier.
- StringRef Name(getRegisterName(reg));
-
- std::pair<StringRef, StringRef> Split = Name.split('_');
- StringRef SubRegName = Split.first;
- StringRef Rest = Split.second;
+ char Type;
+ unsigned NumRegs;
+
+ if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) {
+ Type = 'v';
+ NumRegs = 1;
+ } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) {
+ Type = 's';
+ NumRegs = 1;
+ } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) {
+ Type = 'v';
+ NumRegs = 2;
+ } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) {
+ Type = 's';
+ NumRegs = 2;
+ } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) {
+ Type = 'v';
+ NumRegs = 4;
+ } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) {
+ Type = 's';
+ NumRegs = 4;
+ } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) {
+ Type = 'v';
+ NumRegs = 3;
+ } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) {
+ Type = 'v';
+ NumRegs = 8;
+ } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) {
+ Type = 's';
+ NumRegs = 8;
+ } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) {
+ Type = 'v';
+ NumRegs = 16;
+ } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) {
+ Type = 's';
+ NumRegs = 16;
+ } else {
+ O << getRegisterName(reg);
+ return;
+ }
- if (SubRegName.size() <= 4) { // Must at least be as long as "SGPR"/"VGPR".
- O << Name;
+  // The low 8 bits of the encoding value are the register index, for both VGPRs and
+ // SGPRs.
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1);
+ if (NumRegs == 1) {
+ O << Type << RegIdx;
return;
}
- unsigned RegIndex;
- StringRef RegIndexStr = SubRegName.drop_front(4);
+ O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
+}
- if (RegIndexStr.getAsInteger(10, RegIndex)) {
- O << Name;
+void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) {
+ int32_t SImm = static_cast<int32_t>(Imm);
+ if (SImm >= -16 && SImm <= 64) {
+ O << SImm;
return;
}
- if (SubRegName.front() == 'V')
- O << 'v';
- else if (SubRegName.front() == 'S')
- O << 's';
- else {
- O << Name;
+ if (Imm == FloatToBits(1.0f) ||
+ Imm == FloatToBits(-1.0f) ||
+ Imm == FloatToBits(0.5f) ||
+ Imm == FloatToBits(-0.5f) ||
+ Imm == FloatToBits(2.0f) ||
+ Imm == FloatToBits(-2.0f) ||
+ Imm == FloatToBits(4.0f) ||
+ Imm == FloatToBits(-4.0f)) {
+ O << BitsToFloat(Imm);
return;
}
- if (Rest.empty()) // Only 1 32-bit register
- O << RegIndex;
- else {
- unsigned NumReg = Rest.count('_') + 2;
- O << '[' << RegIndex << ':' << (RegIndex + NumReg - 1) << ']';
- }
+ O << formatHex(static_cast<uint64_t>(Imm));
}
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -95,7 +147,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
break;
}
} else if (Op.isImm()) {
- O << Op.getImm();
+ printImmediate(Op.getImm(), O);
} else if (Op.isFPImm()) {
O << Op.getFPImm();
} else if (Op.isExpr()) {
@@ -106,6 +158,18 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned InputModifiers = MI->getOperand(OpNo).getImm();
+ if (InputModifiers & 0x1)
+ O << "-";
+ if (InputModifiers & 0x2)
+ O << "|";
+ printOperand(MI, OpNo + 1, O);
+ if (InputModifiers & 0x2)
+ O << "|";
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNum).getImm();
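
The new printImmediate keeps SI's inline-constant convention: small signed integers in [-16, 64] and a fixed set of float bit patterns print as readable literals, and anything else falls back to hex. A simplified, hedged restatement of that rule (plain C++, not the MC printer itself):

// Illustrative re-statement of the inline-constant printing rule.
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint32_t floatToBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

void printImm(uint32_t Imm) {
  int32_t SImm = static_cast<int32_t>(Imm);
  if (SImm >= -16 && SImm <= 64) {             // integer inline constants
    std::printf("%d\n", SImm);
    return;
  }
  for (float F : {1.0f, -1.0f, 0.5f, -0.5f, 2.0f, -2.0f, 4.0f, -4.0f}) {
    if (Imm == floatToBits(F)) {               // float inline constants
      std::printf("%g\n", F);
      return;
    }
  }
  std::printf("0x%x\n", static_cast<unsigned>(Imm));  // everything else: hex
}

int main() {
  printImm(63);                  // prints "63"
  printImm(floatToBits(0.5f));   // prints "0.5"
  printImm(0x12345678);          // prints "0x12345678"
}
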
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 1d24680..6ca7170 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -29,13 +29,18 @@ public:
void printInstruction(const MCInst *MI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
- virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
private:
- static void printRegOperand(unsigned RegNo, raw_ostream &O);
- static void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printRegOperand(unsigned RegNo, raw_ostream &O);
+ void printImmediate(uint32_t Imm, raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- static void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
StringRef Asm, StringRef Default = "");
static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index a6bb59f..489cec7 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -23,8 +23,8 @@ namespace {
class AMDGPUMCObjectWriter : public MCObjectWriter {
public:
AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { }
- virtual void ExecutePostLayoutBinding(MCAssembler &Asm,
- const MCAsmLayout &Layout) {
+ void ExecutePostLayoutBinding(MCAssembler &Asm,
+ const MCAsmLayout &Layout) override {
//XXX: Implement if necessary.
}
void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
@@ -34,7 +34,7 @@ public:
assert(!"Not implemented");
}
- virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout);
+ void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
};
@@ -43,19 +43,19 @@ public:
AMDGPUAsmBackend(const Target &T)
: MCAsmBackend() {}
- virtual unsigned getNumFixupKinds() const { return 0; };
- virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const;
- virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
- const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const {
+ unsigned getNumFixupKinds() const override { return 0; };
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
return false;
}
- virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
assert(!"Not implemented");
}
- virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; }
- virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
return true;
}
};
@@ -88,7 +88,7 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
public:
ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { }
- MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createAMDGPUELFObjectWriter(OS);
}
};
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index aee9bd1..78bbe0a 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -35,7 +35,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
Data16bitsDirective = ".short\t";
Data32bitsDirective = ".long\t";
Data64bitsDirective = ".quad\t";
- GPRel32Directive = 0;
+ GPRel32Directive = nullptr;
SunStyleELFSectionSwitchSyntax = true;
UsesELFSectionDirectiveForBSS = true;
@@ -58,5 +58,5 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
const MCSection*
AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const {
- return 0;
+ return nullptr;
}
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
index 22afd63..59aebec 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -22,7 +22,7 @@ class StringRef;
class AMDGPUMCAsmInfo : public MCAsmInfo {
public:
explicit AMDGPUMCAsmInfo(StringRef &TT);
- const MCSection* getNonexecutableStackSection(MCContext &CTX) const;
+ const MCSection* getNonexecutableStackSection(MCContext &CTX) const override;
};
} // namespace llvm
#endif // AMDGPUMCASMINFO_H
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 6592b0e..38a2956 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -24,6 +24,8 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
#define GET_INSTRINFO_MC_DESC
#include "AMDGPUGenInstrInfo.inc"
@@ -33,8 +35,6 @@
#define GET_REGINFO_MC_DESC
#include "AMDGPUGenRegisterInfo.inc"
-using namespace llvm;
-
static MCInstrInfo *createAMDGPUMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitAMDGPUMCInstrInfo(X);
diff --git a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt
index b1beab0..74b8ca0 100644
--- a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
+;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt -------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
@@ -19,5 +19,5 @@
type = Library
name = R600Desc
parent = R600
-required_libraries = R600AsmPrinter R600Info MC
+required_libraries = MC R600AsmPrinter R600Info Support
add_to_library_groups = R600
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 286c7d1..5e7cefe 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -41,14 +41,14 @@ public:
: MCII(mcii), MRI(mri) { }
/// \brief Encode the instruction and write it to the OS.
- virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ const MCSubtargetInfo &STI) const override;
/// \returns the encoding for an MCOperand.
- virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
private:
void EmitByte(unsigned int byte, raw_ostream &OS) const;
diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
index f42e978..ee02111 100644
--- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -54,14 +54,14 @@ public:
~SIMCCodeEmitter() { }
/// \brief Encode the instruction and write it to the OS.
- virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ const MCSubtargetInfo &STI) const override;
/// \returns the encoding for an MCOperand.
- virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
};
} // End anonymous namespace
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index fde4481..ce17d7c 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -106,3 +106,5 @@ def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>;
def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>;
def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>;
+
+def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>;
diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp
index 3d9015c..92bf0df 100644
--- a/lib/Target/R600/R600ClauseMergePass.cpp
+++ b/lib/Target/R600/R600ClauseMergePass.cpp
@@ -13,7 +13,6 @@
/// It needs to be called after IfCvt for best results.
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "r600mergeclause"
#include "AMDGPU.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
@@ -27,6 +26,8 @@
using namespace llvm;
+#define DEBUG_TYPE "r600mergeclause"
+
namespace {
static bool isCFAlu(const MachineInstr *MI) {
@@ -62,9 +63,9 @@ private:
public:
R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const;
+ const char *getPassName() const override;
};
char R600ClauseMergePass::ID = 0;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index f74bef3..d255e96 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -12,7 +12,6 @@
/// computing their address on the fly ; it also sets STACK_SIZE info.
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "r600cf"
#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "R600Defines.h"
@@ -26,6 +25,8 @@
using namespace llvm;
+#define DEBUG_TYPE "r600cf"
+
namespace {
struct CFStack {
@@ -468,13 +469,13 @@ private:
public:
R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
- TII (0), TRI(0),
+ TII (nullptr), TRI(nullptr),
ST(tm.getSubtarget<AMDGPUSubtarget>()) {
const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
MaxFetchInst = ST.getTexVTXClauseSize();
}
- virtual bool runOnMachineFunction(MachineFunction &MF) {
+ bool runOnMachineFunction(MachineFunction &MF) override {
TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
@@ -501,13 +502,13 @@ public:
DEBUG(dbgs() << CfCount << ":"; I->dump(););
FetchClauses.push_back(MakeFetchClause(MBB, I));
CfCount++;
- LastAlu.back() = 0;
+ LastAlu.back() = nullptr;
continue;
}
MachineBasicBlock::iterator MI = I;
if (MI->getOpcode() != AMDGPU::ENDIF)
- LastAlu.back() = 0;
+ LastAlu.back() = nullptr;
if (MI->getOpcode() == AMDGPU::CF_ALU)
LastAlu.back() = MI;
I++;
@@ -558,7 +559,7 @@ public:
break;
}
case AMDGPU::IF_PREDICATE_SET: {
- LastAlu.push_back(0);
+ LastAlu.push_back(nullptr);
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_JUMP))
.addImm(0)
@@ -665,7 +666,7 @@ public:
return false;
}
- const char *getPassName() const {
+ const char *getPassName() const override {
return "R600 Control Flow Finalizer Pass";
}
};
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index 5bd793a..38afebe 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -291,12 +291,12 @@ private:
public:
static char ID;
- R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(0), Address(0) {
+ R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
}
- virtual bool runOnMachineFunction(MachineFunction &MF) {
+ bool runOnMachineFunction(MachineFunction &MF) override {
TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
@@ -315,7 +315,7 @@ public:
return false;
}
- const char *getPassName() const {
+ const char *getPassName() const override {
return "R600 Emit Clause Markers Pass";
}
};
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index ca1189d..732b06d 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -38,11 +38,11 @@ private:
public:
R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
- TII(0) { }
+ TII(nullptr) { }
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const {
+ const char *getPassName() const override {
return "R600 Expand special instructions pass";
}
};
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 6405a82..d6c6830 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -82,9 +82,31 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT, MVT::i32, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
- setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
- setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
+
+ // Expand sign extension of vectors
+ if (!Subtarget->hasBFE())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
+
+ if (!Subtarget->hasBFE())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
+
+ if (!Subtarget->hasBFE())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
+
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, MVT::i32, Custom);
@@ -117,6 +139,11 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+  // These should be replaced by UDIVREM, but it does not happen automatically
+ // during Type Legalization
+ setOperationAction(ISD::UDIV, MVT::i64, Custom);
+ setOperationAction(ISD::UREM, MVT::i64, Custom);
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setBooleanContents(ZeroOrNegativeOneBooleanContent);
@@ -538,8 +565,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(2, MVT::i32), // SWZ_Z
DAG.getConstant(3, MVT::i32) // SWZ_W
};
- return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
- Args, 8);
+ return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
}
// default for switch(IntrinsicID)
@@ -689,7 +715,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
Op.getOperand(9),
Op.getOperand(10)
};
- return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
+ return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
}
case AMDGPUIntrinsic::AMDGPU_dp4: {
SDValue Args[8] = {
@@ -710,7 +736,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
DAG.getConstant(3, MVT::i32))
};
- return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
+ return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
}
case Intrinsic::r600_read_ngroups_x:
@@ -960,13 +986,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
}
-
- // Possible Min/Max pattern
- SDValue MinMax = LowerMinMax(Op, DAG);
- if (MinMax.getNode()) {
- return MinMax;
- }
-
  // If we make it this far it means we have no native instructions to handle
// this SELECT_CC, so we must lower it.
SDValue HWTrue, HWFalse;
@@ -1088,10 +1107,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(0, MVT::i32),
Mask
};
- SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
+ SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
SDValue Args[3] = { Chain, Input, DWordAddr };
return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
- Op->getVTList(), Args, 3, MemVT,
+ Op->getVTList(), Args, MemVT,
StoreNode->getMemOperand());
} else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
Value.getValueType().bitsGE(MVT::i32)) {
@@ -1131,7 +1150,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (ValueVT.isVector()) {
unsigned NumElemVT = ValueVT.getVectorNumElements();
EVT ElemVT = ValueVT.getVectorElementType();
- SDValue Stores[4];
+ SmallVector<SDValue, 4> Stores(NumElemVT);
assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
"vector width in load");
@@ -1148,7 +1167,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
Chain, Elem, Ptr,
DAG.getTargetConstant(Channel, MVT::i32));
}
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
} else {
if (ValueVT == MVT::i8) {
Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
@@ -1212,10 +1231,11 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
if (Ret.getNode()) {
- SDValue Ops[2];
- Ops[0] = Ret;
- Ops[1] = Chain;
- return DAG.getMergeValues(Ops, 2, DL);
+ SDValue Ops[2] = {
+ Ret,
+ Chain
+ };
+ return DAG.getMergeValues(Ops, DL);
}
@@ -1224,7 +1244,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
SplitVectorLoad(Op, DAG),
Chain
};
- return DAG.getMergeValues(MergedValues, 2, DL);
+ return DAG.getMergeValues(MergedValues, DL);
}
int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
@@ -1232,8 +1252,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
(LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
SDValue Result;
- if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
- isa<Constant>(LoadNode->getSrcValue()) ||
+ if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
+ isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
isa<ConstantSDNode>(Ptr)) {
SDValue Slots[4];
for (unsigned i = 0; i < 4; i++) {
@@ -1252,7 +1272,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
NewVT = VT;
NumElements = VT.getVectorNumElements();
}
- Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
+ Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
+ makeArrayRef(Slots, NumElements));
} else {
// non-constant ptr can't be folded, keeps it as a v4f32 load
Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
@@ -1268,10 +1289,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
}
SDValue MergedValues[2] = {
- Result,
- Chain
+ Result,
+ Chain
};
- return DAG.getMergeValues(MergedValues, 2, DL);
+ return DAG.getMergeValues(MergedValues, DL);
}
// For most operations returning SDValue() will result in the node being
@@ -1295,7 +1316,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
SDValue MergedValues[2] = { Sra, Chain };
- return DAG.getMergeValues(MergedValues, 2, DL);
+ return DAG.getMergeValues(MergedValues, DL);
}
if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
@@ -1332,7 +1353,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
Loads[i] = DAG.getUNDEF(ElemVT);
}
EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
- LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
+ LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
} else {
LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
Chain, Ptr,
@@ -1340,11 +1361,12 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
Op.getOperand(2));
}
- SDValue Ops[2];
- Ops[0] = LoweredLoad;
- Ops[1] = Chain;
+ SDValue Ops[2] = {
+ LoweredLoad,
+ Chain
+ };
- return DAG.getMergeValues(Ops, 2, DL);
+ return DAG.getMergeValues(Ops, DL);
}
/// XXX Only kernel functions are supported, so we can assume for now that
@@ -1365,8 +1387,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
SmallVector<ISD::InputArg, 8> LocalIns;
- getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
- LocalIns);
+ getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
AnalyzeFormalArguments(CCInfo, LocalIns);
@@ -1392,32 +1413,38 @@ SDValue R600TargetLowering::LowerFormalArguments(
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
- SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
+
+ // FIXME: This should really check the extload type, but the handling of
+  // extload vector parameters seems to be broken.
+ //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ ISD::LoadExtType Ext = ISD::SEXTLOAD;
+ SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
MachinePointerInfo(UndefValue::get(PtrTy)),
MemVT, false, false, 4);
- // 4 is the preferred alignment for
- // the CONSTANT memory space.
+
+ // 4 is the preferred alignment for the CONSTANT memory space.
InVals.push_back(Arg);
}
return Chain;
}
EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
- if (!VT.isVector()) return MVT::i32;
+ if (!VT.isVector())
+ return MVT::i32;
return VT.changeVectorElementTypeToInteger();
}
-static SDValue
-CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
- DenseMap<unsigned, unsigned> &RemapSwizzle) {
+static SDValue CompactSwizzlableVector(
+ SelectionDAG &DAG, SDValue VectorEntry,
+ DenseMap<unsigned, unsigned> &RemapSwizzle) {
assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
assert(RemapSwizzle.empty());
SDValue NewBldVec[4] = {
- VectorEntry.getOperand(0),
- VectorEntry.getOperand(1),
- VectorEntry.getOperand(2),
- VectorEntry.getOperand(3)
+ VectorEntry.getOperand(0),
+ VectorEntry.getOperand(1),
+ VectorEntry.getOperand(2),
+ VectorEntry.getOperand(3)
};
for (unsigned i = 0; i < 4; i++) {
@@ -1448,7 +1475,7 @@ CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
}
return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
- VectorEntry.getValueType(), NewBldVec, 4);
+ VectorEntry.getValueType(), NewBldVec);
}
static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
@@ -1486,7 +1513,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
}
return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
- VectorEntry.getValueType(), NewBldVec, 4);
+ VectorEntry.getValueType(), NewBldVec);
}
@@ -1524,6 +1551,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
+ default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
// (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
case ISD::FP_ROUND: {
SDValue Arg = N->getOperand(0);
@@ -1613,8 +1641,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
}
// Return the new vector
- return DAG.getNode(ISD::BUILD_VECTOR, dl,
- VT, &Ops[0], Ops.size());
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
// Extract_vec (Build_vector) generated by custom lowering
@@ -1638,6 +1665,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
}
case ISD::SELECT_CC: {
+ // Try common optimizations
+ SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+ if (Ret.getNode())
+ return Ret;
+
// fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
// selectcc x, y, a, b, inv(cc)
//
@@ -1697,7 +1729,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
};
SDLoc DL(N);
NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
- return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
+ return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
}
case AMDGPUISD::TEXTURE_FETCH: {
SDValue Arg = N->getOperand(1);
@@ -1727,10 +1759,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
};
NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
- NewArgs, 19);
+ NewArgs);
}
}
- return SDValue();
+
+ return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
static bool
@@ -1779,8 +1812,7 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
};
std::vector<unsigned> Consts;
- for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
- int OtherSrcIdx = SrcIndices[i];
+ for (int OtherSrcIdx : SrcIndices) {
int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
if (OtherSrcIdx < 0 || OtherSelIdx < 0)
continue;
@@ -1791,14 +1823,14 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
if (RegisterSDNode *Reg =
dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
if (Reg->getReg() == AMDGPU::ALU_CONST) {
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
- ParentNode->getOperand(OtherSelIdx));
+ ConstantSDNode *Cst
+ = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
Consts.push_back(Cst->getZExtValue());
}
}
}
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
+ ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
Consts.push_back(Cst->getZExtValue());
if (!TII->fitsConstReadLimitations(Consts)) {
return false;
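
Several call sites in R600ISelLowering.cpp above switch from passing a raw pointer plus an element count (getNode(..., Ops, N), getMergeValues(Ops, 2, DL)) to the ArrayRef-taking overloads, where the length is deduced from the array itself or narrowed explicitly with makeArrayRef. A stand-alone sketch of that calling pattern with a hypothetical Span type (not LLVM's ArrayRef):

// Illustrative span-style parameter: length deduced from a C array, or an
// explicit prefix view, replacing pointer-plus-count call sites.
#include <cstddef>
#include <cstdio>

template <typename T>
struct Span {
  const T *Data;
  std::size_t Size;
  template <std::size_t N>
  Span(const T (&Arr)[N]) : Data(Arr), Size(N) {}        // whole array
  Span(const T *D, std::size_t S) : Data(D), Size(S) {}  // prefix view
};

static void printAll(Span<int> Ops) {
  for (std::size_t I = 0; I != Ops.Size; ++I)
    std::printf("%d ", Ops.Data[I]);
  std::printf("\n");
}

int main() {
  int Ops[4] = {10, 20, 30, 40};
  printAll(Ops);       // old style would have been printAll(Ops, 4)
  printAll({Ops, 2});  // like makeArrayRef(Slots, NumElements)
}
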
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index 22ef728..a8a464f 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -24,21 +24,21 @@ class R600InstrInfo;
class R600TargetLowering : public AMDGPUTargetLowering {
public:
R600TargetLowering(TargetMachine &TM);
- virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock * BB) const;
- virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
- virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- virtual void ReplaceNodeResults(SDNode * N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) const override;
- virtual SDValue LowerFormalArguments(
- SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
- virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const;
+ MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock * BB) const override;
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ void ReplaceNodeResults(SDNode * N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+ SDValue LowerFormalArguments(
+ SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SDLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ EVT getSetCCResultType(LLVMContext &, EVT VT) const override;
private:
unsigned Gen;
/// Each OpenCL kernel has nine implicit parameters that are stored in the
@@ -66,7 +66,7 @@ private:
void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
unsigned &Channel, unsigned &PtrIncr) const;
bool isZero(SDValue Op) const;
- virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
+ SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
};
} // End namespace llvm;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 0281dd0..b0d9ae3 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -23,11 +23,11 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenDFAPacketizer.inc"
-using namespace llvm;
-
R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
: AMDGPUInstrInfo(tm),
RI(tm),
@@ -677,7 +677,7 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
return MI;
}
- return NULL;
+ return nullptr;
}
static
@@ -797,7 +797,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
DebugLoc DL) const {
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
- if (FBB == 0) {
+ if (!FBB) {
if (Cond.empty()) {
BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
return 1;
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index d5ff4de..b5304a0 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -50,13 +50,13 @@ namespace llvm {
explicit R600InstrInfo(AMDGPUTargetMachine &tm);
- const R600RegisterInfo &getRegisterInfo() const;
- virtual void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const;
+ const R600RegisterInfo &getRegisterInfo() const override;
+ void copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) const;
+ MachineBasicBlock::iterator MBBI) const override;
bool isTrig(const MachineInstr &MI) const;
bool isPlaceHolderOpcode(unsigned opcode) const;
@@ -142,79 +142,79 @@ namespace llvm {
/// instruction slots within an instruction group.
bool isVector(const MachineInstr &MI) const;
- virtual unsigned getIEQOpcode() const;
- virtual bool isMov(unsigned Opcode) const;
+ unsigned getIEQOpcode() const override;
+ bool isMov(unsigned Opcode) const override;
DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM,
- const ScheduleDAG *DAG) const;
+ const ScheduleDAG *DAG) const override;
- bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const;
+ SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override;
- unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const;
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const override;
- unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
- bool isPredicated(const MachineInstr *MI) const;
+ bool isPredicated(const MachineInstr *MI) const override;
- bool isPredicable(MachineInstr *MI) const;
+ bool isPredicable(MachineInstr *MI) const override;
bool
isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
- const BranchProbability &Probability) const;
+ const BranchProbability &Probability) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
unsigned ExtraPredCycles,
- const BranchProbability &Probability) const ;
+ const BranchProbability &Probability) const override ;
bool
isProfitableToIfCvt(MachineBasicBlock &TMBB,
unsigned NumTCycles, unsigned ExtraTCycles,
MachineBasicBlock &FMBB,
unsigned NumFCycles, unsigned ExtraFCycles,
- const BranchProbability &Probability) const;
+ const BranchProbability &Probability) const override;
bool DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const;
+ std::vector<MachineOperand> &Pred) const override;
bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
- const SmallVectorImpl<MachineOperand> &Pred2) const;
+ const SmallVectorImpl<MachineOperand> &Pred2) const override;
bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
- MachineBasicBlock &FMBB) const;
+ MachineBasicBlock &FMBB) const override;
bool PredicateInstruction(MachineInstr *MI,
- const SmallVectorImpl<MachineOperand> &Pred) const;
+ const SmallVectorImpl<MachineOperand> &Pred) const override;
- unsigned int getPredicationCost(const MachineInstr *) const;
+ unsigned int getPredicationCost(const MachineInstr *) const override;
unsigned int getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr *MI,
- unsigned *PredCost = 0) const;
+ unsigned *PredCost = nullptr) const override;
- virtual int getInstrLatency(const InstrItineraryData *ItinData,
- SDNode *Node) const { return 1;}
+ int getInstrLatency(const InstrItineraryData *ItinData,
+ SDNode *Node) const override { return 1;}
  /// \brief Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const;
- virtual unsigned calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const;
+ unsigned calculateIndirectAddress(unsigned RegIndex,
+ unsigned Channel) const override;
- virtual const TargetRegisterClass *getIndirectAddrRegClass() const;
+ const TargetRegisterClass *getIndirectAddrRegClass() const override;
- virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg, unsigned Address,
- unsigned OffsetReg) const;
+ MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg) const override;
- virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg, unsigned Address,
- unsigned OffsetReg) const;
+ MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg) const override;
unsigned getMaxAlusPerClause() const;
@@ -244,7 +244,7 @@ namespace llvm {
MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
- unsigned DstReg, unsigned SrcReg) const;
+ unsigned DstReg, unsigned SrcReg) const override;
/// \brief Get the index of Op in the MachineInstr.
///
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index d2075c0..590fde2 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1625,6 +1625,12 @@ def : DwordAddrPat <i32, R600_Reg32>;
} // End isR600toCayman Predicate
+let Predicates = [isR600] in {
+// Intrinsic patterns
+defm : Expand24IBitOps<MULLO_INT_r600, ADD_INT>;
+defm : Expand24UBitOps<MULLO_UINT_r600, ADD_INT>;
+} // End isR600
+
def getLDSNoRetOp : InstrMapping {
let FilterClass = "R600_LDS_1A1D";
let RowFields = ["BaseOp"];
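
The Expand24IBitOps/Expand24UBitOps instantiations above appear to lower the 24-bit mul/mad nodes with an ordinary 32-bit multiply-low followed by an add (MULLO_INT/MULLO_UINT plus ADD_INT) on R600, which is sound whenever the operands already fit in 24 bits: the low 32 bits of the full product then match the dedicated 24-bit multiply. A hedged arithmetic check of that equivalence (names hypothetical, not the TableGen expansion itself):

// Illustrative check: for operands that fit in 24 bits, a plain 32-bit
// multiply-low plus add equals the dedicated mad_u24 result.
#include <cassert>
#include <cstdint>

uint32_t madU24Native(uint32_t A, uint32_t B, uint32_t C) {
  return (A & 0xffffff) * (B & 0xffffff) + C;   // dedicated MULADD_UINT24
}

uint32_t madU24Expanded(uint32_t A, uint32_t B, uint32_t C) {
  return A * B + C;                             // MULLO_UINT + ADD_INT
}

int main() {
  for (uint32_t A : {0u, 1u, 12345u, 0xffffffu})
    for (uint32_t B : {0u, 7u, 0xffffffu})
      assert(madU24Native(A, B, 5) == madU24Expanded(A, B, 5));
}
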
diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
index c1bec0a..b0ae22e 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.h
+++ b/lib/Target/R600/R600MachineFunctionInfo.h
@@ -21,7 +21,7 @@
namespace llvm {
class R600MachineFunctionInfo : public AMDGPUMachineFunction {
- virtual void anchor();
+ void anchor() override;
public:
R600MachineFunctionInfo(const MachineFunction &MF);
SmallVector<unsigned, 4> LiveOuts;
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index d3ffb50..d1655d1 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -12,8 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "misched"
-
#include "R600MachineScheduler.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -23,6 +21,8 @@
using namespace llvm;
+#define DEBUG_TYPE "misched"
+
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
DAG = static_cast<ScheduleDAGMILive*>(dag);
@@ -56,7 +56,7 @@ unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
}
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
- SUnit *SU = 0;
+ SUnit *SU = nullptr;
NextInstKind = IDOther;
IsTopNode = false;
@@ -316,7 +316,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
if (Q.empty())
- return NULL;
+ return nullptr;
for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
It != E; ++It) {
SUnit *SU = *It;
@@ -331,7 +331,7 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
InstructionsGroupCandidate.pop_back();
}
}
- return NULL;
+ return nullptr;
}
void R600SchedStrategy::LoadAlu() {
@@ -448,11 +448,11 @@ SUnit* R600SchedStrategy::pickAlu() {
}
PrepareNextSlot();
}
- return NULL;
+ return nullptr;
}
SUnit* R600SchedStrategy::pickOther(int QID) {
- SUnit *SU = 0;
+ SUnit *SU = nullptr;
std::vector<SUnit *> &AQ = Available[QID];
if (AQ.empty()) {
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index b909ff7..fd475af 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -68,17 +68,16 @@ class R600SchedStrategy : public MachineSchedStrategy {
public:
R600SchedStrategy() :
- DAG(0), TII(0), TRI(0), MRI(0) {
+ DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) {
}
- virtual ~R600SchedStrategy() {
- }
+ virtual ~R600SchedStrategy() {}
- virtual void initialize(ScheduleDAGMI *dag);
- virtual SUnit *pickNode(bool &IsTopNode);
- virtual void schedNode(SUnit *SU, bool IsTopNode);
- virtual void releaseTopNode(SUnit *SU);
- virtual void releaseBottomNode(SUnit *SU);
+ void initialize(ScheduleDAGMI *dag) override;
+ SUnit *pickNode(bool &IsTopNode) override;
+ void schedNode(SUnit *SU, bool IsTopNode) override;
+ void releaseTopNode(SUnit *SU) override;
+ void releaseBottomNode(SUnit *SU) override;
private:
std::vector<MachineInstr *> InstructionsGroupCandidate;
diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
index 767e5e3..2314136 100644
--- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
@@ -27,7 +27,6 @@
/// to reduce MOV count.
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "vec-merger"
#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "R600InstrInfo.h"
@@ -42,6 +41,8 @@
using namespace llvm;
+#define DEBUG_TYPE "vec-merger"
+
namespace {
static bool
@@ -107,9 +108,9 @@ private:
public:
static char ID;
R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
- TII(0) { }
+ TII(nullptr) { }
- void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
@@ -118,11 +119,11 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
- const char *getPassName() const {
+ const char *getPassName() const override {
return "R600 Vector Registers Merge Pass";
}
- bool runOnMachineFunction(MachineFunction &Fn);
+ bool runOnMachineFunction(MachineFunction &Fn) override;
};
char R600VectorRegMerger::ID = 0;
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index b7b7610..c2f6c03 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -14,7 +14,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "packets"
#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "R600InstrInfo.h"
@@ -28,6 +27,8 @@
using namespace llvm;
+#define DEBUG_TYPE "packets"
+
namespace {
class R600Packetizer : public MachineFunctionPass {
@@ -36,7 +37,7 @@ public:
static char ID;
R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {}
- void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
@@ -45,11 +46,11 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
- const char *getPassName() const {
+ const char *getPassName() const override {
return "R600 Packetizer";
}
- bool runOnMachineFunction(MachineFunction &Fn);
+ bool runOnMachineFunction(MachineFunction &Fn) override;
};
char R600Packetizer::ID = 0;
@@ -155,18 +156,19 @@ public:
}
// initPacketizerState - initialize some internal flags.
- void initPacketizerState() {
+ void initPacketizerState() override {
ConsideredInstUsesAlreadyWrittenVectorElement = false;
}
// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
- bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) {
+ bool ignorePseudoInstruction(MachineInstr *MI,
+ MachineBasicBlock *MBB) override {
return false;
}
// isSoloInstruction - return true if instruction MI can not be packetized
// with any other instruction, which means that MI itself is a packet.
- bool isSoloInstruction(MachineInstr *MI) {
+ bool isSoloInstruction(MachineInstr *MI) override {
if (TII->isVector(*MI))
return true;
if (!TII->isALUInstr(MI->getOpcode()))
@@ -182,7 +184,7 @@ public:
// isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
// together.
- bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+ bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override {
MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
if (getSlot(MII) == getSlot(MIJ))
ConsideredInstUsesAlreadyWrittenVectorElement = true;
@@ -219,7 +221,9 @@ public:
  // isLegalToPruneDependencies - Is it legal to prune dependence between SUI
// and SUJ.
- bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {return false;}
+ bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override {
+ return false;
+ }
void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last);
@@ -288,7 +292,7 @@ public:
return true;
}
- MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+ MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override {
MachineBasicBlock::iterator FirstInBundle =
CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front();
const DenseMap<unsigned, unsigned> &PV =
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index c74c49e..52e1a4b 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -28,27 +28,28 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
R600RegisterInfo(AMDGPUTargetMachine &tm);
- virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
/// \param RC is an AMDIL reg class.
///
/// \returns the R600 reg class that is equivalent to \p RC.
- virtual const TargetRegisterClass *getISARegClass(
- const TargetRegisterClass *RC) const;
+ const TargetRegisterClass *getISARegClass(
+ const TargetRegisterClass *RC) const override;
/// \brief get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
- virtual unsigned getHWRegIndex(unsigned Reg) const;
+ unsigned getHWRegIndex(unsigned Reg) const override;
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
- virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
+ const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
- virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
+ const RegClassWeight &
+ getRegClassWeight(const TargetRegisterClass *RC) const override;
  // \returns true if \p Reg can be defined in one ALU clause and used in another.
- virtual bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
+ bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
};
} // End namespace llvm
diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
index 9d24404..419ec8b 100644
--- a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
+++ b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
@@ -209,7 +209,7 @@ public:
FunctionPass(ID) {
}
- virtual bool doInitialization(Module &M) {
+ bool doInitialization(Module &M) override {
LLVMContext &Ctx = M.getContext();
Mod = &M;
FloatType = Type::getFloatTy(Ctx);
@@ -245,16 +245,16 @@ public:
return false;
}
- virtual bool runOnFunction(Function &F) {
+ bool runOnFunction(Function &F) override {
visit(F);
return false;
}
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "R600 Texture Intrinsics Replacer";
}
- void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
}
void visitCallInst(CallInst &I) {
diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp
index f9214a8..d6e4451 100644
--- a/lib/Target/R600/SIAnnotateControlFlow.cpp
+++ b/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -12,8 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "si-annotate-control-flow"
-
#include "AMDGPU.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/IR/Constants.h"
@@ -26,6 +24,8 @@
using namespace llvm;
+#define DEBUG_TYPE "si-annotate-control-flow"
+
namespace {
// Complex types used in this pass
@@ -91,15 +91,15 @@ public:
SIAnnotateControlFlow():
FunctionPass(ID) { }
- virtual bool doInitialization(Module &M);
+ bool doInitialization(Module &M) override;
- virtual bool runOnFunction(Function &F);
+ bool runOnFunction(Function &F) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "SI annotate control flow";
}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
@@ -118,7 +118,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
Void = Type::getVoidTy(Context);
Boolean = Type::getInt1Ty(Context);
Int64 = Type::getInt64Ty(Context);
- ReturnStruct = StructType::get(Boolean, Int64, (Type *)0);
+ ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr);
BoolTrue = ConstantInt::getTrue(Context);
BoolFalse = ConstantInt::getFalse(Context);
@@ -126,25 +126,25 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
Int64Zero = ConstantInt::get(Int64, 0);
If = M.getOrInsertFunction(
- IfIntrinsic, ReturnStruct, Boolean, (Type *)0);
+ IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr);
Else = M.getOrInsertFunction(
- ElseIntrinsic, ReturnStruct, Int64, (Type *)0);
+ ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr);
Break = M.getOrInsertFunction(
- BreakIntrinsic, Int64, Int64, (Type *)0);
+ BreakIntrinsic, Int64, Int64, (Type *)nullptr);
IfBreak = M.getOrInsertFunction(
- IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0);
+ IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
ElseBreak = M.getOrInsertFunction(
- ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0);
+ ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
Loop = M.getOrInsertFunction(
- LoopIntrinsic, Boolean, Int64, (Type *)0);
+ LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
EndCf = M.getOrInsertFunction(
- EndCfIntrinsic, Void, Int64, (Type *)0);
+ EndCfIntrinsic, Void, Int64, (Type *)nullptr);
return false;
}
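The hunk above only swaps the null sentinel from (Type *)0 to (Type *)nullptr; the variadic call shape is unchanged. A minimal sketch of that sentinel-terminated form, assuming the LLVM 3.5-era Module::getOrInsertFunction signature (the intrinsic name and parameter types below are placeholders, not taken from this patch):

#include "llvm/IR/Module.h"

// Variadic getOrInsertFunction: return type, then parameter types,
// terminated by a null Type* sentinel.
llvm::Constant *declareExample(llvm::Module &M) {
  llvm::LLVMContext &Ctx = M.getContext();
  return M.getOrInsertFunction("llvm.amdgcn.example",        // hypothetical name
                               llvm::Type::getInt64Ty(Ctx),  // return type
                               llvm::Type::getInt1Ty(Ctx),   // param 0
                               llvm::Type::getInt64Ty(Ctx),  // param 1
                               (llvm::Type *)nullptr);       // sentinel
}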
diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp
index 402f1f4..5f71453 100644
--- a/lib/Target/R600/SIFixSGPRCopies.cpp
+++ b/lib/Target/R600/SIFixSGPRCopies.cpp
@@ -65,7 +65,6 @@
/// ultimately led to the creation of an illegal COPY.
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "sgpr-copies"
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -77,6 +76,8 @@
using namespace llvm;
+#define DEBUG_TYPE "sgpr-copies"
+
namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
@@ -97,9 +98,9 @@ private:
public:
SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { }
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const {
+ const char *getPassName() const override {
return "SI Fix SGPR copies";
}
@@ -184,7 +185,8 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
const TargetRegisterClass *SrcRC;
if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
- DstRC == &AMDGPU::M0RegRegClass)
+ DstRC == &AMDGPU::M0RegRegClass ||
+ MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass)
return false;
SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg);
@@ -256,6 +258,19 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
TII->moveToVALU(MI);
break;
}
+ case AMDGPU::INSERT_SUBREG: {
+ const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
+ DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
+ Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
+ Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
+ if (TRI->isSGPRClass(DstRC) &&
+ (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
+ DEBUG(dbgs() << " Fixing INSERT_SUBREG:\n");
+ DEBUG(MI.print(dbgs()));
+ TII->moveToVALU(MI);
+ }
+ break;
+ }
}
}
}
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 0b55411..c9e247c 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -29,22 +29,21 @@ using namespace llvm;
SITargetLowering::SITargetLowering(TargetMachine &TM) :
AMDGPUTargetLowering(TM) {
- addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);
+ addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
+ addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
- addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass);
- addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);
+ addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
- addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
- addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
- addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);
+ addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
+ addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
+ addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
- addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
- addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
@@ -78,8 +77,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::ADDC, MVT::i32, Legal);
setOperationAction(ISD::ADDE, MVT::i32, Legal);
- setOperationAction(ISD::BITCAST, MVT::i128, Legal);
-
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
@@ -99,10 +96,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::i1, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::i64, Custom);
- setOperationAction(ISD::STORE, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
@@ -119,6 +117,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
@@ -126,39 +140,48 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
setTruncStoreAction(MVT::i32, MVT::i8, Custom);
setTruncStoreAction(MVT::i32, MVT::i16, Custom);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
- setTruncStoreAction(MVT::i128, MVT::i64, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+ setOperationAction(ISD::LOAD, MVT::i1, Custom);
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+ // These should use UDIVREM, so set them to expand
+ setOperationAction(ISD::UDIV, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
MVT VecTypes[] = {
MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
};
- const size_t NumVecTypes = array_lengthof(VecTypes);
- for (unsigned Type = 0; Type < NumVecTypes; ++Type) {
+ for (MVT VT : VecTypes) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch(Op) {
case ISD::LOAD:
@@ -172,7 +195,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
case ISD::EXTRACT_SUBVECTOR:
break;
default:
- setOperationAction(Op, VecTypes[Type], Expand);
+ setOperationAction(Op, VT, Expand);
break;
}
}
@@ -189,6 +212,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::f64, Legal);
}
setTargetDAGCombine(ISD::SELECT_CC);
@@ -204,10 +228,40 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
unsigned AddrSpace,
bool *IsFast) const {
+ if (IsFast)
+ *IsFast = false;
+
// XXX: This depends on the address space and also we may want to revisit
// the alignment values we specify in the DataLayout.
+
+ // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
+ // which isn't a simple VT.
if (!VT.isSimple() || VT == MVT::Other)
return false;
+
+ // XXX - CI changes say "Support for unaligned memory accesses" but I don't
+ // see what for specifically. The wording everywhere else seems to be the
+ // same.
+
+ // 3.6.4 - Operations using pairs of VGPRs (for example: double-floats) have
+ // no alignment restrictions.
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
+ // Using any pair of GPRs should be the same as any other pair.
+ if (IsFast)
+ *IsFast = true;
+ return VT.bitsGE(MVT::i64);
+ }
+
+ // XXX - The only mention I see of this in the ISA manual is for LDS direct
+ // reads the "byte address and must be dword aligned". Is it also true for the
+ // normal loads and stores?
+ if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS)
+ return false;
+
+ // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
+ // byte-address are ignored, thus forcing Dword alignment.
+ if (IsFast)
+ *IsFast = true;
return VT.bitsGT(MVT::i32);
}
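Condensing the address-space logic above: private (scratch) accesses of 64 bits or more are allowed and fast, local (LDS) accesses are conservatively rejected, and everything else is allowed for anything wider than a dword because the hardware ignores the two LSBs of the byte address. A standalone sketch of that decision table (the enum and helper name are illustrative, not LLVM types):

enum AddrSpace { PRIVATE, LOCAL, OTHER };

// Mirrors the rewritten allowsUnalignedMemoryAccesses decision above.
bool allowsUnaligned(AddrSpace AS, unsigned BitWidth, bool &Fast) {
  Fast = false;
  if (AS == PRIVATE) {        // VGPR pairs have no alignment restriction
    Fast = true;
    return BitWidth >= 64;    // VT.bitsGE(MVT::i64)
  }
  if (AS == LOCAL)            // LDS: conservatively disallowed
    return false;
  Fast = true;                // dword+ accesses ignore the two LSBs
  return BitWidth > 32;       // VT.bitsGT(MVT::i32)
}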
@@ -224,7 +278,7 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDLoc DL, SDValue Chain,
- unsigned Offset) const {
+ unsigned Offset, bool Signed) const {
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
AMDGPUAS::CONSTANT_ADDRESS);
@@ -232,7 +286,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(Offset, MVT::i64));
- return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr,
+ return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
false, false, MemVT.getSizeInBits() >> 3);
@@ -340,7 +394,8 @@ SDValue SITargetLowering::LowerFormalArguments(
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
- 36 + VA.getLocMemOffset());
+ 36 + VA.getLocMemOffset(),
+ Ins[i].Flags.isSExt());
InVals.push_back(Arg);
continue;
}
@@ -381,8 +436,7 @@ SDValue SITargetLowering::LowerFormalArguments(
for (unsigned j = 0; j != NumElements; ++j)
Regs.push_back(DAG.getUNDEF(VT));
- InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
- Regs.data(), Regs.size()));
+ InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
continue;
}
@@ -395,15 +449,15 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
MachineBasicBlock::iterator I = *MI;
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
switch (MI->getOpcode()) {
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case AMDGPU::BRANCH: return BB;
case AMDGPU::SI_ADDR64_RSRC: {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
unsigned SuperReg = MI->getOperand(0).getReg();
unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
@@ -428,9 +482,7 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MI->eraseFromParent();
break;
}
- case AMDGPU::V_SUB_F64: {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ case AMDGPU::V_SUB_F64:
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
@@ -442,11 +494,9 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
.addImm(2); /* NEG */
MI->eraseFromParent();
break;
- }
+
case AMDGPU::SI_RegisterStorePseudo: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
MachineInstrBuilder MIB =
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
@@ -455,6 +505,50 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MIB.addOperand(MI->getOperand(i));
MI->eraseFromParent();
+ break;
+ }
+ case AMDGPU::FABS_SI: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+ Reg)
+ .addImm(0x7fffffff);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32),
+ MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addReg(Reg);
+ MI->eraseFromParent();
+ break;
+ }
+ case AMDGPU::FNEG_SI: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+ Reg)
+ .addImm(0x80000000);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32),
+ MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addReg(Reg);
+ MI->eraseFromParent();
+ break;
+ }
+ case AMDGPU::FCLAMP_SI: {
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64),
+ MI->getOperand(0).getReg())
+ .addImm(0) // SRC0 modifiers
+ .addOperand(MI->getOperand(1))
+ .addImm(0) // SRC1 modifiers
+ .addImm(0) // SRC1
+ .addImm(1) // CLAMP
+ .addImm(0); // OMOD
+ MI->eraseFromParent();
}
}
return BB;
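The FABS_SI and FNEG_SI expansions above lean on the IEEE-754 single-precision layout: ANDing with 0x7fffffff clears the sign bit (absolute value) and XORing with 0x80000000 flips it (negation), which is exactly what the V_AND_B32/V_XOR_B32 pairs do. A scalar C++ sketch of the same bit trick, for illustration only:

#include <cstdint>
#include <cstring>

// fabs via V_AND_B32-style masking: clear bit 31.
float fabsBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  Bits &= 0x7fffffffu;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

// fneg via V_XOR_B32-style masking: flip bit 31.
float fnegBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  Bits ^= 0x80000000u;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}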
@@ -510,7 +604,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
SplitVectorLoad(Op, DAG),
Load->getChain()
};
- return DAG.getMergeValues(MergedValues, 2, SDLoc(Op));
+ return DAG.getMergeValues(MergedValues, SDLoc(Op));
} else {
return LowerLOAD(Op, DAG);
}
@@ -533,23 +627,23 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (IntrinsicID) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case Intrinsic::r600_read_ngroups_x:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false);
case Intrinsic::r600_read_ngroups_y:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false);
case Intrinsic::r600_read_ngroups_z:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false);
case Intrinsic::r600_read_global_size_x:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false);
case Intrinsic::r600_read_global_size_y:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false);
case Intrinsic::r600_read_global_size_z:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false);
case Intrinsic::r600_read_local_size_x:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false);
case Intrinsic::r600_read_local_size_y:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false);
case Intrinsic::r600_read_local_size_z:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
@@ -570,7 +664,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
AMDGPU::VGPR2, VT);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops [] = {
- ResourceDescriptorToi128(Op.getOperand(1), DAG),
+ Op.getOperand(1),
Op.getOperand(2)
};
@@ -579,7 +673,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
VT.getSizeInBits() / 8, 4);
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
- Op->getVTList(), Ops, 2, VT, MMO);
+ Op->getVTList(), Ops, VT, MMO);
}
case AMDGPUIntrinsic::SI_sample:
return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
@@ -591,7 +685,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
case AMDGPUIntrinsic::SI_vs_load_input:
return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
- ResourceDescriptorToi128(Op.getOperand(1), DAG),
+ Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
}
@@ -606,7 +700,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Ops [] = {
Chain,
- ResourceDescriptorToi128(Op.getOperand(2), DAG),
+ Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4),
Op.getOperand(5),
@@ -627,8 +721,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
MachineMemOperand::MOStore,
VT.getSizeInBits() / 8, 4);
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
- Op->getVTList(), Ops,
- sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
+ Op->getVTList(), Ops, VT, MMO);
}
default:
break;
@@ -650,7 +743,7 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
if (I->getOpcode() == Opcode)
return *I;
}
- return 0;
+ return nullptr;
}
/// This transforms the control flow intrinsics to get the branch destination as
@@ -662,7 +755,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SDNode *Intr = BRCOND.getOperand(1).getNode();
SDValue Target = BRCOND.getOperand(2);
- SDNode *BR = 0;
+ SDNode *BR = nullptr;
if (Intr->getOpcode() == ISD::SETCC) {
// As long as we negate the condition everything is fine
@@ -695,7 +788,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
// build the new intrinsic call
SDNode *Result = DAG.getNode(
Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
- DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();
+ DAG.getVTList(Res), Ops).getNode();
if (BR) {
// Give the branch instruction our target
@@ -703,7 +796,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
BR->getOperand(0),
BRCOND.getOperand(2)
};
- DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
+ DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops);
}
SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
@@ -739,7 +832,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
MergedValues[1] = Load->getChain();
if (Ret.getNode()) {
MergedValues[0] = Ret;
- return DAG.getMergeValues(MergedValues, 2, DL);
+ return DAG.getMergeValues(MergedValues, DL);
}
if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
@@ -770,30 +863,16 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
}
MergedValues[0] = Ret;
- return DAG.getMergeValues(MergedValues, 2, DL);
+ return DAG.getMergeValues(MergedValues, DL);
}
-SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op,
- SelectionDAG &DAG) const {
-
- if (Op.getValueType() == MVT::i128) {
- return Op;
- }
-
- assert(Op.getOpcode() == ISD::UNDEF);
-
- return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128,
- DAG.getConstant(0, MVT::i64),
- DAG.getConstant(0, MVT::i64));
-}
-
SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
const SDValue &Op,
SelectionDAG &DAG) const {
return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
Op.getOperand(2),
- ResourceDescriptorToi128(Op.getOperand(3), DAG),
+ Op.getOperand(3),
Op.getOperand(4));
}
@@ -833,12 +912,6 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
- // Possible Min/Max pattern
- SDValue MinMax = LowerMinMax(Op, DAG);
- if (MinMax.getNode()) {
- return MinMax;
- }
-
SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
}
@@ -948,8 +1021,12 @@ SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
return SDValue();
}
- return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0),
- DAG.getConstant(0, MVT::i32));
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueType() != MVT::i32)
+ Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
+
+ SDValue Zero = DAG.getConstant(0, MVT::i32);
+ return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero);
}
//===----------------------------------------------------------------------===//
@@ -963,7 +1040,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
EVT VT = N->getValueType(0);
switch (N->getOpcode()) {
- default: break;
+ default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
case ISD::SELECT_CC: {
ConstantSDNode *True, *False;
// i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
@@ -982,7 +1059,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
SDValue Arg0 = N->getOperand(0);
SDValue Arg1 = N->getOperand(1);
SDValue CC = N->getOperand(2);
- ConstantSDNode * C = NULL;
+ ConstantSDNode * C = nullptr;
ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();
// i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
@@ -998,7 +1075,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
break;
}
}
- return SDValue();
+
+ return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
/// \brief Test if RegClass is one of the VSrc classes
@@ -1029,9 +1107,11 @@ int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
return -1;
}
Imm.I = Node->getSExtValue();
- } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
+ } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
+ if (N->getValueType(0) != MVT::f32)
+ return -1;
Imm.F = Node->getValueAPF().convertToFloat();
- else
+ } else
return -1; // It isn't an immediate
if ((Imm.I >= -16 && Imm.I <= 64) ||
@@ -1051,7 +1131,7 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
const SIInstrInfo *TII =
static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
- if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
+ if (!Mov || !TII->isMov(Mov->getMachineOpcode()))
return false;
const SDValue &Op = Mov->getOperand(0);
@@ -1098,7 +1178,7 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode(
}
return TRI.getPhysRegClass(Reg);
}
- default: return NULL;
+ default: return nullptr;
}
}
const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
@@ -1202,17 +1282,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
// Commuted opcode if available
int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
- const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev);
+ const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev);
assert(!DescRev || DescRev->getNumDefs() == NumDefs);
assert(!DescRev || DescRev->getNumOperands() == NumOps);
// e64 version if available, -1 otherwise
int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
- const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64);
+ const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64);
+ int InputModifiers[3] = {0};
assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
- assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));
int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
bool HaveVSrc = false, HaveSSrc = false;
@@ -1279,17 +1359,18 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
fitsRegClass(DAG, Ops[1], OtherRegClass))) {
// Swap commutable operands
- SDValue Tmp = Ops[1];
- Ops[1] = Ops[0];
- Ops[0] = Tmp;
+ std::swap(Ops[0], Ops[1]);
Desc = DescRev;
- DescRev = 0;
+ DescRev = nullptr;
continue;
}
}
- if (DescE64 && !Immediate) {
+ if (Immediate)
+ continue;
+
+ if (DescE64) {
// Test if it makes sense to switch to e64 encoding
unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
@@ -1305,14 +1386,46 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
Immediate = -1;
Promote2e64 = true;
Desc = DescE64;
- DescE64 = 0;
+ DescE64 = nullptr;
}
}
+
+ if (!DescE64 && !Promote2e64)
+ continue;
+ if (!Operand.isMachineOpcode())
+ continue;
+ if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) {
+ Ops.pop_back();
+ Ops.push_back(Operand.getOperand(0));
+ InputModifiers[i] = 1;
+ Promote2e64 = true;
+ if (!DescE64)
+ continue;
+ Desc = DescE64;
+ DescE64 = 0;
+ }
+ else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) {
+ Ops.pop_back();
+ Ops.push_back(Operand.getOperand(0));
+ InputModifiers[i] = 2;
+ Promote2e64 = true;
+ if (!DescE64)
+ continue;
+ Desc = DescE64;
+ DescE64 = 0;
+ }
}
if (Promote2e64) {
+ std::vector<SDValue> OldOps(Ops);
+ Ops.clear();
+ for (unsigned i = 0; i < OldOps.size(); ++i) {
+ // src_modifier
+ Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32));
+ Ops.push_back(OldOps[i]);
+ }
// Add the modifier flags while promoting
- for (unsigned i = 0; i < 4; ++i)
+ for (unsigned i = 0; i < 2; ++i)
Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
}
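After Promote2e64, every source operand is preceded by its modifier immediate (1 = neg, 2 = abs, 0 = none, matching the InputModifiers values set above), and two trailing zero immediates are appended; going by the FCLAMP_SI expansion earlier in this patch, those trailing slots correspond to the clamp and omod fields. A simplified sketch of the resulting operand order (standalone, SDValues reduced to ints):

#include <cstddef>
#include <vector>

// Interleave per-source modifiers with sources, then append the two
// trailing slots, mirroring the Promote2e64 rebuild above.
std::vector<int> buildE64Operands(const std::vector<int> &Srcs,
                                  const std::vector<int> &Mods) {
  std::vector<int> Ops;
  for (std::size_t i = 0; i < Srcs.size(); ++i) {  // Mods must cover every source
    Ops.push_back(Mods[i]);                        // src_modifier
    Ops.push_back(Srcs[i]);                        // source value
  }
  Ops.push_back(0);                                // trailing slot (clamp?)
  Ops.push_back(0);                                // trailing slot (omod?)
  return Ops;
}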
@@ -1390,7 +1503,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
Ops.push_back(Node->getOperand(i));
- Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
+ Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
// If we only got one lane, replace it with a copy
// (if NewDmask has only one bit set...)
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index ca73f53..c6eaa81 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -22,7 +22,7 @@ namespace llvm {
class SITargetLowering : public AMDGPUTargetLowering {
SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL,
- SDValue Chain, unsigned Offset) const;
+ SDValue Chain, unsigned Offset, bool Signed) const;
SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
@@ -33,7 +33,6 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
- SDValue ResourceDescriptorToi128(SDValue Op, SelectionDAG &DAG) const;
bool foldImm(SDValue &Operand, int32_t &Immediate,
bool &ScalarSlotUsed) const;
const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG,
@@ -49,32 +48,33 @@ class SITargetLowering : public AMDGPUTargetLowering {
public:
SITargetLowering(TargetMachine &tm);
- bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, bool *IsFast) const;
- virtual bool shouldSplitVectorType(EVT VT) const override;
+ bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
+ bool *IsFast) const override;
+ bool shouldSplitVectorType(EVT VT) const override;
- virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
- Type *Ty) const override;
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
+ SmallVectorImpl<SDValue> &InVals) const override;
- virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
- MachineBasicBlock * BB) const;
- virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
- virtual MVT getScalarShiftAmountTy(EVT VT) const;
- virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
- virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
- virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
- virtual void AdjustInstrPostInstrSelection(MachineInstr *MI,
- SDNode *Node) const;
+ MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
+ MachineBasicBlock * BB) const override;
+ EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+ MVT getScalarShiftAmountTy(EVT VT) const override;
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+ void AdjustInstrPostInstrSelection(MachineInstr *MI,
+ SDNode *Node) const override;
int32_t analyzeImmediate(const SDNode *N) const;
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const;
+ unsigned Reg, EVT VT) const override;
};
} // End namespace llvm
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 695ec40..a17fed7 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -97,13 +97,13 @@ private:
public:
SIInsertWaits(TargetMachine &tm) :
MachineFunctionPass(ID),
- TII(0),
- TRI(0),
+ TII(nullptr),
+ TRI(nullptr),
ExpInstrTypesSeen(0) { }
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const {
+ const char *getPassName() const override {
return "SI insert wait instructions";
}
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index aa2c22c..168eff2 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -12,7 +12,7 @@
//===----------------------------------------------------------------------===//
class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
- AMDGPUInst<outs, ins, asm, pattern> {
+ AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
field bits<1> VM_CNT = 0;
field bits<1> EXP_CNT = 0;
@@ -210,16 +210,19 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
Enc64 <outs, ins, asm, pattern> {
bits<8> dst;
+ bits<2> src0_modifiers;
bits<9> src0;
+ bits<2> src1_modifiers;
bits<9> src1;
+ bits<2> src2_modifiers;
bits<9> src2;
- bits<3> abs;
bits<1> clamp;
bits<2> omod;
- bits<3> neg;
let Inst{7-0} = dst;
- let Inst{10-8} = abs;
+ let Inst{8} = src0_modifiers{1};
+ let Inst{9} = src1_modifiers{1};
+ let Inst{10} = src2_modifiers{1};
let Inst{11} = clamp;
let Inst{25-17} = op;
let Inst{31-26} = 0x34; //encoding
@@ -227,7 +230,9 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{49-41} = src1;
let Inst{58-50} = src2;
let Inst{60-59} = omod;
- let Inst{63-61} = neg;
+ let Inst{61} = src0_modifiers{0};
+ let Inst{62} = src1_modifiers{0};
+ let Inst{63} = src2_modifiers{0};
let mayLoad = 0;
let mayStore = 0;
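Putting the two hunks above together, each source's 2-bit modifier field splits across the encoding: bit 1 lands where the shared abs bits used to be (Inst{10-8}) and bit 0 where the neg bits were (Inst{63-61}); the foldOperands change in SIISelLowering.cpp earlier in this diff uses the same convention (1 for FNEG_SI, 2 for FABS_SI). A small helper, not part of the patch, that packs such a field:

#include <cstdint>

// Pack a per-source VOP3 modifier: bit 0 = neg, bit 1 = abs.
constexpr uint32_t makeSrcModifier(bool Neg, bool Abs) {
  return (Neg ? 1u : 0u) | (Abs ? 2u : 0u);
}

static_assert(makeSrcModifier(true, false) == 1, "neg only");
static_assert(makeSrcModifier(false, true) == 2, "abs only");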
@@ -240,12 +245,14 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
Enc64 <outs, ins, asm, pattern> {
bits<8> dst;
+ bits<2> src0_modifiers;
bits<9> src0;
+ bits<2> src1_modifiers;
bits<9> src1;
+ bits<2> src2_modifiers;
bits<9> src2;
bits<7> sdst;
bits<2> omod;
- bits<3> neg;
let Inst{7-0} = dst;
let Inst{14-8} = sdst;
@@ -255,7 +262,9 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{49-41} = src1;
let Inst{58-50} = src2;
let Inst{60-59} = omod;
- let Inst{63-61} = neg;
+ let Inst{61} = src0_modifiers{0};
+ let Inst{62} = src1_modifiers{0};
+ let Inst{63} = src2_modifiers{0};
let mayLoad = 0;
let mayStore = 0;
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index ab2fe09..4a9e346 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -187,27 +187,45 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned KillFlag = isKill ? RegState::Kill : 0;
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
- unsigned Lane = MFI->SpillTracker.getNextLane(MRI);
- BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32),
- MFI->SpillTracker.LaneVGPR)
+ unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent());
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), MFI->SpillTracker.LaneVGPR)
.addReg(SrcReg, KillFlag)
.addImm(Lane);
+ MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane);
+ } else if (RI.isSGPRClass(RC)) {
+ // We are only allowed to create one new instruction when spilling
+ // registers, so we need to use a pseudo instruction for vector
+ // registers.
+ //
+ // Reserve a spot in the spill tracker for each sub-register of
+ // the vector register.
+ unsigned NumSubRegs = RC->getSize() / 4;
+ unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent(),
+ NumSubRegs);
MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR,
- Lane);
- } else {
- for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) {
- unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(MBB, MI, MBB.findDebugLoc(MI), get(AMDGPU::COPY), SubReg)
- .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
- storeRegToStackSlot(MBB, MI, SubReg, isKill, FrameIndex + i,
- &AMDGPU::SReg_32RegClass, TRI);
+ FirstLane);
+
+ unsigned Opcode;
+ switch (RC->getSize() * 8) {
+ case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break;
+ case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
+ case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
+ case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
+ default: llvm_unreachable("Cannot spill register class");
}
+
+ BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR)
+ .addReg(SrcReg)
+ .addImm(FrameIndex);
+ } else {
+ llvm_unreachable("VGPR spilling not supported");
}
}
@@ -216,30 +234,125 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
DebugLoc DL = MBB.findDebugLoc(MI);
if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) {
- SIMachineFunctionInfo::SpilledReg Spill =
+ SIMachineFunctionInfo::SpilledReg Spill =
MFI->SpillTracker.getSpilledReg(FrameIndex);
assert(Spill.VGPR);
BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg)
.addReg(Spill.VGPR)
.addImm(Spill.Lane);
+ insertNOPs(MI, 3);
+ } else if (RI.isSGPRClass(RC)){
+ unsigned Opcode;
+ switch(RC->getSize() * 8) {
+ case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break;
+ case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
+ case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
+ case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
+ default: llvm_unreachable("Cannot spill register class");
+ }
+
+ SIMachineFunctionInfo::SpilledReg Spill =
+ MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+ BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+ .addReg(Spill.VGPR)
+ .addImm(FrameIndex);
+ insertNOPs(MI, 3);
} else {
- for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) {
- unsigned Flags = RegState::Define;
- if (i == 0) {
- Flags |= RegState::Undef;
- }
- unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- loadRegFromStackSlot(MBB, MI, SubReg, FrameIndex + i,
- &AMDGPU::SReg_32RegClass, TRI);
- BuildMI(MBB, MI, DL, get(AMDGPU::COPY))
- .addReg(DestReg, Flags, RI.getSubRegFromChannel(i))
- .addReg(SubReg);
+ llvm_unreachable("VGPR spilling not supported");
+ }
+}
+
+static unsigned getNumSubRegsForSpillOp(unsigned Op) {
+
+ switch (Op) {
+ case AMDGPU::SI_SPILL_S512_SAVE:
+ case AMDGPU::SI_SPILL_S512_RESTORE:
+ return 16;
+ case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S256_RESTORE:
+ return 8;
+ case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S128_RESTORE:
+ return 4;
+ case AMDGPU::SI_SPILL_S64_SAVE:
+ case AMDGPU::SI_SPILL_S64_RESTORE:
+ return 2;
+ default: llvm_unreachable("Invalid spill opcode");
+ }
+}
+
+void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
+ int Count) const {
+ while (Count > 0) {
+ int Arg;
+ if (Count >= 8)
+ Arg = 7;
+ else
+ Arg = Count - 1;
+ Count -= 8;
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
+ .addImm(Arg);
+ }
+}
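S_NOP's immediate appears to cover the slot count minus one, so the loop above emits at most eight NOP slots per instruction and carries the remainder over. A standalone sketch of the same splitting, under that "immediate plus one" reading (names are mine):

#include <cstdio>
#include <vector>

// Split a NOP count into S_NOP immediates the way insertNOPs does:
// each S_NOP covers up to 8 slots, encoded as the slot count minus one.
std::vector<int> splitNopCount(int Count) {
  std::vector<int> Imms;
  while (Count > 0) {
    Imms.push_back(Count >= 8 ? 7 : Count - 1);
    Count -= 8;
  }
  return Imms;
}

int main() {
  for (int Imm : splitNopCount(10))   // prints "S_NOP 7" then "S_NOP 1"
    std::printf("S_NOP %d\n", Imm);
  return 0;
}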
+
+bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+ SIMachineFunctionInfo *MFI =
+ MI->getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ switch (MI->getOpcode()) {
+ default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+
+ // SGPR register spill
+ case AMDGPU::SI_SPILL_S512_SAVE:
+ case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S64_SAVE: {
+ unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ unsigned FrameIndex = MI->getOperand(2).getImm();
+
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ SIMachineFunctionInfo::SpilledReg Spill;
+ unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(),
+ &AMDGPU::SGPR_32RegClass, i);
+ Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32),
+ MI->getOperand(0).getReg())
+ .addReg(SubReg)
+ .addImm(Spill.Lane + i);
+ }
+ MI->eraseFromParent();
+ break;
+ }
+
+ // SGPR register restore
+ case AMDGPU::SI_SPILL_S512_RESTORE:
+ case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S64_RESTORE: {
+ unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ SIMachineFunctionInfo::SpilledReg Spill;
+ unsigned FrameIndex = MI->getOperand(2).getImm();
+ unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(),
+ &AMDGPU::SGPR_32RegClass, i);
+ Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg)
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(Spill.Lane + i);
}
+ MI->eraseFromParent();
+ break;
}
+ }
+ return true;
}
MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
@@ -247,18 +360,18 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg())
- return 0;
+ return nullptr;
// Cannot commute VOP2 if src0 is SGPR.
if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() &&
RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg())))
- return 0;
+ return nullptr;
if (!MI->getOperand(2).isReg()) {
// XXX: Commute instructions with FPImm operands
if (NewMI || MI->getOperand(2).isFPImm() ||
(!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
- return 0;
+ return nullptr;
}
// XXX: Commute VOP3 instructions with abs and neg set.
@@ -267,7 +380,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
AMDGPU::OpName::abs)).getImm() ||
MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::neg)).getImm()))
- return 0;
+ return nullptr;
unsigned Reg = MI->getOperand(1).getReg();
unsigned SubReg = MI->getOperand(1).getSubReg();
@@ -516,6 +629,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
case AMDGPU::COPY: return AMDGPU::COPY;
case AMDGPU::PHI: return AMDGPU::PHI;
+ case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::S_MOV_B32:
return MI.getOperand(1).isReg() ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
@@ -536,6 +650,23 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
+ case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
+ case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
+ case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
+ case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
+ case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
+ case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
+ case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
+ case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
+ case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
+ case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
+ case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
+ case AMDGPU::S_LOAD_DWORD_IMM:
+ case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
+ case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
+ case AMDGPU::S_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
}
}
@@ -559,6 +690,8 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::REG_SEQUENCE:
+ case AMDGPU::PHI:
+ case AMDGPU::INSERT_SUBREG:
return RI.hasVGPRs(getOpRegClass(MI, 0));
default:
return RI.hasVGPRs(getOpRegClass(MI, OpNo));
@@ -737,11 +870,12 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
}
}
- // Legalize REG_SEQUENCE
+ // Legalize REG_SEQUENCE and PHI
// The register class of the operands must be the same type as the register
// class of the output.
- if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
- const TargetRegisterClass *RC = NULL, *SRC = NULL, *VRC = NULL;
+ if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
+ MI->getOpcode() == AMDGPU::PHI) {
+ const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
if (!MI->getOperand(i).isReg() ||
!TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
@@ -774,13 +908,40 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
!TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
continue;
unsigned DstReg = MRI.createVirtualRegister(RC);
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ MachineBasicBlock *InsertBB;
+ MachineBasicBlock::iterator Insert;
+ if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+ InsertBB = MI->getParent();
+ Insert = MI;
+ } else {
+ // MI is a PHI instruction.
+ InsertBB = MI->getOperand(i + 1).getMBB();
+ Insert = InsertBB->getFirstTerminator();
+ }
+ BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
get(AMDGPU::COPY), DstReg)
.addOperand(MI->getOperand(i));
MI->getOperand(i).setReg(DstReg);
}
}
+ // Legalize INSERT_SUBREG
+ // src0 must have the same register class as dst
+ if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
+ unsigned Dst = MI->getOperand(0).getReg();
+ unsigned Src0 = MI->getOperand(1).getReg();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
+ if (DstRC != Src0RC) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
+ .addReg(Src0);
+ MI->getOperand(1).setReg(NewSrc0);
+ }
+ return;
+ }
+
// Legalize MUBUF* instructions
// FIXME: If we start using the non-addr64 instructions for compute, we
// may need to legalize them here.
@@ -886,6 +1047,72 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
}
}
+void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ switch (MI->getOpcode()) {
+ case AMDGPU::S_LOAD_DWORD_IMM:
+ case AMDGPU::S_LOAD_DWORD_SGPR:
+ case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX2_SGPR:
+ case AMDGPU::S_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_LOAD_DWORDX4_SGPR:
+ unsigned NewOpcode = getVALUOp(*MI);
+ unsigned RegOffset;
+ unsigned ImmOffset;
+
+ if (MI->getOperand(2).isReg()) {
+ RegOffset = MI->getOperand(2).getReg();
+ ImmOffset = 0;
+ } else {
+ assert(MI->getOperand(2).isImm());
+ // SMRD instructions take a dword offset and MUBUF instructions
+ // take a byte offset.
+ ImmOffset = MI->getOperand(2).getImm() << 2;
+ RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ if (isUInt<12>(ImmOffset)) {
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+ RegOffset)
+ .addImm(0);
+ } else {
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+ RegOffset)
+ .addImm(ImmOffset);
+ ImmOffset = 0;
+ }
+ }
+
+ unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+ unsigned DWord0 = RegOffset;
+ unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
+ .addImm(0);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
+ .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
+ .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
+ .addReg(DWord0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DWord1)
+ .addImm(AMDGPU::sub1)
+ .addReg(DWord2)
+ .addImm(AMDGPU::sub2)
+ .addReg(DWord3)
+ .addImm(AMDGPU::sub3);
+ MI->setDesc(get(NewOpcode));
+ if (MI->getOperand(2).isReg()) {
+ MI->getOperand(2).setReg(MI->getOperand(1).getReg());
+ } else {
+ MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false);
+ }
+ MI->getOperand(1).setReg(SRsrc);
+ MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
+ }
+}
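SMRD immediates are in dwords while the MUBUF replacements take byte offsets, hence the << 2 above; when the scaled offset no longer fits MUBUF's 12-bit unsigned immediate it is moved into the offset register and the immediate drops to zero. A standalone sketch of that decision (the struct and names are mine, not the patch's):

#include <cstdint>

struct MubufOffset {
  uint32_t Imm;        // value for the immediate field
  bool InRegister;     // true if the offset must ride in the SGPR instead
};

// Scale an SMRD dword offset to MUBUF bytes, mirroring the isUInt<12> check.
MubufOffset scaleSmrdOffset(uint32_t DwordOffset) {
  uint32_t ByteOffset = DwordOffset << 2;
  if (ByteOffset < (1u << 12))
    return {ByteOffset, false};   // fits: register holds 0, imm holds offset
  return {0, true};               // too big: register holds offset, imm is 0
}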
+
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
SmallVector<MachineInstr *, 128> Worklist;
Worklist.push_back(&TopInst);
@@ -895,8 +1122,16 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
MachineBasicBlock *MBB = Inst->getParent();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Opcode = Inst->getOpcode();
+ unsigned NewOpcode = getVALUOp(*Inst);
+
// Handle some special cases
- switch(Inst->getOpcode()) {
+ switch (Opcode) {
+ default:
+ if (isSMRD(Inst->getOpcode())) {
+ moveSMRDToVALU(Inst, MRI);
+ }
+ break;
case AMDGPU::S_MOV_B64: {
DebugLoc DL = Inst->getDebugLoc();
@@ -947,7 +1182,6 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
llvm_unreachable("Moving this op to VALU not implemented");
}
- unsigned NewOpcode = getVALUOp(*Inst);
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
// legalize its operands instead.
@@ -968,27 +1202,52 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
Inst->RemoveOperand(i);
}
- // Add the implict and explicit register definitions.
- if (NewDesc.ImplicitUses) {
- for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) {
- unsigned Reg = NewDesc.ImplicitUses[i];
- Inst->addOperand(MachineOperand::CreateReg(Reg, false, true));
- }
+ if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+ // We are converting these to a BFE, so we need to add the missing
+ // operands for the size and offset.
+ unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
+ Inst->addOperand(Inst->getOperand(1));
+ Inst->getOperand(1).ChangeToImmediate(0);
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(Size));
+
+ // XXX - Other pointless operands. There are 4, but it seems you only need
+ // 3 to not hit an assertion later in MCInstLower.
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(0));
}
- if (NewDesc.ImplicitDefs) {
- for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) {
- unsigned Reg = NewDesc.ImplicitDefs[i];
- Inst->addOperand(MachineOperand::CreateReg(Reg, true, true));
- }
+ addDescImplicitUseDef(NewDesc, Inst);
+
+ if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+ const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
+ // If we need to move this to VGPRs, we need to unpack the second operand
+ // back into the 2 separate ones for bit offset and width.
+ assert(OffsetWidthOp.isImm() &&
+ "Scalar BFE is only implemented for constant width and offset");
+ uint32_t Imm = OffsetWidthOp.getImm();
+
+ uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+ uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+ Inst->RemoveOperand(2); // Remove old immediate.
+ Inst->addOperand(Inst->getOperand(1));
+ Inst->getOperand(1).ChangeToImmediate(0);
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(Offset));
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(BitWidth));
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(0));
}
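The scalar BFE immediate packs the bit offset into bits [5:0] and the field width into bits [22:16]; the block above simply unpacks those so the VALU form receives them as separate operands. A quick standalone check of that unpacking (names are mine):

#include <cassert>
#include <cstdint>

struct BfeField { uint32_t Offset; uint32_t Width; };

// Unpack an S_BFE-style immediate: offset in [5:0], width in [22:16].
BfeField unpackBfeImm(uint32_t Imm) {
  return { Imm & 0x3f, (Imm & 0x7f0000) >> 16 };
}

int main() {
  BfeField F = unpackBfeImm(0x100008);      // offset 8, width 16
  assert(F.Offset == 8 && F.Width == 16);
  return 0;
}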
- legalizeOperands(Inst);
-
// Update the destination register class.
+
const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
- switch (Inst->getOpcode()) {
+ switch (Opcode) {
// For target instructions, getOpRegClass just returns the virtual
// register class associated with the operand, so we need to find an
// equivalent VGPR register class in order to move the instruction to the
@@ -996,6 +1255,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
case AMDGPU::COPY:
case AMDGPU::PHI:
case AMDGPU::REG_SEQUENCE:
+ case AMDGPU::INSERT_SUBREG:
if (RI.hasVGPRs(NewDstRC))
continue;
NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
@@ -1010,6 +1270,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
+ // Legalize the operands
+ legalizeOperands(Inst);
+
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
E = MRI.use_end(); I != E; ++I) {
MachineInstr &UseMI = *I->getParent();
@@ -1097,6 +1360,24 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
Worklist.push_back(HiHalf);
}
+void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
+ MachineInstr *Inst) const {
+ // Add the implicit and explicit register definitions.
+ if (NewDesc.ImplicitUses) {
+ for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) {
+ unsigned Reg = NewDesc.ImplicitUses[i];
+ Inst->addOperand(MachineOperand::CreateReg(Reg, false, true));
+ }
+ }
+
+ if (NewDesc.ImplicitDefs) {
+ for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) {
+ unsigned Reg = NewDesc.ImplicitDefs[i];
+ Inst->addOperand(MachineOperand::CreateReg(Reg, true, true));
+ }
+ }
+}
+
MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index c537038..7b31a81 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -47,49 +47,52 @@ private:
void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist,
MachineInstr *Inst, unsigned Opcode) const;
+ void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
public:
explicit SIInstrInfo(AMDGPUTargetMachine &tm);
- const SIRegisterInfo &getRegisterInfo() const {
+ const SIRegisterInfo &getRegisterInfo() const override {
return RI;
}
- virtual void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const;
+ void copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
+
+ virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
unsigned commuteOpcode(unsigned Opcode) const;
- virtual MachineInstr *commuteInstruction(MachineInstr *MI,
- bool NewMI=false) const;
+ MachineInstr *commuteInstruction(MachineInstr *MI,
+ bool NewMI=false) const override;
bool isTriviallyReMaterializable(const MachineInstr *MI,
- AliasAnalysis *AA = 0) const;
+ AliasAnalysis *AA = nullptr) const;
- virtual unsigned getIEQOpcode() const {
+ unsigned getIEQOpcode() const override {
llvm_unreachable("Unimplemented");
}
MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
- unsigned DstReg, unsigned SrcReg) const;
- virtual bool isMov(unsigned Opcode) const;
+ unsigned DstReg, unsigned SrcReg) const override;
+ bool isMov(unsigned Opcode) const override;
- virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
bool isDS(uint16_t Opcode) const;
int isMIMG(uint16_t Opcode) const;
int isSMRD(uint16_t Opcode) const;
@@ -101,8 +104,8 @@ public:
bool isInlineConstant(const MachineOperand &MO) const;
bool isLiteralConstant(const MachineOperand &MO) const;
- virtual bool verifyInstruction(const MachineInstr *MI,
- StringRef &ErrInfo) const;
+ bool verifyInstruction(const MachineInstr *MI,
+ StringRef &ErrInfo) const override;
bool isSALUInstr(const MachineInstr &MI) const;
static unsigned getVALUOp(const MachineInstr &MI);
@@ -136,32 +139,36 @@ public:
 /// create new instructions and insert them before \p MI.
void legalizeOperands(MachineInstr *MI) const;
+ void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
+
/// \brief Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
/// VALU if necessary.
void moveToVALU(MachineInstr &MI) const;
- virtual unsigned calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const;
+ unsigned calculateIndirectAddress(unsigned RegIndex,
+ unsigned Channel) const override;
- virtual const TargetRegisterClass *getIndirectAddrRegClass() const;
+ const TargetRegisterClass *getIndirectAddrRegClass() const override;
- virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg,
- unsigned Address,
- unsigned OffsetReg) const;
+ MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg,
+ unsigned Address,
+ unsigned OffsetReg) const override;
- virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- unsigned ValueReg,
- unsigned Address,
- unsigned OffsetReg) const;
+ MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg,
+ unsigned Address,
+ unsigned OffsetReg) const override;
void reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const;
void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
unsigned SavReg, unsigned IndexReg) const;
+
+ void insertNOPs(MachineBasicBlock::iterator MI, int Count) const;
};
namespace AMDGPU {
@@ -169,6 +176,7 @@ namespace AMDGPU {
int getVOPe64(uint16_t Opcode);
int getCommuteRev(uint16_t Opcode);
int getCommuteOrig(uint16_t Opcode);
+ int getMCOpcode(uint16_t Opcode, unsigned Gen);
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index e05ab65..2242e6d 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -7,23 +7,25 @@
//
//===----------------------------------------------------------------------===//
+// Except for the NONE field, this must be kept in sync with the SISubtarget enum
+// in AMDGPUMCInstLower.h
+def SISubtarget {
+ int NONE = -1;
+ int SI = 0;
+}
+
//===----------------------------------------------------------------------===//
// SI DAG Nodes
//===----------------------------------------------------------------------===//
-// SMRD takes a 64bit memory address and can only add an 32bit offset
-def SIadd64bit32bit : SDNode<"ISD::ADD",
- SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]>
->;
-
def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
- SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, i128>, SDTCisVT<2, i32>]>,
+ SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
[SDNPMayLoad, SDNPMemOperand]
>;
def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
SDTypeProfile<0, 13,
- [SDTCisVT<0, i128>, // rsrc(SGPR)
+ [SDTCisVT<0, v4i32>, // rsrc(SGPR)
SDTCisVT<1, iAny>, // vdata(VGPR)
SDTCisVT<2, i32>, // num_channels(imm)
SDTCisVT<3, i32>, // vaddr(VGPR)
@@ -41,13 +43,13 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
>;
def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
- SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, i128>, SDTCisVT<2, i16>,
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
SDTCisVT<3, i32>]>
>;
class SDSample<string opcode> : SDNode <opcode,
SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>,
- SDTCisVT<3, i128>, SDTCisVT<4, i32>]>
+ SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
>;
def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
@@ -111,14 +113,17 @@ def IMM16bit : PatLeaf <(imm),
[{return isUInt<16>(N->getZExtValue());}]
>;
+def IMM32bit : PatLeaf <(imm),
+ [{return isUInt<32>(N->getZExtValue());}]
+>;
+
def mubuf_vaddr_offset : PatFrag<
(ops node:$ptr, node:$offset, node:$imm_offset),
(add (add node:$ptr, node:$offset), node:$imm_offset)
>;
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
- return
- (*(const SITargetLowering *)getTargetLowering()).analyzeImmediate(N) == 0;
+ return isInlineImmediate(N);
}]>;
class SGPRImm <dag frag> : PatLeaf<frag, [{
@@ -138,7 +143,7 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{
}]>;
def FRAMEri32 : Operand<iPTR> {
- let MIOperandInfo = (ops SReg_32:$ptr, i32imm:$index);
+ let MIOperandInfo = (ops i32:$ptr, i32imm:$index);
}
//===----------------------------------------------------------------------===//
@@ -197,15 +202,17 @@ class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
opName#" $dst, $src0, $src1", pattern
>;
-class SOPC_32 <bits<7> op, string opName, list<dag> pattern> : SOPC <
- op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
- opName#" $dst, $src0, $src1", pattern
->;
-class SOPC_64 <bits<7> op, string opName, list<dag> pattern> : SOPC <
- op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
- opName#" $dst, $src0, $src1", pattern
->;
+class SOPC_Helper <bits<7> op, RegisterClass rc, ValueType vt,
+ string opName, PatLeaf cond> : SOPC <
+ op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1),
+ opName#" $dst, $src0, $src1", []>;
+
+class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
+ : SOPC_Helper<op, SSrc_32, i32, opName, cond>;
+
+class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL>
+ : SOPC_Helper<op, SSrc_64, i64, opName, cond>;
class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK <
op, (outs SReg_32:$dst), (ins i16imm:$src0),
@@ -221,7 +228,7 @@ multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass,
RegisterClass dstClass> {
def _IMM : SMRD <
op, 1, (outs dstClass:$dst),
- (ins baseClass:$sbase, i32imm:$offset),
+ (ins baseClass:$sbase, u32imm:$offset),
asm#" $dst, $sbase, $offset", []
>;
@@ -245,6 +252,28 @@ class VOP2_REV <string revOp, bit isOrig> {
bit IsOrig = isOrig;
}
+class SIMCInstr <string pseudo, int subtarget> {
+ string PseudoInstr = pseudo;
+ int Subtarget = subtarget;
+}
+
+multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern,
+ string opName> {
+
+ def "" : InstSI <outs, ins, "", pattern>, VOP <opName>,
+ SIMCInstr<OpName, SISubtarget.NONE> {
+ let isPseudo = 1;
+ }
+
+ def _si : VOP3 <op, outs, ins, asm, []>, SIMCInstr<opName, SISubtarget.SI>;
+
+}
+
+// This must always be placed right before the operand it modifies.
+def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> {
+ let PrintMethod = "printOperandAndMods";
+}
+
multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src,
string opName, list<dag> pattern> {
@@ -256,10 +285,8 @@ multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src,
def _e64 : VOP3 <
{1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
(outs drc:$dst),
- (ins src:$src0,
- i32imm:$abs, i32imm:$clamp,
- i32imm:$omod, i32imm:$neg),
- opName#"_e64 $dst, $src0, $abs, $clamp, $omod, $neg", []
+ (ins InputMods:$src0_modifiers, src:$src0, i32imm:$clamp, i32imm:$omod),
+ opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", []
>, VOP <opName> {
let src1 = SIOperand.ZERO;
let src2 = SIOperand.ZERO;
@@ -288,10 +315,10 @@ multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
def _e64 : VOP3 <
{1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
(outs vrc:$dst),
- (ins arc:$src0, arc:$src1,
- i32imm:$abs, i32imm:$clamp,
- i32imm:$omod, i32imm:$neg),
- opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
+ (ins InputMods:$src0_modifiers, arc:$src0,
+ InputMods:$src1_modifiers, arc:$src1,
+ i32imm:$clamp, i32imm:$omod),
+ opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", []
>, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
let src2 = SIOperand.ZERO;
}
@@ -316,10 +343,10 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
def _e64 : VOP3b <
{1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
(outs VReg_32:$dst),
- (ins VSrc_32:$src0, VSrc_32:$src1,
- i32imm:$abs, i32imm:$clamp,
- i32imm:$omod, i32imm:$neg),
- opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
+ (ins InputMods: $src0_modifiers, VSrc_32:$src0,
+ InputMods:$src1_modifiers, VSrc_32:$src1,
+ i32imm:$clamp, i32imm:$omod),
+ opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", []
>, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
let src2 = SIOperand.ZERO;
/* the VOP2 variant puts the carry out into VCC, the VOP3 variant
@@ -340,15 +367,16 @@ multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
def _e64 : VOP3 <
{0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
(outs SReg_64:$dst),
- (ins arc:$src0, arc:$src1,
- InstFlag:$abs, InstFlag:$clamp,
- InstFlag:$omod, InstFlag:$neg),
- opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg",
+ (ins InputMods:$src0_modifiers, arc:$src0,
+ InputMods:$src1_modifiers, arc:$src1,
+ InstFlag:$clamp, InstFlag:$omod),
+ opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod",
!if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>,
[(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
)
>, VOP <opName> {
let src2 = SIOperand.ZERO;
+ let src2_modifiers = 0;
}
}
@@ -360,12 +388,13 @@ multiclass VOPC_64 <bits<8> op, string opName,
ValueType vt = untyped, PatLeaf cond = COND_NULL>
: VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>;
-class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
+multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m <
op, (outs VReg_32:$dst),
- (ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2,
- InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
- opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
->, VOP <opName>;
+ (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers,
+ VSrc_32:$src1, InputMods:$src2_modifiers, VSrc_32:$src2,
+ InstFlag:$clamp, InstFlag:$omod),
+ opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName
+>;
class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
op, (outs VReg_64:$dst),
@@ -374,10 +403,9 @@ class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
>, VOP <opName> {
let src2 = SIOperand.ZERO;
- let abs = 0;
+ let src0_modifiers = 0;
let clamp = 0;
let omod = 0;
- let neg = 0;
}
class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
@@ -403,7 +431,7 @@ class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> :
class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
op,
(outs regClass:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, i16imm:$offset),
+ (ins i1imm:$gds, VReg_32:$addr, u16imm:$offset),
asm#" $vdst, $addr, $offset, [M0]",
[]> {
let data0 = 0;
@@ -415,7 +443,7 @@ class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
op,
(outs regClass:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, i8imm:$offset0, i8imm:$offset1),
+ (ins i1imm:$gds, VReg_32:$addr, u8imm:$offset0, u8imm:$offset1),
asm#" $gds, $vdst, $addr, $offset0, $offset1, [M0]",
[]> {
let data0 = 0;
@@ -427,7 +455,7 @@ class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
op,
(outs),
- (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i16imm:$offset),
+ (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u16imm:$offset),
asm#" $addr, $data0, $offset [M0]",
[]> {
let data1 = 0;
@@ -439,7 +467,7 @@ class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
op,
(outs),
- (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i8imm:$offset0, i8imm:$offset1),
+ (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u8imm:$offset0, u8imm:$offset1),
asm#" $addr, $data0, $data1, $offset0, $offset1 [M0]",
[]> {
let mayStore = 1;
@@ -450,7 +478,7 @@ class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A
class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
op,
(outs rc:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i16imm:$offset),
+ (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, u16imm:$offset),
asm#" $vdst, $addr, $data0, $offset, [M0]",
[]> {
@@ -462,7 +490,7 @@ class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
op,
(outs),
- (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
+ (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
@@ -481,7 +509,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
let offen = 0, idxen = 0 in {
def _OFFSET : MUBUF <op, (outs regClass:$vdata),
(ins SReg_128:$srsrc, VReg_32:$vaddr,
- i16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
+ u16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
i1imm:$slc, i1imm:$tfe),
asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
}
@@ -497,7 +525,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
let offen = 0, idxen = 1 in {
def _IDXEN : MUBUF <op, (outs regClass:$vdata),
(ins SReg_128:$srsrc, VReg_32:$vaddr,
- i16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
+ u16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
i1imm:$slc, i1imm:$tfe),
asm#" $vdata, $srsrc[$vaddr] + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
}
@@ -513,7 +541,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in {
def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset),
+ (ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
asm#" $vdata, $srsrc + $vaddr + $offset", []>;
}
}
@@ -521,7 +549,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
- i16imm:$offset),
+ u16imm:$offset),
name#" $vdata, $srsrc + $vaddr + $offset",
[]> {
@@ -542,7 +570,7 @@ class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
op,
(outs regClass:$dst),
- (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+ (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc,
i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
@@ -677,4 +705,12 @@ def isDS : InstrMapping {
let ValueCols = [["8"]];
}
+def getMCOpcode : InstrMapping {
+ let FilterClass = "SIMCInstr";
+ let RowFields = ["PseudoInstr"];
+ let ColFields = ["Subtarget"];
+ let KeyCol = [!cast<string>(SISubtarget.NONE)];
+ let ValueCols = [[!cast<string>(SISubtarget.SI)]];
+}
+
include "SIInstructions.td"
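
The new SISubtarget def and the getMCOpcode InstrMapping above tie each pseudo instruction to its subtarget-specific MC encoding, and the comment notes the def must stay in sync with an enum in AMDGPUMCInstLower.h. A hedged C++ sketch of what that sync point and a lookup call might look like; the exact declarations in AMDGPUMCInstLower.h are assumptions here, not quoted from the tree:

#include <cstdint>

// Sketch of the enum that the TableGen "def SISubtarget" block mirrors; the
// invariant is that every value except NONE (-1) names a column of the
// getMCOpcode InstrMapping.
enum SISubtarget : int { SI = 0 };

// AMDGPU::getMCOpcode(PseudoOpcode, Gen), declared in the SIInstrInfo.h hunk,
// resolves a pseudo opcode to its generation-specific MC opcode. A lowering
// step would call it roughly like this (illustrative wrapper, not real API):
static int resolveMCOpcode(uint16_t PseudoOpcode,
                           int (*getMCOpcode)(uint16_t, unsigned)) {
  return getMCOpcode(PseudoOpcode, static_cast<unsigned>(SI));
}
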
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 5232139..500fa78 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -32,9 +32,56 @@ def isSI : Predicate<"Subtarget.getGeneration() "
def isCI : Predicate<"Subtarget.getGeneration() "
">= AMDGPUSubtarget::SEA_ISLANDS">;
+def isCFDepth0 : Predicate<"isCFDepth0()">;
+
def WAIT_FLAG : InstFlag<"printWaitFlag">;
-let Predicates = [isSI] in {
+let SubtargetPredicate = isSI in {
+let OtherPredicates = [isCFDepth0] in {
+
+//===----------------------------------------------------------------------===//
+// SMRD Instructions
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 1 in {
+
+// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SGPR_32 register class does not include M0
+// and writing to M0 from an SMRD instruction will hang the GPU.
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>;
+defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
+
+defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
+ 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32
+>;
+
+defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
+ 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64
+>;
+
+defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
+ 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128
+>;
+
+defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
+ 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256
+>;
+
+defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
+ 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
+>;
+
+} // mayLoad = 1
+
+//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
+//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 Instructions
+//===----------------------------------------------------------------------===//
let neverHasSideEffects = 1 in {
@@ -45,7 +92,10 @@ def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>;
def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>;
} // End isMoveImm = 1
-def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>;
+def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32",
+ [(set i32:$dst, (not i32:$src0))]
+>;
+
def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>;
def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
@@ -65,8 +115,13 @@ def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
-//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>;
-//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>;
+def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8",
+ [(set i32:$dst, (sext_inreg i32:$src0, i8))]
+>;
+def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16",
+ [(set i32:$dst, (sext_inreg i32:$src0, i16))]
+>;
+
////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>;
////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
@@ -99,6 +154,150 @@ def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>;
def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>;
def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>;
def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [SCC] in { // Carry out goes to SCC
+let isCommutable = 1 in {
+def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
+def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32",
+ [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
+>;
+} // End isCommutable = 1
+
+def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
+def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32",
+ [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
+>;
+
+let Uses = [SCC] in { // Carry in comes from SCC
+let isCommutable = 1 in {
+def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32",
+ [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End isCommutable = 1
+
+def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32",
+ [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End Uses = [SCC]
+} // End Defs = [SCC]
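
S_ADD_U32/S_ADDC_U32 and S_SUB_U32/S_SUBB_U32 above model a carry chain through SCC, which is how a 64-bit scalar add or subtract is composed from two 32-bit operations. A scalar sketch of that composition, assuming nothing beyond the carry-in/carry-out behaviour the comments describe:

#include <cassert>
#include <cstdint>

// Compose a 64-bit add from two 32-bit adds plus a carry bit, mirroring the
// S_ADD_U32 (carry out to SCC) / S_ADDC_U32 (carry in from SCC) pairing.
static uint64_t add64(uint64_t A, uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint32_t Lo = ALo + BLo;
  uint32_t SCC = Lo < ALo;          // carry out of the low half
  uint32_t Hi = AHi + BHi + SCC;    // carry consumed by the high half
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  assert(add64(0xffffffffu, 1) == 0x100000000ull);
  assert(add64(6, 7) == 13);
  return 0;
}
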
+
+def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32",
+ [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
+>;
+def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32",
+ [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
+>;
+def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32",
+ [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
+>;
+def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32",
+ [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
+>;
+
+def S_CSELECT_B32 : SOP2 <
+ 0x0000000a, (outs SReg_32:$dst),
+ (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",
+ []
+>;
+
+def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
+
+def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32",
+ [(set i32:$dst, (and i32:$src0, i32:$src1))]
+>;
+
+def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
+ [(set i64:$dst, (and i64:$src0, i64:$src1))]
+>;
+
+def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32",
+ [(set i32:$dst, (or i32:$src0, i32:$src1))]
+>;
+
+def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64",
+ [(set i64:$dst, (or i64:$src0, i64:$src1))]
+>;
+
+def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32",
+ [(set i32:$dst, (xor i32:$src0, i32:$src1))]
+>;
+
+def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64",
+ [(set i64:$dst, (xor i64:$src0, i64:$src1))]
+>;
+def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
+def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
+def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
+def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>;
+def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
+def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
+def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
+def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
+def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
+def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
+
+// Use added complexity so these patterns are preferred to the VALU patterns.
+let AddedComplexity = 1 in {
+
+def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32",
+ [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+>;
+def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64",
+ [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+>;
+def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32",
+ [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+>;
+def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64",
+ [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+>;
+def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32",
+ [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+>;
+def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64",
+ [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+>;
+
+} // End AddedComplexity = 1
+
+def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
+def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
+def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
+def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
+def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
+def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
+def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
+//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
+def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
+
+//===----------------------------------------------------------------------===//
+// SOPC Instructions
+//===----------------------------------------------------------------------===//
+
+def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32">;
+def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32">;
+def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32">;
+def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32">;
+def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32">;
+def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32">;
+def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32">;
+def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32">;
+def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32">;
+def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32">;
+def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32">;
+def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32">;
+////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
+////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
+////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
+////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
+//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
+
+//===----------------------------------------------------------------------===//
+// SOPK Instructions
+//===----------------------------------------------------------------------===//
+
def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>;
def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>;
@@ -147,6 +346,108 @@ def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>;
//def EXP : EXP_ <0x00000000, "EXP", []>;
+} // End let OtherPredicates = [isCFDepth0]
+
+//===----------------------------------------------------------------------===//
+// SOPP Instructions
+//===----------------------------------------------------------------------===//
+
+def S_NOP : SOPP <0x00000000, (ins i16imm:$SIMM16), "S_NOP $SIMM16", []>;
+
+let isTerminator = 1 in {
+
+def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
+ [(IL_retflag)]> {
+ let SIMM16 = 0;
+ let isBarrier = 1;
+ let hasCtrlDep = 1;
+}
+
+let isBranch = 1 in {
+def S_BRANCH : SOPP <
+ 0x00000002, (ins brtarget:$target), "S_BRANCH $target",
+ [(br bb:$target)]> {
+ let isBarrier = 1;
+}
+
+let DisableEncoding = "$scc" in {
+def S_CBRANCH_SCC0 : SOPP <
+ 0x00000004, (ins brtarget:$target, SCCReg:$scc),
+ "S_CBRANCH_SCC0 $target", []
+>;
+def S_CBRANCH_SCC1 : SOPP <
+ 0x00000005, (ins brtarget:$target, SCCReg:$scc),
+ "S_CBRANCH_SCC1 $target",
+ []
+>;
+} // End DisableEncoding = "$scc"
+
+def S_CBRANCH_VCCZ : SOPP <
+ 0x00000006, (ins brtarget:$target, VCCReg:$vcc),
+ "S_CBRANCH_VCCZ $target",
+ []
+>;
+def S_CBRANCH_VCCNZ : SOPP <
+ 0x00000007, (ins brtarget:$target, VCCReg:$vcc),
+ "S_CBRANCH_VCCNZ $target",
+ []
+>;
+
+let DisableEncoding = "$exec" in {
+def S_CBRANCH_EXECZ : SOPP <
+ 0x00000008, (ins brtarget:$target, EXECReg:$exec),
+ "S_CBRANCH_EXECZ $target",
+ []
+>;
+def S_CBRANCH_EXECNZ : SOPP <
+ 0x00000009, (ins brtarget:$target, EXECReg:$exec),
+ "S_CBRANCH_EXECNZ $target",
+ []
+>;
+} // End DisableEncoding = "$exec"
+
+
+} // End isBranch = 1
+} // End isTerminator = 1
+
+let hasSideEffects = 1 in {
+def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
+ [(int_AMDGPU_barrier_local)]
+> {
+ let SIMM16 = 0;
+ let isBarrier = 1;
+ let hasCtrlDep = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
+ []
+>;
+//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
+//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
+//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
+
+let Uses = [EXEC] in {
+ def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16",
+ [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)]
+ > {
+ let DisableEncoding = "$m0";
+ }
+} // End Uses = [EXEC]
+
+//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
+//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
+//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
+//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
+//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
+//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
+} // End hasSideEffects
+
+//===----------------------------------------------------------------------===//
+// VOPC Instructions
+//===----------------------------------------------------------------------===//
+
let isCompare = 1 in {
defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">;
@@ -403,6 +704,10 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
} // End isCompare = 1
+//===----------------------------------------------------------------------===//
+// DS Instructions
+//===----------------------------------------------------------------------===//
+
def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>;
def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>;
def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
@@ -427,6 +732,9 @@ def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "DS_READ2_B64", VReg_128>;
// TODO: DS_READ2ST64_B32, DS_READ2ST64_B64,
// DS_WRITE2ST64_B32, DS_WRITE2ST64_B64
+//===----------------------------------------------------------------------===//
+// MUBUF Instructions
+//===----------------------------------------------------------------------===//
//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
@@ -499,6 +807,11 @@ def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>;
//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>;
//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Instructions
+//===----------------------------------------------------------------------===//
+
//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>;
//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
@@ -508,41 +821,10 @@ def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FOR
def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>;
def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>;
-let mayLoad = 1 in {
-
-// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
-// SMRD instructions, because the SGPR_32 register class does not include M0
-// and writing to M0 from an SMRD instruction will hang the GPU.
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>;
-defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
-defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
-defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
-defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
-
-defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
- 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32
->;
-
-defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
- 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64
->;
-
-defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
- 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128
->;
-
-defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
- 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256
->;
-
-defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
- 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
->;
-
-} // mayLoad = 1
+//===----------------------------------------------------------------------===//
+// MIMG Instructions
+//===----------------------------------------------------------------------===//
-//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
-//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">;
defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">;
//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
@@ -638,8 +920,12 @@ defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>;
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>;
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
-//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
let neverHasSideEffects = 1, isMoveImm = 1 in {
defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
@@ -691,8 +977,13 @@ defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>;
-//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>;
-//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>;
+defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64",
+ [(set i32:$dst, (fp_to_uint f64:$src0))]
+>;
+defm V_CVT_F64_U32 : VOP1_64_32 <0x00000016, "V_CVT_F64_U32",
+ [(set f64:$dst, (uint_to_fp i32:$src0))]
+>;
+
defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
[(set f32:$dst, (AMDGPUfract f32:$src0))]
>;
@@ -756,6 +1047,11 @@ defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>;
defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>;
defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>;
+
+//===----------------------------------------------------------------------===//
+// VINTRP Instructions
+//===----------------------------------------------------------------------===//
+
def V_INTERP_P1_F32 : VINTRP <
0x00000000,
(outs VReg_32:$dst),
@@ -786,97 +1082,9 @@ def V_INTERP_MOV_F32 : VINTRP <
let DisableEncoding = "$m0";
}
-//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>;
-
-let isTerminator = 1 in {
-
-def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
- [(IL_retflag)]> {
- let SIMM16 = 0;
- let isBarrier = 1;
- let hasCtrlDep = 1;
-}
-
-let isBranch = 1 in {
-def S_BRANCH : SOPP <
- 0x00000002, (ins brtarget:$target), "S_BRANCH $target",
- [(br bb:$target)]> {
- let isBarrier = 1;
-}
-
-let DisableEncoding = "$scc" in {
-def S_CBRANCH_SCC0 : SOPP <
- 0x00000004, (ins brtarget:$target, SCCReg:$scc),
- "S_CBRANCH_SCC0 $target", []
->;
-def S_CBRANCH_SCC1 : SOPP <
- 0x00000005, (ins brtarget:$target, SCCReg:$scc),
- "S_CBRANCH_SCC1 $target",
- []
->;
-} // End DisableEncoding = "$scc"
-
-def S_CBRANCH_VCCZ : SOPP <
- 0x00000006, (ins brtarget:$target, VCCReg:$vcc),
- "S_CBRANCH_VCCZ $target",
- []
->;
-def S_CBRANCH_VCCNZ : SOPP <
- 0x00000007, (ins brtarget:$target, VCCReg:$vcc),
- "S_CBRANCH_VCCNZ $target",
- []
->;
-
-let DisableEncoding = "$exec" in {
-def S_CBRANCH_EXECZ : SOPP <
- 0x00000008, (ins brtarget:$target, EXECReg:$exec),
- "S_CBRANCH_EXECZ $target",
- []
->;
-def S_CBRANCH_EXECNZ : SOPP <
- 0x00000009, (ins brtarget:$target, EXECReg:$exec),
- "S_CBRANCH_EXECNZ $target",
- []
->;
-} // End DisableEncoding = "$exec"
-
-
-} // End isBranch = 1
-} // End isTerminator = 1
-
-let hasSideEffects = 1 in {
-def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
- [(int_AMDGPU_barrier_local)]
-> {
- let SIMM16 = 0;
- let isBarrier = 1;
- let hasCtrlDep = 1;
- let mayLoad = 1;
- let mayStore = 1;
-}
-
-def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
- []
->;
-//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
-//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
-//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
-
-let Uses = [EXEC] in {
- def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16",
- [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)]
- > {
- let DisableEncoding = "$m0";
- }
-} // End Uses = [EXEC]
-
-//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
-//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
-//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
-//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
-//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
-//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
-} // End hasSideEffects
+//===----------------------------------------------------------------------===//
+// VOP2 Instructions
+//===----------------------------------------------------------------------===//
def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
(ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc),
@@ -891,18 +1099,11 @@ def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
"V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg",
[(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))]
->;
-
-//f32 pattern for V_CNDMASK_B32_e64
-def : Pat <
- (f32 (select i1:$src2, f32:$src1, f32:$src0)),
- (V_CNDMASK_B32_e64 $src0, $src1, $src2)
->;
-
-def : Pat <
- (i32 (trunc i64:$val)),
- (EXTRACT_SUBREG $val, sub0)
->;
+> {
+ let src0_modifiers = 0;
+ let src1_modifiers = 0;
+ let src2_modifiers = 0;
+}
def V_READLANE_B32 : VOP2 <
0x00000001,
@@ -946,11 +1147,11 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24",
- [(set i32:$dst, (mul I24:$src0, I24:$src1))]
+ [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))]
>;
//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24",
- [(set i32:$dst, (mul U24:$src0, U24:$src1))]
+ [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))]
>;
//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
@@ -965,27 +1166,43 @@ defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
-defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>;
-defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>;
-defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>;
-defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>;
+defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32",
+ [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]>;
+defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32",
+ [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]>;
+defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32",
+ [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]>;
+defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32",
+ [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]>;
+
+defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32",
+ [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+>;
-defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>;
defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">;
-defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
+defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
+ [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+>;
defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;
let hasPostISelHook = 1 in {
-defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>;
+defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
+ [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+>;
}
defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;
-defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", []>;
-defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", []>;
-defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", []>;
+defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
+ [(set i32:$dst, (and i32:$src0, i32:$src1))]>;
+defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
+ [(set i32:$dst, (or i32:$src0, i32:$src1))]
+>;
+defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
+ [(set i32:$dst, (xor i32:$src0, i32:$src1))]
+>;
} // End isCommutable = 1
@@ -1001,14 +1218,18 @@ defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
-defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", [], VSrc_32>;
-defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", [], VSrc_32>;
+defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
+ [(set i32:$dst, (add i32:$src0, i32:$src1))], VSrc_32>;
+defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
+ [(set i32:$dst, (sub i32:$src0, i32:$src1))], VSrc_32>;
defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], VSrc_32,
"V_SUB_I32">;
let Uses = [VCC] in { // Carry-in comes from VCC
-defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", [], VReg_32>;
-defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", [], VReg_32>;
+defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32",
+ [(set i32:$dst, (adde i32:$src0, i32:$src1))], VReg_32>;
+defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32",
+ [(set i32:$dst, (sube i32:$src0, i32:$src1))], VReg_32>;
defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], VReg_32,
"V_SUBB_U32">;
} // End Uses = [VCC]
@@ -1023,63 +1244,51 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32",
>;
////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
-def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>;
-def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>;
-def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>;
-def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>;
-def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>;
-def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>;
-def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>;
-def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>;
-def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>;
-def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>;
-def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>;
-def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>;
-////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
-////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
-////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
-////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
-//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
+
+//===----------------------------------------------------------------------===//
+// VOP3 Instructions
+//===----------------------------------------------------------------------===//
let neverHasSideEffects = 1 in {
-def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
-def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>;
-def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24",
- [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))]
+defm V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
+defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32",
+ [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
+>;
+defm V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24",
+ [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))]
>;
-def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24",
- [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))]
+defm V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24",
+ [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))]
>;
} // End neverHasSideEffects
-def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
-def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
-def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
-def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
+
+defm V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
+defm V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
+defm V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
+defm V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
-def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32",
+defm V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32",
[(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>;
-def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32",
+defm V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32",
[(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>;
}
-def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32",
+defm V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32",
[(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))]>;
-defm : BFIPatterns <V_BFI_B32>;
-def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
+defm V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
[(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))]
>;
def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64",
[(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
>;
//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
-def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
-def : ROTRPattern <V_ALIGNBIT_B32>;
+defm V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
-def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
-def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
+defm V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
+defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
@@ -1092,9 +1301,9 @@ def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
-def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
+defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
-def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
+defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64",
@@ -1116,181 +1325,46 @@ def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>;
} // isCommutable = 1
-def : Pat <
- (fadd f64:$src0, f64:$src1),
- (V_ADD_F64 $src0, $src1, (i64 0))
->;
-
-def : Pat <
- (fmul f64:$src0, f64:$src1),
- (V_MUL_F64 $src0, $src1, (i64 0))
->;
-
def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>;
let isCommutable = 1 in {
-def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
-def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
-def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
-def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
+defm V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
+defm V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
+defm V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
+defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
} // isCommutable = 1
-def : Pat <
- (mul i32:$src0, i32:$src1),
- (V_MUL_LO_I32 $src0, $src1, (i32 0))
->;
-
-def : Pat <
- (mulhu i32:$src0, i32:$src1),
- (V_MUL_HI_U32 $src0, $src1, (i32 0))
->;
-
-def : Pat <
- (mulhs i32:$src0, i32:$src1),
- (V_MUL_HI_I32 $src0, $src1, (i32 0))
->;
-
-def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
+defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
-def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
+defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
-let Defs = [SCC] in { // Carry out goes to SCC
-let isCommutable = 1 in {
-def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
-def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32",
- [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
->;
-} // End isCommutable = 1
-
-def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
-def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32",
- [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
->;
-
-let Uses = [SCC] in { // Carry in comes from SCC
-let isCommutable = 1 in {
-def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32",
- [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
-} // End isCommutable = 1
-
-def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32",
- [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
-} // End Uses = [SCC]
-} // End Defs = [SCC]
-
-def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32",
- [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
->;
-def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32",
- [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
->;
-def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32",
- [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
->;
-def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32",
- [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
->;
-
-def S_CSELECT_B32 : SOP2 <
- 0x0000000a, (outs SReg_32:$dst),
- (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",
- []
->;
-
-def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
-
-def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32",
- [(set i32:$dst, (and i32:$src0, i32:$src1))]
->;
-
-def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
- [(set i64:$dst, (and i64:$src0, i64:$src1))]
->;
-
-def : Pat <
- (i1 (and i1:$src0, i1:$src1)),
- (S_AND_B64 $src0, $src1)
->;
-
-def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32",
- [(set i32:$dst, (or i32:$src0, i32:$src1))]
->;
-
-def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64",
- [(set i64:$dst, (or i64:$src0, i64:$src1))]
->;
-
-def : Pat <
- (i1 (or i1:$src0, i1:$src1)),
- (S_OR_B64 $src0, $src1)
->;
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
-def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32",
- [(set i32:$dst, (xor i32:$src0, i32:$src1))]
->;
+let isCodeGenOnly = 1, isPseudo = 1 in {
-def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64",
- [(set i1:$dst, (xor i1:$src0, i1:$src1))]
+def V_MOV_I1 : InstSI <
+ (outs VReg_1:$dst),
+ (ins i1imm:$src),
+ "", [(set i1:$dst, (imm:$src))]
>;
-def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
-def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
-def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
-def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>;
-def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
-def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
-def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
-def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
-def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
-def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
-
-// Use added complexity so these patterns are preferred to the VALU patterns.
-let AddedComplexity = 1 in {
-def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32",
- [(set i32:$dst, (shl i32:$src0, i32:$src1))]
->;
-def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64",
- [(set i64:$dst, (shl i64:$src0, i32:$src1))]
->;
-def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32",
- [(set i32:$dst, (srl i32:$src0, i32:$src1))]
->;
-def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64",
- [(set i64:$dst, (srl i64:$src0, i32:$src1))]
->;
-def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32",
- [(set i32:$dst, (sra i32:$src0, i32:$src1))]
->;
-def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64",
- [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+def V_AND_I1 : InstSI <
+ (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
+ [(set i1:$dst, (and i1:$src0, i1:$src1))]
>;
-} // End AddedComplexity = 1
-
-def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
-def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
-def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
-def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
-def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
-def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
-def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
-//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
-def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
-
-let isCodeGenOnly = 1, isPseudo = 1 in {
-
-def LOAD_CONST : AMDGPUShaderInst <
- (outs GPRF32:$dst),
- (ins i32imm:$src),
- "LOAD_CONST $dst, $src",
- [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
+def V_OR_I1 : InstSI <
+ (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
+ [(set i1:$dst, (or i1:$src0, i1:$src1))]
>;
// SI pseudo instructions. These are used by the CFG structurizer pass
@@ -1301,19 +1375,19 @@ let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
let isBranch = 1, isTerminator = 1 in {
-def SI_IF : InstSI <
+def SI_IF: InstSI <
(outs SReg_64:$dst),
(ins SReg_64:$vcc, brtarget:$target),
- "SI_IF $dst, $vcc, $target",
+ "",
[(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))]
>;
def SI_ELSE : InstSI <
(outs SReg_64:$dst),
(ins SReg_64:$src, brtarget:$target),
- "SI_ELSE $dst, $src, $target",
- [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]> {
-
+ "",
+ [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]
+> {
let Constraints = "$src = $dst";
}
@@ -1370,7 +1444,7 @@ let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
let UseNamedOperandTable = 1 in {
-def SI_RegisterLoad : AMDGPUShaderInst <
+def SI_RegisterLoad : InstSI <
(outs VReg_32:$dst, SReg_64:$temp),
(ins FRAMEri32:$addr, i32imm:$chan),
"", []
@@ -1379,7 +1453,7 @@ def SI_RegisterLoad : AMDGPUShaderInst <
let mayLoad = 1;
}
-class SIRegStore<dag outs> : AMDGPUShaderInst <
+class SIRegStore<dag outs> : InstSI <
outs,
(ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan),
"", []
@@ -1439,8 +1513,33 @@ def V_SUB_F64 : InstSI <
} // end usesCustomInserter
+multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
+
+ def _SAVE : InstSI <
+ (outs VReg_32:$dst),
+ (ins sgpr_class:$src, i32imm:$frame_idx),
+ "", []
+ >;
+
+ def _RESTORE : InstSI <
+ (outs sgpr_class:$dst),
+ (ins VReg_32:$src, i32imm:$frame_idx),
+ "", []
+ >;
+
+}
+
+defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
+defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
+defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
+defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+
} // end IsCodeGenOnly, isPseudo
+} // end SubtargetPredicate = SI
+
+let Predicates = [isSI] in {
+
def : Pat<
(int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2),
(V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0))
@@ -1453,7 +1552,7 @@ def : Pat <
/* int_SI_vs_load_input */
def : Pat<
- (SIload_input i128:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
+ (SIload_input v4i32:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
(BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0)
>;
@@ -1470,40 +1569,116 @@ def : Pat <
(V_SUB_F64 $src0, $src1)
>;
+//===----------------------------------------------------------------------===//
+// SMRD Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
+
+ // 1. Offset as 8bit DWORD immediate
+ def : Pat <
+ (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))),
+ (vt (Instr_IMM $sbase, (as_dword_i32imm $offset)))
+ >;
+
+ // 2. Offset loaded in a 32-bit SGPR
+ def : Pat <
+ (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
+ (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
+ >;
+
+ // 3. No offset at all
+ def : Pat <
+ (constant_load i64:$sbase),
+ (vt (Instr_IMM $sbase, 0))
+ >;
+}
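+// For illustration only (a rough sketch; the byte offset 16 is a made-up
+// value): pattern 1 above would select a constant-address load such as
+//   (i32 (constant_load (add i64:$sbase, 16)))
+// into (S_LOAD_DWORD_IMM $sbase, 4), since the immediate form appears to
+// encode its offset in dwords (hence as_dword_i32imm).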
+
+defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
+defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
+defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
+
+// 1. Offset as 8bit DWORD immediate
+def : Pat <
+ (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
+>;
+
+// 2. Offset loaded in a 32-bit SGPR
+def : Pat <
+ (SIload_constant v4i32:$sbase, imm:$offset),
+ (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
+>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (i1 (xor i1:$src0, i1:$src1)),
+ (S_XOR_B64 $src0, $src1)
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (or i64:$src0, i64:$src1),
+ (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub0),
+ (EXTRACT_SUBREG i64:$src1, sub0)), sub0),
+ (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub1),
+ (EXTRACT_SUBREG i64:$src1, sub1)), sub1)
+>;
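+// i.e. the 64-bit OR is split into two 32-bit V_OR_B32s on the sub0 and sub1
+// halves, then reassembled with INSERT_SUBREG.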
+
+class SextInReg <ValueType vt, int ShiftAmt> : Pat <
+ (sext_inreg i32:$src0, vt),
+ (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0))
+>;
+
+def : SextInReg <i8, 24>;
+def : SextInReg <i16, 16>;
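+// The shift amount is 32 minus the source width (24 for i8, 16 for i16):
+// shifting left and then arithmetic-shifting right sign-extends the low bits.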
+
/********** ======================= **********/
/********** Image sampling patterns **********/
/********** ======================= **********/
/* SIsample for simple 1D texture lookup */
def : Pat <
- (SIsample i32:$addr, v32i8:$rsrc, i128:$sampler, imm),
+ (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
(IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, i128:$sampler, imm),
+ (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
(opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_RECT),
+ (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT),
(opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_ARRAY),
+ (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY),
(opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
class SampleShadowPattern<SDNode name, MIMG opcode,
ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW),
+ (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW),
(opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
class SampleShadowArrayPattern<SDNode name, MIMG opcode,
ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW_ARRAY),
+ (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
(opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
@@ -1692,8 +1867,6 @@ def : BitConvert <i64, v2i32, VReg_64>;
def : BitConvert <v4f32, v4i32, VReg_128>;
def : BitConvert <v4i32, v4f32, VReg_128>;
-def : BitConvert <v4i32, i128, VReg_128>;
-def : BitConvert <i128, v4i32, VReg_128>;
def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, SReg_256>;
@@ -1711,10 +1884,18 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
/********** Src & Dst modifiers **********/
/********** =================== **********/
+def FCLAMP_SI : AMDGPUShaderInst <
+ (outs VReg_32:$dst),
+ (ins VSrc_32:$src0),
+ "FCLAMP_SI $dst, $src0",
+ []
+> {
+ let usesCustomInserter = 1;
+}
+
def : Pat <
(int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
- (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */),
- 0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */)
+ (FCLAMP_SI f32:$src)
>;
/********** ================================ **********/
@@ -1733,14 +1914,32 @@ def : Pat <
(V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
>;
+def FABS_SI : AMDGPUShaderInst <
+ (outs VReg_32:$dst),
+ (ins VSrc_32:$src0),
+ "FABS_SI $dst, $src0",
+ []
+> {
+ let usesCustomInserter = 1;
+}
+
def : Pat <
(fabs f32:$src),
- (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) /* Clear sign bit */
+ (FABS_SI f32:$src)
>;
+def FNEG_SI : AMDGPUShaderInst <
+ (outs VReg_32:$dst),
+ (ins VSrc_32:$src0),
+ "FNEG_SI $dst, $src0",
+ []
+> {
+ let usesCustomInserter = 1;
+}
+
def : Pat <
(fneg f32:$src),
- (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Toggle sign bit */
+ (FNEG_SI f32:$src)
>;
/********** ================== **********/
@@ -1768,30 +1967,10 @@ def : Pat <
>;
def : Pat <
- (i1 imm:$imm),
- (S_MOV_B64 imm:$imm)
->;
-
-def : Pat <
(i64 InlineImm<i64>:$imm),
(S_MOV_B64 InlineImm<i64>:$imm)
>;
-// i64 immediates aren't supported in hardware, split it into two 32bit values
-def : Pat <
- (i64 imm:$imm),
- (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (S_MOV_B32 (i32 (LO32 imm:$imm))), sub0),
- (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1)
->;
-
-def : Pat <
- (f64 fpimm:$imm),
- (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
- (V_MOV_B32_e32 (f32 (LO32f fpimm:$imm))), sub0),
- (V_MOV_B32_e32 (f32 (HI32f fpimm:$imm))), sub1)
->;
-
/********** ===================== **********/
/********** Interpolation Patterns **********/
/********** ===================== **********/
@@ -1875,21 +2054,9 @@ class Ext32Pat <SDNode ext> : Pat <
def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
-// 1. Offset as 8bit DWORD immediate
+// Offset in a 32-bit VGPR
def : Pat <
- (SIload_constant i128:$sbase, IMM8bitDWORD:$offset),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
->;
-
-// 2. Offset loaded in an 32bit SGPR
-def : Pat <
- (SIload_constant i128:$sbase, imm:$offset),
- (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
->;
-
-// 3. Offset in an 32Bit VGPR
-def : Pat <
- (SIload_constant i128:$sbase, i32:$voff),
+ (SIload_constant v4i32:$sbase, i32:$voff),
(BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0)
>;
@@ -1904,18 +2071,44 @@ def : Pat <
def : Pat <
(int_SI_tid),
(V_MBCNT_HI_U32_B32_e32 0xffffffff,
- (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0, 0, 0))
+ (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0))
>;
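// V_MBCNT_LO/HI count the set bits of the mask that belong to lanes below the
// current one; with an all-ones mask this yields the lane index, i.e. the
// thread id within the wavefront.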
-/********** ================== **********/
-/********** VOP3 Patterns **********/
-/********** ================== **********/
+//===----------------------------------------------------------------------===//
+// VOP3 Patterns
+//===----------------------------------------------------------------------===//
+
+def : IMad24Pat<V_MAD_I32_I24>;
+def : UMad24Pat<V_MAD_U32_U24>;
def : Pat <
- (f32 (fadd (fmul f32:$src0, f32:$src1), f32:$src2)),
- (V_MAD_F32 $src0, $src1, $src2)
+ (fadd f64:$src0, f64:$src1),
+ (V_ADD_F64 $src0, $src1, (i64 0))
+>;
+
+def : Pat <
+ (fmul f64:$src0, f64:$src1),
+ (V_MUL_F64 $src0, $src1, (i64 0))
+>;
+
+def : Pat <
+ (mul i32:$src0, i32:$src1),
+ (V_MUL_LO_I32 $src0, $src1, (i32 0))
+>;
+
+def : Pat <
+ (mulhu i32:$src0, i32:$src1),
+ (V_MUL_HI_U32 $src0, $src1, (i32 0))
+>;
+
+def : Pat <
+ (mulhs i32:$src0, i32:$src1),
+ (V_MUL_HI_I32 $src0, $src1, (i32 0))
>;
+defm : BFIPatterns <V_BFI_B32>;
+def : ROTRPattern <V_ALIGNBIT_B32>;
+
/********** ======================= **********/
/********** Load/Store Patterns **********/
/********** ======================= **********/
@@ -1962,41 +2155,6 @@ def : Pat <(atomic_load_add_local i32:$ptr, i32:$val),
def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val),
(DS_SUB_U32_RTN 0, $ptr, $val, 0)>;
-/********** ================== **********/
-/********** SMRD Patterns **********/
-/********** ================== **********/
-
-multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
-
- // 1. Offset as 8bit DWORD immediate
- def : Pat <
- (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))),
- (vt (Instr_IMM $sbase, (as_dword_i32imm $offset)))
- >;
-
- // 2. Offset loaded in an 32bit SGPR
- def : Pat <
- (constant_load (SIadd64bit32bit i64:$sbase, imm:$offset)),
- (vt (Instr_SGPR $sbase, (S_MOV_B32 imm:$offset)))
- >;
-
- // 3. No offset at all
- def : Pat <
- (constant_load i64:$sbase),
- (vt (Instr_IMM $sbase, 0))
- >;
-}
-
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
-defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, i128>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
-
//===----------------------------------------------------------------------===//
// MUBUF Patterns
//===----------------------------------------------------------------------===//
@@ -2083,7 +2241,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
MUBUF bothen> {
def : Pat <
- (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset,
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
imm:$offset, 0, 0, imm:$glc, imm:$slc,
imm:$tfe)),
(offset $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc),
@@ -2091,7 +2249,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
>;
def : Pat <
- (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset,
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
imm, 1, 0, imm:$glc, imm:$slc,
imm:$tfe)),
(offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
@@ -2099,7 +2257,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
>;
def : Pat <
- (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset,
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
imm:$offset, 0, 1, imm:$glc, imm:$slc,
imm:$tfe)),
(idxen $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc),
@@ -2107,7 +2265,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
>;
def : Pat <
- (vt (int_SI_buffer_load_dword i128:$rsrc, v2i32:$vaddr, i32:$soffset,
+ (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
imm, 1, 1, imm:$glc, imm:$slc,
imm:$tfe)),
(bothen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
@@ -2128,7 +2286,7 @@ defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_
// TBUFFER_STORE_FORMAT_*, addr64=0
class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat<
- (SItbuffer_store i128:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
+ (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
i32:$soffset, imm:$inst_offset, imm:$dfmt,
imm:$nfmt, imm:$offen, imm:$idxen,
imm:$glc, imm:$slc, imm:$tfe),
@@ -2156,12 +2314,13 @@ defm V_CEIL_F64 : VOP1_64 <0x00000018, "V_CEIL_F64",
defm V_FLOOR_F64 : VOP1_64 <0x0000001A, "V_FLOOR_F64",
[(set f64:$dst, (ffloor f64:$src0))]
>;
+defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64",
+ [(set f64:$dst, (frint f64:$src0))]
+>;
-defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64", []>;
-
-def V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>;
-def V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>;
-def V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>;
+defm V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>;
+defm V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>;
+defm V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>;
def V_MAD_U64_U32 : VOP3_64 <0x00000176, "V_MAD_U64_U32", []>;
// XXX - Does this set VCC?
@@ -2248,17 +2407,43 @@ def : Pat<
>;
//===----------------------------------------------------------------------===//
-// Miscellaneous Patterns
+// Conversion Patterns
//===----------------------------------------------------------------------===//
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+ (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
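+// The second S_BFE operand packs the bit offset in its low bits and the field
+// width shifted left by 16 (hence the "0 | 1 << 16" above: a 1-bit field at
+// offset 0).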
+
+// TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
+// might not be worth the effort, and will need to expand to shifts when
+// fixing SGPR copies.
+
+// Handle sext_inreg in i64
def : Pat <
- (i64 (trunc i128:$x)),
+ (i64 (sext_inreg i64:$src, i1)),
(INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (i32 (EXTRACT_SUBREG $x, sub0)), sub0),
- (i32 (EXTRACT_SUBREG $x, sub1)), sub1)
+ (S_BFE_I32 (EXTRACT_SUBREG i64:$src, sub0), 65536), sub0), // 0 | 1 << 16
+ (S_MOV_B32 -1), sub1)
>;
def : Pat <
+ (i64 (sext_inreg i64:$src, i8)),
+ (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (S_SEXT_I32_I8 (EXTRACT_SUBREG i64:$src, sub0)), sub0),
+ (S_MOV_B32 -1), sub1)
+>;
+
+def : Pat <
+ (i64 (sext_inreg i64:$src, i16)),
+ (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (S_SEXT_I32_I16 (EXTRACT_SUBREG i64:$src, sub0)), sub0),
+ (S_MOV_B32 -1), sub1)
+>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
(i32 (trunc i64:$a)),
(EXTRACT_SUBREG $a, sub0)
>;
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index c2f8696..6601f2a 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -67,7 +67,7 @@ private:
static const unsigned SkipThreshold = 12;
static char ID;
- const TargetRegisterInfo *TRI;
+ const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
@@ -92,11 +92,11 @@ private:
public:
SILowerControlFlowPass(TargetMachine &tm) :
- MachineFunctionPass(ID), TRI(0), TII(0) { }
+ MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const {
+ const char *getPassName() const override {
return "SI Lower control flow instructions";
}
@@ -427,7 +427,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
- TRI = MF.getTarget().getRegisterInfo();
+ TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
bool HaveKill = false;
diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp
new file mode 100644
index 0000000..738c90b
--- /dev/null
+++ b/lib/Target/R600/SILowerI1Copies.cpp
@@ -0,0 +1,148 @@
+//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// i1 values are usually inserted by the CFG Structurizer pass and they are
+/// unique in that they can be copied from VALU to SALU registers.
+/// This is not possible for any other value type. Since there are no
+/// MOV instructions for i1, we need to use V_CMP_* and V_CNDMASK to move the i1.
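+///
+/// For illustration only (a rough sketch, operand order simplified, not exact
+/// MIR syntax):
+///   %vreg_i1 = COPY %sreg64       is rewritten to
+///   %vreg_i1 = V_CNDMASK_B32_e64 0, -1, %sreg64
+/// and
+///   %sreg64 = COPY %vreg_i1       is rewritten to
+///   %sreg64 = V_CMP_NE_I32_e64 0, %vreg_i1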
+///
+//===----------------------------------------------------------------------===//
+//
+
+#define DEBUG_TYPE "si-i1-copies"
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+class SILowerI1Copies : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SILowerI1Copies() : MachineFunctionPass(ID) {
+ initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) override;
+
+ virtual const char *getPassName() const override {
+ return "SI Lower il Copies";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE,
+ "SI Lower il Copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE,
+ "SI Lower il Copies", false, false)
+
+char SILowerI1Copies::ID = 0;
+
+char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;
+
+FunctionPass *llvm::createSILowerI1CopiesPass() {
+ return new SILowerI1Copies();
+}
+
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ MF.getTarget().getInstrInfo());
+ const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+ std::vector<unsigned> I1Defs;
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MI = *I;
+
+ if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
+ I1Defs.push_back(MI.getOperand(0).getReg());
+ MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+ continue;
+ }
+
+ if (MI.getOpcode() == AMDGPU::V_AND_I1) {
+ I1Defs.push_back(MI.getOperand(0).getReg());
+ MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32));
+ continue;
+ }
+
+ if (MI.getOpcode() == AMDGPU::V_OR_I1) {
+ I1Defs.push_back(MI.getOperand(0).getReg());
+ MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32));
+ continue;
+ }
+
+ if (MI.getOpcode() != AMDGPU::COPY ||
+ !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
+ !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
+ continue;
+
+
+ const TargetRegisterClass *DstRC =
+ MRI.getRegClass(MI.getOperand(0).getReg());
+ const TargetRegisterClass *SrcRC =
+ MRI.getRegClass(MI.getOperand(1).getReg());
+
+ if (DstRC == &AMDGPU::VReg_1RegClass &&
+ TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
+ I1Defs.push_back(MI.getOperand(0).getReg());
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64))
+ .addOperand(MI.getOperand(0))
+ .addImm(0)
+ .addImm(-1)
+ .addOperand(MI.getOperand(1))
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0);
+ MI.eraseFromParent();
+ } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
+ SrcRC == &AMDGPU::VReg_1RegClass) {
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
+ .addOperand(MI.getOperand(0))
+ .addImm(0)
+ .addOperand(MI.getOperand(1))
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0);
+ MI.eraseFromParent();
+ }
+ }
+ }
+
+ for (unsigned Reg : I1Defs)
+ MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass);
+
+ return false;
+}
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index ea04346..af60995 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -10,8 +10,11 @@
#include "SIMachineFunctionInfo.h"
+#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
#define MAX_LANES 64
@@ -26,21 +29,57 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
PSInputAddr(0),
SpillTracker() { }
-static unsigned createLaneVGPR(MachineRegisterInfo &MRI) {
- return MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
+ unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+
+ // We need to add this register as live out for the function, in order to
+ // have the live range calculated directly.
+ //
+ // When register spilling begins, we have already calculated the
+ // live intervals for all the registers. Since we are spilling SGPRs to
+ // VGPRs, we need to update the Lane VGPR's live interval every time we
+ // spill or restore a register.
+ //
+ // Unfortunately, there is no good way to update the live interval as
+ // the TargetInstrInfo callbacks for spilling and restoring don't give
+ // us access to the live interval information.
+ //
+ // We are lucky, though, because the InlineSpiller calls
+ // LiveRangeEdit::calculateRegClassAndHint() which iterates through
+ // all the new registers that have been created when restoring a register
+ // and calls LiveIntervals::getInterval(), which creates and computes
+ // the live interval for the newly created register. However, once this
+ // live interval is created, it doesn't change, and since we usually reuse
+ // the Lane VGPR multiple times, this means any uses after the first aren't
+ // added to the live interval.
+ //
+ // To work around this, we add Lane VGPRs to the function's live-out list,
+ // so that we can guarantee their live ranges will cover all of their uses.
+
+ for (MachineBasicBlock &MBB : *MF) {
+ if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) {
+ MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true));
+ return VGPR;
+ }
+ }
+ MF->getFunction()->getContext().emitError(
+ "Could not found S_ENGPGM instrtuction.");
+ return VGPR;
}
-unsigned SIMachineFunctionInfo::RegSpillTracker::getNextLane(MachineRegisterInfo &MRI) {
+unsigned SIMachineFunctionInfo::RegSpillTracker::reserveLanes(
+ MachineRegisterInfo &MRI, MachineFunction *MF, unsigned NumRegs) {
+ unsigned StartLane = CurrentLane;
+ CurrentLane += NumRegs;
if (!LaneVGPR) {
- LaneVGPR = createLaneVGPR(MRI);
+ LaneVGPR = createLaneVGPR(MRI, MF);
} else {
- CurrentLane++;
- if (CurrentLane == MAX_LANES) {
- CurrentLane = 0;
- LaneVGPR = createLaneVGPR(MRI);
+ if (CurrentLane >= MAX_LANES) {
+ StartLane = CurrentLane = 0;
+ LaneVGPR = createLaneVGPR(MRI, MF);
}
}
- return CurrentLane;
+ return StartLane;
}
void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex,
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 8dc82a0..96e619b 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -25,7 +25,7 @@ class MachineRegisterInfo;
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public AMDGPUMachineFunction {
- virtual void anchor();
+ void anchor() override;
public:
struct SpilledReg {
@@ -43,7 +43,12 @@ public:
public:
unsigned LaneVGPR;
RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { }
- unsigned getNextLane(MachineRegisterInfo &MRI);
+ /// \p NumRegs The number of consecutive registers that need to be spilled.
+ /// This function will ensure that all registers are stored in
+ /// the same VGPR.
+ /// \returns The lane to be used for storing the first register.
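+ /// For illustration (hypothetical caller, not taken from this patch):
+ /// unsigned Lane = SpillTracker.reserveLanes(MRI, &MF, 4); // e.g. an SReg_128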
+ unsigned reserveLanes(MachineRegisterInfo &MRI, MachineFunction *MF,
+ unsigned NumRegs = 1);
void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1);
const SpilledReg& getSpilledReg(unsigned FrameIndex);
bool programSpillsRegisters() { return !SpilledRegisters.empty(); }
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index 6cef195..c72d549 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -71,13 +71,12 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
&AMDGPU::SReg_256RegClass
};
- for (unsigned i = 0, e = sizeof(BaseClasses) /
- sizeof(const TargetRegisterClass*); i != e; ++i) {
- if (BaseClasses[i]->contains(Reg)) {
- return BaseClasses[i];
+ for (const TargetRegisterClass *BaseClass : BaseClasses) {
+ if (BaseClass->contains(Reg)) {
+ return BaseClass;
}
}
- return NULL;
+ return nullptr;
}
bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const {
@@ -113,7 +112,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
} else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) {
return &AMDGPU::VReg_512RegClass;
}
- return NULL;
+ return nullptr;
}
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
@@ -129,3 +128,10 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return &AMDGPU::VGPR_32RegClass;
}
}
+
+unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
+ const TargetRegisterClass *SubRC,
+ unsigned Channel) const {
+ unsigned Index = getHWRegIndex(Reg);
+ return SubRC->getRegister(Index + Channel);
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index 8148f7f..36b4fcd 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -27,22 +27,22 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
SIRegisterInfo(AMDGPUTargetMachine &tm);
- virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
- virtual unsigned getRegPressureLimit(const TargetRegisterClass *RC,
- MachineFunction &MF) const;
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
/// \param RC is an AMDIL reg class.
///
/// \returns the SI register class that is equivalent to \p RC.
- virtual const TargetRegisterClass *
- getISARegClass(const TargetRegisterClass *RC) const;
+ const TargetRegisterClass *
+ getISARegClass(const TargetRegisterClass *RC) const override;
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
- virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
+ const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
- virtual unsigned getHWRegIndex(unsigned Reg) const;
+ unsigned getHWRegIndex(unsigned Reg) const override;
/// \brief Return the 'base' register class for this register.
/// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc.
@@ -63,6 +63,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
/// be returned.
const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
unsigned SubIdx) const;
+
+ /// \p Channel This is the register channel (e.g. a value from 0-16), not the
+ /// SubReg index.
+ /// \returns The sub-register of Reg that is in Channel.
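+ /// For example (hypothetical values): getPhysRegSubReg(VGPR2_VGPR3,
+ /// &VReg_32RegClass, 1) would return VGPR3.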
+ unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC,
+ unsigned Channel) const;
};
} // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 65cf311..f1f01de 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -168,7 +168,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
(add SGPR_64Regs, VCCReg, EXECReg)
>;
-def SReg_128 : RegisterClass<"AMDGPU", [i128, v4i32], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32], 128, (add SGPR_128)>;
def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
@@ -183,14 +183,16 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
let Size = 96;
}
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, i128], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>;
def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>;
+
//===----------------------------------------------------------------------===//
-// [SV]Src_* register classes, can have either an immediate or an register
+// [SV]Src_(32|64) register classes, can have either an immediate or a register
//===----------------------------------------------------------------------===//
def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
@@ -201,3 +203,9 @@ def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+//===----------------------------------------------------------------------===//
+// SGPR and VGPR register classes
+//===----------------------------------------------------------------------===//
+
+def VSrc_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128,
+ (add VReg_128, SReg_128)>;
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
index 9bf2caf..a0b6907 100644
--- a/lib/Target/R600/SITypeRewriter.cpp
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -35,13 +35,13 @@ class SITypeRewriter : public FunctionPass,
static char ID;
Module *Mod;
Type *v16i8;
- Type *i128;
+ Type *v4i32;
public:
SITypeRewriter() : FunctionPass(ID) { }
- virtual bool doInitialization(Module &M);
- virtual bool runOnFunction(Function &F);
- virtual const char *getPassName() const {
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+ const char *getPassName() const override {
return "SI Type Rewriter";
}
void visitLoadInst(LoadInst &I);
@@ -56,7 +56,7 @@ char SITypeRewriter::ID = 0;
bool SITypeRewriter::doInitialization(Module &M) {
Mod = &M;
v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16);
- i128 = Type::getIntNTy(M.getContext(), 128);
+ v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4);
return false;
}
@@ -84,7 +84,8 @@ void SITypeRewriter::visitLoadInst(LoadInst &I) {
Type *ElemTy = PtrTy->getPointerElementType();
IRBuilder<> Builder(&I);
if (ElemTy == v16i8) {
- Value *BitCast = Builder.CreateBitCast(Ptr, Type::getIntNPtrTy(I.getContext(), 128, 2));
+ Value *BitCast = Builder.CreateBitCast(Ptr,
+ PointerType::get(v4i32, PtrTy->getPointerAddressSpace()));
LoadInst *Load = Builder.CreateLoad(BitCast);
SmallVector <std::pair<unsigned, MDNode*>, 8> MD;
I.getAllMetadataOtherThanDebugLoc(MD);
@@ -99,6 +100,7 @@ void SITypeRewriter::visitLoadInst(LoadInst &I) {
void SITypeRewriter::visitCallInst(CallInst &I) {
IRBuilder<> Builder(&I);
+
SmallVector <Value*, 8> Args;
SmallVector <Type*, 8> Types;
bool NeedToReplace = false;
@@ -107,10 +109,10 @@ void SITypeRewriter::visitCallInst(CallInst &I) {
for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
Value *Arg = I.getArgOperand(i);
if (Arg->getType() == v16i8) {
- Args.push_back(Builder.CreateBitCast(Arg, i128));
- Types.push_back(i128);
+ Args.push_back(Builder.CreateBitCast(Arg, v4i32));
+ Types.push_back(v4i32);
NeedToReplace = true;
- Name = Name + ".i128";
+ Name = Name + ".v4i32";
} else if (Arg->getType()->isVectorTy() &&
Arg->getType()->getVectorNumElements() == 1 &&
Arg->getType()->getVectorElementType() ==
@@ -144,12 +146,12 @@ void SITypeRewriter::visitCallInst(CallInst &I) {
void SITypeRewriter::visitBitCast(BitCastInst &I) {
IRBuilder<> Builder(&I);
- if (I.getDestTy() != i128) {
+ if (I.getDestTy() != v4i32) {
return;
}
if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) {
- if (Op->getSrcTy() == i128) {
+ if (Op->getSrcTy() == v4i32) {
I.replaceAllUsesWith(Op->getOperand(0));
I.eraseFromParent();
}