author:    Stephen Hines <srhines@google.com>  2014-05-29 02:49:00 -0700
committer: Stephen Hines <srhines@google.com>  2014-05-29 02:49:00 -0700
commit:    dce4a407a24b04eebc6a376f8e62b41aaa7b071f (patch)
tree:      dcebc53f2b182f145a2e659393bf9a0472cedf23 /lib/Target/R600
parent:    220b921aed042f9e520c26cffd8282a94c66c3d5 (diff)
download:  external_llvm-dce4a407a24b04eebc6a376f8e62b41aaa7b071f (.zip, .tar.gz, .tar.bz2)
Update LLVM for 3.5 rebase (r209712).
Change-Id: I149556c940fb7dc92d075273c87ff584f400941f
Diffstat (limited to 'lib/Target/R600')
77 files changed, 3148 insertions, 1599 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 3e1848b..949fdfb 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -37,11 +37,15 @@ FunctionPass *createAMDGPUCFGStructurizerPass(); // SI Passes FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); +FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); +void initializeSILowerI1CopiesPass(PassRegistry &); +extern char &SILowerI1CopiesID; + // Passes common to R600 and SI Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); @@ -76,8 +80,8 @@ enum AddressSpaces { GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). CONSTANT_ADDRESS = 2, ///< Address space for constant memory LOCAL_ADDRESS = 3, ///< Address space for local memory. - REGION_ADDRESS = 4, ///< Address space for region memory. - ADDRESS_NONE = 5, ///< Address space for unknown memory. + FLAT_ADDRESS = 4, ///< Address space for flat memory. + REGION_ADDRESS = 5, ///< Address space for region memory. PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) @@ -102,7 +106,8 @@ enum AddressSpaces { CONSTANT_BUFFER_13 = 21, CONSTANT_BUFFER_14 = 22, CONSTANT_BUFFER_15 = 23, - LAST_ADDRESS = 24 + ADDRESS_NONE = 24, ///< Address space for unknown memory. + LAST_ADDRESS = ADDRESS_NONE }; } // namespace AMDGPUAS diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index d1e2cf5..2edc115 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -120,6 +120,17 @@ def AMDGPU : Target { let InstructionSet = AMDGPUInstrInfo; } +//===----------------------------------------------------------------------===// +// Predicate helper class +//===----------------------------------------------------------------------===// + +class PredicateControl { + Predicate SubtargetPredicate; + list<Predicate> OtherPredicates = []; + list<Predicate> Predicates = !listconcat([SubtargetPredicate], + OtherPredicates); +} + // Include AMDGPU TD files include "R600Schedule.td" include "SISchedule.td" diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index b166c45..170f479 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -64,7 +64,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { - findNumUsedRegistersSI(MF, KernelInfo.NumSGPR, KernelInfo.NumVGPR); + getSIProgramInfo(KernelInfo, MF); EmitProgramInfoSI(MF, KernelInfo); } else { EmitProgramInfoR600(MF); @@ -84,8 +84,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SectionKind::getReadOnly()); OutStreamer.SwitchSection(CommentSection); - if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { + if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { OutStreamer.emitRawComment(" Kernel info:", false); + OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), + false); OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), 
false); OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), @@ -184,9 +186,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { } } -void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF, - unsigned &NumSGPR, - unsigned &NumVGPR) const { +void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, + MachineFunction &MF) const { + uint64_t CodeSize = 0; unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; @@ -200,6 +202,9 @@ void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF, I != E; ++I) { MachineInstr &MI = *I; + // TODO: CodeSize should account for multiple functions. + CodeSize += MI.getDesc().Size; + unsigned numOperands = MI.getNumOperands(); for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { MachineOperand &MO = MI.getOperand(op_idx); @@ -274,13 +279,9 @@ void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF, if (VCCUsed) MaxSGPR += 2; - NumSGPR = MaxSGPR; - NumVGPR = MaxVGPR; -} - -void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &Out, - MachineFunction &MF) const { - findNumUsedRegistersSI(MF, Out.NumSGPR, Out.NumVGPR); + ProgInfo.CodeLen = CodeSize; + ProgInfo.NumSGPR = MaxSGPR; + ProgInfo.NumVGPR = MaxVGPR; } void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index a2b8337..71adc9a 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -24,7 +24,12 @@ namespace llvm { class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { - SIProgramInfo() : NumSGPR(0), NumVGPR(0) {} + SIProgramInfo() : + CodeLen(0), + NumSGPR(0), + NumVGPR(0) {} + + uint64_t CodeLen; unsigned NumSGPR; unsigned NumVGPR; }; @@ -42,14 +47,14 @@ private: public: explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer); - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "AMDGPU Assembly Printer"; } /// Implemented in AMDGPUMCInstLower.cpp - virtual void EmitInstruction(const MachineInstr *MI); + void EmitInstruction(const MachineInstr *MI) override; protected: bool DisasmEnabled; diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td index 65cdb24..5f8ad8c 100644 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -20,7 +20,7 @@ def CC_SI : CallingConv<[ CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, - SGPR16 + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21 ]>>>, CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp index 50297d1..91aeee2 100644 --- a/lib/Target/R600/AMDGPUConvertToISA.cpp +++ b/lib/Target/R600/AMDGPUConvertToISA.cpp @@ -31,9 +31,9 @@ public: AMDGPUConvertToISAPass(TargetMachine &tm) : MachineFunctionPass(ID), TM(tm) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const {return "AMDGPU Convert to ISA";} + const char *getPassName() const override {return "AMDGPU Convert to ISA";} }; diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp index 
0325a00..e7e90d3 100644 --- a/lib/Target/R600/AMDGPUFrameLowering.cpp +++ b/lib/Target/R600/AMDGPUFrameLowering.cpp @@ -97,7 +97,7 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, const TargetFrameLowering::SpillSlot * AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { NumEntries = 0; - return 0; + return nullptr; } void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const { diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h index cf5742e..d18ede5 100644 --- a/lib/Target/R600/AMDGPUFrameLowering.h +++ b/lib/Target/R600/AMDGPUFrameLowering.h @@ -33,12 +33,13 @@ public: /// \returns The number of 32-bit sub-registers that are used when storing /// values to the stack. - virtual unsigned getStackWidth(const MachineFunction &MF) const; - virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const; - virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const; - virtual void emitPrologue(MachineFunction &MF) const; - virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; - virtual bool hasFP(const MachineFunction &MF) const; + unsigned getStackWidth(const MachineFunction &MF) const; + int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + const SpillSlot * + getCalleeSavedSpillSlots(unsigned &NumEntries) const override; + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + bool hasFP(const MachineFunction &MF) const override; }; } // namespace llvm #endif // AMDILFRAME_LOWERING_H diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index e8c5f5b..f1f0bfa 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -16,15 +16,11 @@ #include "AMDGPURegisterInfo.h" #include "R600InstrInfo.h" #include "SIISelLowering.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/ValueMap.h" -#include "llvm/Support/Compiler.h" -#include <list> -#include <queue> +#include "llvm/IR/Function.h" using namespace llvm; @@ -43,11 +39,12 @@ public: AMDGPUDAGToDAGISel(TargetMachine &TM); virtual ~AMDGPUDAGToDAGISel(); - SDNode *Select(SDNode *N); - virtual const char *getPassName() const; - virtual void PostprocessISelDAG(); + SDNode *Select(SDNode *N) override; + const char *getPassName() const override; + void PostprocessISelDAG() override; private: + bool isInlineImmediate(SDNode *N) const; inline SDValue getSmallIPtrImm(unsigned Imm); bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, const R600InstrInfo *TII); @@ -58,11 +55,9 @@ private: bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); - SDValue SimplifyI24(SDValue &Op); - bool SelectI24(SDValue Addr, SDValue &Op); - bool SelectU24(SDValue Addr, SDValue &Op); static bool checkType(const Value *ptr, unsigned int addrspace); + static bool checkPrivateAddress(const MachineMemOperand *Op); static bool isGlobalStore(const StoreSDNode *N); static bool isPrivateStore(const StoreSDNode *N); @@ -77,10 +72,15 @@ private: bool isLocalLoad(const LoadSDNode *N) const; bool 
isRegionLoad(const LoadSDNode *N) const; + /// \returns True if the current basic block being selected is at control + /// flow depth 0. Meaning that the current block dominates the + // exit block. + bool isCFDepth0() const; + const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); - bool SelectGlobalValueVariableOffset(SDValue Addr, - SDValue &BaseReg, SDValue& Offset); + bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, + SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); @@ -91,8 +91,7 @@ private: /// \brief This pass converts a legalized DAG into a AMDGPU-specific // DAG, ready for instruction scheduling. -FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM - ) { +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { return new AMDGPUDAGToDAGISel(TM); } @@ -103,32 +102,39 @@ AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { } +bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { + const SITargetLowering *TL + = static_cast<const SITargetLowering *>(getTargetLowering()); + return TL->analyzeImmediate(N) == 0; +} + /// \brief Determine the register class for \p OpNo /// \returns The register class of the virtual register that will be used for /// the given operand number \OpNo or NULL if the register class cannot be /// determined. const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, unsigned OpNo) const { - if (!N->isMachineOpcode()) { - return NULL; - } + if (!N->isMachineOpcode()) + return nullptr; + switch (N->getMachineOpcode()) { default: { const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode()); unsigned OpIdx = Desc.getNumDefs() + OpNo; if (OpIdx >= Desc.getNumOperands()) - return NULL; + return nullptr; int RegClass = Desc.OpInfo[OpIdx].RegClass; - if (RegClass == -1) { - return NULL; - } + if (RegClass == -1) + return nullptr; + return TM.getRegisterInfo()->getRegClass(RegClass); } case AMDGPU::REG_SEQUENCE: { - const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass( - cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()); - unsigned SubRegIdx = - dyn_cast<ConstantSDNode>(N->getOperand(OpNo + 1))->getZExtValue(); + unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(RCID); + + SDValue SubRegOp = N->getOperand(OpNo + 1); + unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx); } } @@ -139,7 +145,7 @@ SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) { } bool AMDGPUDAGToDAGISel::SelectADDRParam( - SDValue Addr, SDValue& R1, SDValue& R2) { + SDValue Addr, SDValue& R1, SDValue& R2) { if (Addr.getOpcode() == ISD::FrameIndex) { if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { @@ -196,15 +202,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { N->setNodeId(-1); - return NULL; // Already selected. + return nullptr; // Already selected. 
} + + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); switch (Opc) { default: break; // We are selecting i64 ADD here instead of custom lower it during // DAG legalization, so we can fold some i64 ADDs used for address // calculation into the LOAD and STORE instructions. case ISD::ADD: { - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); if (N->getValueType(0) != MVT::i64 || ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; @@ -232,12 +239,13 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { AddLoArgs.push_back(SDValue(Lo0, 0)); AddLoArgs.push_back(SDValue(Lo1, 0)); - SDNode *AddLo = CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, - VTList, AddLoArgs); + SDNode *AddLo = CurDAG->getMachineNode( + isCFDepth0() ? AMDGPU::S_ADD_I32 : AMDGPU::V_ADD_I32_e32, + DL, VTList, AddLoArgs); SDValue Carry = SDValue(AddLo, 1); - SDNode *AddHi = CurDAG->getMachineNode(AMDGPU::S_ADDC_U32, DL, - MVT::i32, SDValue(Hi0, 0), - SDValue(Hi1, 0), Carry); + SDNode *AddHi = CurDAG->getMachineNode( + isCFDepth0() ? AMDGPU::S_ADDC_U32 : AMDGPU::V_ADDC_U32_e32, + DL, MVT::i32, SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); SDValue Args[5] = { CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), @@ -246,11 +254,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue(AddHi,0), Sub1, }; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5); + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); } case ISD::BUILD_VECTOR: { unsigned RegClassID; - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); const AMDGPURegisterInfo *TRI = static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo()); const SIRegisterInfo *SIRI = @@ -316,7 +323,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // 16 = Max Num Vector Elements // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) // 1 = Vector Register Class - SDValue RegSeqArgs[16 * 2 + 1]; + SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(N->getNumOperands() * 2 + 1); RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32); bool IsRegSeq = true; @@ -333,11 +340,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { if (!IsRegSeq) break; return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), - RegSeqArgs, 2 * N->getNumOperands() + 1); + RegSeqArgs); } case ISD::BUILD_PAIR: { SDValue RC, SubReg0, SubReg1; - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { break; } @@ -346,7 +352,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32); SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32); } else if (N->getValueType(0) == MVT::i64) { - RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32); + RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32); SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32); SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32); } else { @@ -357,8 +363,37 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N), N->getValueType(0), Ops); } - case AMDGPUISD::REGISTER_LOAD: { + + case ISD::Constant: + case ISD::ConstantFP: { const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + break; + + uint64_t Imm; + if (ConstantFPSDNode *FP = 
dyn_cast<ConstantFPSDNode>(N)) + Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); + else { + ConstantSDNode *C = cast<ConstantSDNode>(N); + Imm = C->getZExtValue(); + } + + SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32, + CurDAG->getConstant(Imm & 0xFFFFFFFF, MVT::i32)); + SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32, + CurDAG->getConstant(Imm >> 32, MVT::i32)); + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32), + SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N), + N->getValueType(0), Ops); + } + + case AMDGPUISD::REGISTER_LOAD: { if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) break; SDValue Addr, Offset; @@ -375,7 +410,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { Ops); } case AMDGPUISD::REGISTER_STORE: { - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) break; SDValue Addr, Offset; @@ -391,42 +425,95 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { CurDAG->getVTList(MVT::Other), Ops); } + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + // There is a scalar version available, but unlike the vector version which + // has a separate operand for the offset and width, the scalar version packs + // the width and offset into a single operand. Try to move to the scalar + // version if the offsets are constant, so that we can try to keep extended + // loads of kernel arguments in SGPRs. + + // TODO: Technically we could try to pattern match scalar bitshifts of + // dynamic values, but it's probably not useful. + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Offset) + break; + + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); + if (!Width) + break; + + bool Signed = Opc == AMDGPUISD::BFE_I32; + + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + + uint32_t OffsetVal = Offset->getZExtValue(); + uint32_t WidthVal = Width->getZExtValue(); + + uint32_t PackedVal = OffsetVal | WidthVal << 16; + + SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32); + return CurDAG->getMachineNode(Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, + SDLoc(N), + MVT::i32, + N->getOperand(0), + PackedOffsetWidth); + + } } return SelectCode(N); } -bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) { - if (!ptr) { +bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { + assert(AS != 0 && "Use checkPrivateAddress instead."); + if (!Ptr) return false; - } - Type *ptrType = ptr->getType(); - return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace; + + return Ptr->getType()->getPointerAddressSpace() == AS; +} + +bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { + if (Op->getPseudoValue()) + return true; + + if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType())) + return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + return false; } bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { - return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { - return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)); + const Value *MemVal = N->getMemOperand()->getValue(); + return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); } bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { - return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { - return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); } bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { - if (CbId == -1) { - return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS); - } - return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_BUFFER_0 + CbId); + const Value *MemVal = N->getMemOperand()->getValue(); + if (CbId == -1) + return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); + + return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); } bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { @@ -437,27 +524,26 @@ bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { return true; } } - return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { - return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); } bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { - return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { - return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); } bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { MachineMemOperand *MMO = N->getMemOperand(); - if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { + if (checkPrivateAddress(N->getMemOperand())) { if (MMO) { - const Value *V 
= MMO->getValue(); - const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V); + const PseudoSourceValue *PSV = MMO->getPseudoValue(); if (PSV && PSV == PseudoSourceValue::getConstantPool()) { return true; } @@ -467,24 +553,34 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { } bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { - if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { + if (checkPrivateAddress(N->getMemOperand())) { // Check to make sure we are not a constant pool load or a constant load // that is marked as a private load if (isCPLoad(N) || isConstantLoad(N, -1)) { return false; } } - if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) { + + const Value *MemVal = N->getMemOperand()->getValue(); + if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && + !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){ return true; } return false; } +bool AMDGPUDAGToDAGISel::isCFDepth0() const { + // FIXME: Figure out a way to use DominatorTree analysis here. + const BasicBlock *CurBlock = FuncInfo->MBB->getBasicBlock(); + const Function *Fn = FuncInfo->Fn; + return &Fn->front() == CurBlock || &Fn->back() == CurBlock; +} + + const char *AMDGPUDAGToDAGISel::getPassName() const { return "AMDGPU DAG->DAG Pattern Instruction Selection"; } @@ -499,7 +595,7 @@ const char *AMDGPUDAGToDAGISel::getPassName() const { //===----------------------------------------------------------------------===// bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, - SDValue& IntPtr) { + SDValue& IntPtr) { if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true); return true; @@ -509,7 +605,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, SDValue& BaseReg, SDValue &Offset) { - if (!dyn_cast<ConstantSDNode>(Addr)) { + if (!isa<ConstantSDNode>(Addr)) { BaseReg = Addr; Offset = CurDAG->getIntPtrConstant(0, true); return true; @@ -519,7 +615,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset) { - ConstantSDNode * IMMOffset; + ConstantSDNode *IMMOffset; if (Addr.getOpcode() == ISD::ADD && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) @@ -563,52 +659,9 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } -SDValue AMDGPUDAGToDAGISel::SimplifyI24(SDValue &Op) { - APInt Demanded = APInt(32, 0x00FFFFFF); - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(*CurDAG, true, true); - const TargetLowering *TLI = getTargetLowering(); - if (TLI->SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) { - CurDAG->ReplaceAllUsesWith(Op, TLO.New); - CurDAG->RepositionNode(Op.getNode(), TLO.New.getNode()); - return SimplifyI24(TLO.New); - } else { - return Op; - } -} - -bool AMDGPUDAGToDAGISel::SelectI24(SDValue Op, SDValue &I24) 
{ - - assert(Op.getValueType() == MVT::i32); - - if (CurDAG->ComputeNumSignBits(Op) == 9) { - I24 = SimplifyI24(Op); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectU24(SDValue Op, SDValue &U24) { - APInt KnownZero; - APInt KnownOne; - CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne); - - assert (Op.getValueType() == MVT::i32); - - // ANY_EXTEND and EXTLOAD operations can only be done on types smaller than - // i32. These smaller types are legal to use with the i24 instructions. - if ((KnownZero & APInt(KnownZero.getBitWidth(), 0xFF000000)) == 0xFF000000 || - Op.getOpcode() == ISD::ANY_EXTEND || - ISD::isEXTLoad(Op.getNode())) { - U24 = SimplifyI24(Op); - return true; - } - return false; -} - void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = - (*(const AMDGPUTargetLowering*)getTargetLowering()); + *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); bool IsModified = false; do { IsModified = false; diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 183725c..6c443ea 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -28,8 +28,50 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" using namespace llvm; + +namespace { + +/// Diagnostic information for unimplemented or unsupported feature reporting. +class DiagnosticInfoUnsupported : public DiagnosticInfo { +private: + const Twine &Description; + const Function &Fn; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(getKindID(), Severity), + Description(Desc), + Fn(Fn) { } + + const Function &getFunction() const { return Fn; } + const Twine &getDescription() const { return Description; } + + void print(DiagnosticPrinter &DP) const override { + DP << "unsupported " << getDescription() << " in " << Fn.getName(); + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticInfoUnsupported::KindID = 0; +} + + static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -88,6 +130,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::f64, Promote); AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + setOperationAction(ISD::STORE, MVT::v2f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); + // Custom lowering of vector stores is required for local address space // stores. setOperationAction(ISD::STORE, MVT::v4i32, Custom); @@ -103,6 +148,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : // handle 64-bit stores. 
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i8, Expand); setTruncStoreAction(MVT::i64, MVT::i1, Expand); setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); @@ -126,6 +173,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::f64, Promote); AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); + setOperationAction(ISD::LOAD, MVT::v2f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); @@ -152,15 +202,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::FNEG, MVT::v2f32, Expand); setOperationAction(ISD::FNEG, MVT::v4f32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::SUB, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Custom); + setOperationAction(ISD::UDIVREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::VSELECT, MVT::v2f32, Expand); setOperationAction(ISD::VSELECT, MVT::v4f32, Expand); @@ -168,10 +222,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : static const MVT::SimpleValueType IntTypes[] = { MVT::v2i32, MVT::v4i32 }; - const size_t NumIntTypes = array_lengthof(IntTypes); - for (unsigned int x = 0; x < NumIntTypes; ++x) { - MVT::SimpleValueType VT = IntTypes[x]; + for (MVT VT : IntTypes) { //Expand the following operations for the current type by default setOperationAction(ISD::ADD, VT, Expand); setOperationAction(ISD::AND, VT, Expand); @@ -195,12 +247,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : static const MVT::SimpleValueType FloatTypes[] = { MVT::v2f32, MVT::v4f32 }; - const size_t NumFloatTypes = array_lengthof(FloatTypes); - for (unsigned int x = 0; x < NumFloatTypes; ++x) { - MVT::SimpleValueType VT = FloatTypes[x]; + for (MVT VT : FloatTypes) { setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); @@ -208,25 +259,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FMUL, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); } - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); - 
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT_CC); } //===----------------------------------------------------------------------===// @@ -325,6 +364,25 @@ SDValue AMDGPUTargetLowering::LowerReturn( // Target specific lowering //===---------------------------------------------------------------------===// +SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SDValue Callee = CLI.Callee; + SelectionDAG &DAG = CLI.DAG; + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + StringRef FuncName("<unknown>"); + + if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) + FuncName = G->getSymbol(); + else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) + FuncName = G->getGlobal()->getName(); + + DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); + DAG.getContext()->diagnose(NoCalls); + return SDValue(); +} + SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -361,12 +419,111 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do // nothing here and let the illegal result integer be handled normally. return; + case ISD::UDIV: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM); + break; + } + case ISD::UREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM.getValue(1)); + break; + } + case ISD::UDIVREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + SDValue one = DAG.getConstant(1, HalfVT); + SDValue zero = DAG.getConstant(0, HalfVT); + + //HiLo split + SDValue LHS = N->getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + + SDValue RHS = N->getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); + + SDValue REM_Hi = zero; + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; + + const unsigned halfBitWidth = HalfVT.getSizeInBits(); + + for (unsigned i = 0; i < halfBitWidth; ++i) { + SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); + // Get Value of high bit + SDValue HBit; + if (halfBitWidth == 32 && Subtarget->hasBFE()) { + HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); + } else { + HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + } + + SDValue Carry = 
DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, + DAG.getConstant(halfBitWidth - 1, HalfVT)); + REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); + REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); + + REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); + REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); + + + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + + SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); + + DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); + + // Update REM + + SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); + + REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); + REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); + REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); + } + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + Results.push_back(DIV); + Results.push_back(REM); + break; + } default: return; } } +// FIXME: This implements accesses to initialized globals in the constant +// address space by copying them to private and accessing that. It does not +// properly handle illegal types or vectors. The private vector loads are not +// scalarized, and the illegal scalars hit an assertion. This technique will not +// work well with large initializers, and this should eventually be +// removed. Initialized globals should be placed into a data section that the +// runtime will load into a buffer before the kernel is executed. Uses of the +// global need to be replaced with a pointer loaded from an implicit kernel +// argument into this buffer holding the copy of the data, which will remove the +// need for any of this. 
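Stepping back to the ISD::UDIVREM expansion a few hunks above: it lowers 64-bit unsigned division as a schoolbook restoring long division, seeding the quotient and remainder with a speculative 32-bit divide of the high half and then folding in the low 32 bits one at a time. The host-side sketch below models that algorithm with plain integers so the loop is easier to follow; udivrem64 is a hypothetical helper written only for illustration and is not part of the patch.

#include <cassert>
#include <cstdint>
#include <utility>

// Scalar model of the 64-bit UDIVREM expansion: split the dividend into
// 32-bit halves, divide the high half directly when the divisor fits in
// 32 bits, then run a restoring long-division loop over the low 32 bits.
// RHS must be nonzero.
std::pair<uint64_t, uint64_t> udivrem64(uint64_t LHS, uint64_t RHS) {
  uint32_t LHS_Lo = static_cast<uint32_t>(LHS);
  uint32_t LHS_Hi = static_cast<uint32_t>(LHS >> 32);
  uint32_t RHS_Lo = static_cast<uint32_t>(RHS);
  uint32_t RHS_Hi = static_cast<uint32_t>(RHS >> 32);

  // The DAG version computes these speculatively and selects on RHS_Hi == 0;
  // here we simply branch.
  uint64_t DIV_Hi = (RHS_Hi == 0) ? LHS_Hi / RHS_Lo : 0;
  uint64_t REM    = (RHS_Hi == 0) ? LHS_Hi % RHS_Lo : LHS_Hi;
  uint64_t DIV_Lo = 0;

  for (unsigned i = 0; i < 32; ++i) {
    unsigned Pos = 31 - i;
    uint64_t HBit = (LHS_Lo >> Pos) & 1;  // next dividend bit, MSB first
    REM = (REM << 1) | HBit;              // shift the remainder and bring it in
    if (REM >= RHS) {                     // restoring step
      REM -= RHS;
      DIV_Lo |= uint64_t(1) << Pos;       // set this quotient bit
    }
  }
  return {(DIV_Hi << 32) | DIV_Lo, REM};
}

int main() {
  uint64_t N = 0x123456789ABCDEF0ull, D = 0x1000003ull;
  auto QR = udivrem64(N, D);
  assert(QR.first == N / D && QR.second == N % D);
  return 0;
}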
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, const SDValue &InitPtr, @@ -380,29 +537,60 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, MachinePointerInfo(UndefValue::get(PtrTy)), false, false, TD->getPrefTypeAlignment(CI->getType())); - } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { + } + + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { EVT VT = EVT::getEVT(CFP->getType()); PointerType *PtrTy = PointerType::get(CFP->getType(), 0); return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr, MachinePointerInfo(UndefValue::get(PtrTy)), false, false, TD->getPrefTypeAlignment(CFP->getType())); - } else if (Init->getType()->isAggregateType()) { + } + + Type *InitTy = Init->getType(); + if (StructType *ST = dyn_cast<StructType>(InitTy)) { + const StructLayout *SL = TD->getStructLayout(ST); + EVT PtrVT = InitPtr.getValueType(); - unsigned NumElements = Init->getType()->getArrayNumElements(); + SmallVector<SDValue, 8> Chains; + + for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { + SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(I); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) { + EVT PtrVT = InitPtr.getValueType(); + + unsigned NumElements; + if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy)) + NumElements = AT->getNumElements(); + else if (VectorType *VT = dyn_cast<VectorType>(SeqTy)) + NumElements = VT->getNumElements(); + else + llvm_unreachable("Unexpected type"); + + unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); SmallVector<SDValue, 8> Chains; for (unsigned i = 0; i < NumElements; ++i) { - SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize( - Init->getType()->getArrayElementType()), PtrVT); + SDValue Offset = DAG.getConstant(i * EltSize, PtrVT); SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i), - GV, Ptr, Chain, DAG)); + + Constant *Elt = Init->getAggregateElement(i); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0], - Chains.size()); - } else { - Init->dump(); - llvm_unreachable("Unhandled constant initializer"); + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } + + Init->dump(); + llvm_unreachable("Unhandled constant initializer"); } SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, @@ -440,7 +628,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, unsigned Size = TD->getTypeAllocSize(EltType); unsigned Alignment = TD->getPrefTypeAlignment(EltType); - const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV); + const GlobalVariable *Var = cast<GlobalVariable>(GV); const Constant *Init = Var->getInitializer(); int FI = FrameInfo->CreateStackObject(Size, Alignment, false); SDValue InitPtr = DAG.getFrameIndex(FI, @@ -461,7 +649,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { Ops.push_back((*I)->getOperand(i)); } - DAG.UpdateNodeOperands(*I, 
&Ops[0], Ops.size()); + DAG.UpdateNodeOperands(*I, Ops); } return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); @@ -469,44 +657,28 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, } } -void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &Args, - unsigned Start, - unsigned Count) const { - EVT VT = Op.getValueType(); - for (unsigned i = Start, e = Start + Count; i != e; ++i) { - Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), - VT.getVectorElementType(), - Op, DAG.getConstant(i, MVT::i32))); - } -} - SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> Args; SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); - ExtractVectorElements(A, DAG, Args, 0, - A.getValueType().getVectorNumElements()); - ExtractVectorElements(B, DAG, Args, 0, - B.getValueType().getVectorNumElements()); + DAG.ExtractVectorElements(A, Args); + DAG.ExtractVectorElements(B, Args); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), - &Args[0], Args.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); } SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> Args; - EVT VT = Op.getValueType(); unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - ExtractVectorElements(Op.getOperand(0), DAG, Args, Start, - VT.getVectorNumElements()); + EVT VT = Op.getValueType(); + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, + VT.getVectorNumElements()); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), - &Args[0], Args.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); } SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, @@ -560,6 +732,22 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umul24: + return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_imul24: + return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_umad24: + return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_imad24: + return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case AMDGPUIntrinsic::AMDGPU_bfe_i32: return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), @@ -590,8 +778,7 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, ///IABS(a) = SMAX(sub(0, a), a) SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, - SelectionDAG &DAG) const { - + SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), @@ -603,7 +790,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, /// Linear Interpolation /// LRP(a, b, c) = muladd(a, b, (1 - a) * c) SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, @@ -617,16 +804,16 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, } /// \brief Generate Min/Max 
node -SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); +SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, + SelectionDAG &DAG) const { + SDLoc DL(N); + EVT VT = N->getValueType(0); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue True = Op.getOperand(2); - SDValue False = Op.getOperand(3); - SDValue CC = Op.getOperand(4); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + SDValue CC = N->getOperand(4); if (VT != MVT::f32 || !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { @@ -654,10 +841,8 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, case ISD::SETOLT: case ISD::SETLE: case ISD::SETLT: { - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); - else - return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); + unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); } case ISD::SETGT: case ISD::SETGE: @@ -665,15 +850,13 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, case ISD::SETOGE: case ISD::SETUGT: case ISD::SETOGT: { - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); - else - return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); + unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); } case ISD::SETCC_INVALID: llvm_unreachable("Invalid setcc condcode!"); } - return Op; + return SDValue(); } SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, @@ -695,8 +878,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, MemEltVT, Load->isVolatile(), Load->isNonTemporal(), Load->getAlignment())); } - return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), - Loads.data(), Loads.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads); } SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, @@ -713,32 +895,46 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, } SDLoc DL(Op); - const SDValue &Value = Store->getValue(); + SDValue Value = Store->getValue(); EVT VT = Value.getValueType(); - const SDValue &Ptr = Store->getBasePtr(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Ptr = Store->getBasePtr(); EVT MemEltVT = MemVT.getVectorElementType(); unsigned MemEltBits = MemEltVT.getSizeInBits(); unsigned MemNumElements = MemVT.getVectorNumElements(); - EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); - SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, PackedVT); + unsigned PackedSize = MemVT.getStoreSizeInBits(); + SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32); + + assert(Value.getValueType().getScalarSizeInBits() >= 32); SDValue PackedValue; for (unsigned i = 0; i < MemNumElements; ++i) { - EVT ElemVT = VT.getVectorElementType(); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, DAG.getConstant(i, MVT::i32)); - Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT); - Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask); - SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT); - Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift); + Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); + Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg + + SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32); + Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, 
Elt, Shift); + if (i == 0) { PackedValue = Elt; } else { - PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt); + PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); } } + + if (PackedSize < 32) { + EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); + return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + PackedVT, + Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + } + return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, - MachinePointerInfo(Store->getMemOperand()->getValue()), + Store->getMemOperand()->getPointerInfo(), Store->isVolatile(), Store->isNonTemporal(), Store->getAlignment()); } @@ -766,7 +962,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, MemEltVT, Store->isVolatile(), Store->isNonTemporal(), Store->getAlignment())); } - return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts); + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); } SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { @@ -788,9 +984,24 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32); } + if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { + assert(VT == MVT::i1 && "Only i1 non-extloads expected"); + // FIXME: Copied from PPC + // First, load into 32 bits, then truncate to 1 bit. + + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + MachineMemOperand *MMO = Load->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, + BasePtr, MVT::i8, MMO); + return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); + } + // Lower loads constant address space global variable loads if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa<GlobalVariable>(GetUnderlyingObject(Load->getPointerInfo().V))) { + isa<GlobalVariable>( + GetUnderlyingObject(Load->getMemOperand()->getValue()))) { SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL, getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); @@ -887,15 +1098,13 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Num = Op.getOperand(0); SDValue Den = Op.getOperand(1); - SmallVector<SDValue, 8> Results; - // RCP = URECIP(Den) = 2^32 / Den + e // e is rounding error. SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); @@ -985,10 +1194,11 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), Remainder_A_Den, Rem, ISD::SETEQ); - SDValue Ops[2]; - Ops[0] = Div; - Ops[1] = Rem; - return DAG.getMergeValues(Ops, 2, DL); + SDValue Ops[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Ops, DL); } SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, @@ -1029,81 +1239,197 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); - unsigned SrcBits = ExtraVT.getScalarType().getSizeInBits(); - unsigned DestBits = ScalarVT.getSizeInBits(); - unsigned BitsDiff = DestBits - SrcBits; - - if (!Subtarget->hasBFE()) - return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG); + if (!VT.isVector()) + return SDValue(); SDValue Src = Op.getOperand(0); - if (VT.isVector()) { - SDLoc DL(Op); - // Need to scalarize this, and revisit each of the scalars later. - // TODO: Don't scalarize on Evergreen? - unsigned NElts = VT.getVectorNumElements(); - SmallVector<SDValue, 8> Args; - ExtractVectorElements(Src, DAG, Args, 0, NElts); + SDLoc DL(Op); - SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); - for (unsigned I = 0; I < NElts; ++I) - Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); + // TODO: Don't scalarize on Evergreen? + unsigned NElts = VT.getVectorNumElements(); + SmallVector<SDValue, 8> Args; + DAG.ExtractVectorElements(Src, Args, 0, NElts); - return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size()); - } + SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); + for (unsigned I = 0; I < NElts; ++I) + Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); - if (SrcBits == 32) { - SDLoc DL(Op); + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); +} - // If the source is 32-bits, this is really half of a 2-register pair, and - // we need to discard the unused half of the pair. - SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src); - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, TruncSrc); - } +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// - unsigned NElts = VT.isVector() ? VT.getVectorNumElements() : 1; +static bool isU24(SDValue Op, SelectionDAG &DAG) { + APInt KnownZero, KnownOne; + EVT VT = Op.getValueType(); + DAG.computeKnownBits(Op, KnownZero, KnownOne); - // TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it - // might not be worth the effort, and will need to expand to shifts when - // fixing SGPR copies. - if (SrcBits < 32 && DestBits <= 32) { - SDLoc DL(Op); - MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts); - - if (DestBits != 32) - Src = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Src); - - // FIXME: This should use TargetConstant, but that hits assertions for - // Evergreen. - SDValue Ext = DAG.getNode(AMDGPUISD::BFE_I32, DL, ExtVT, - Op.getOperand(0), // Operand - DAG.getConstant(0, ExtVT), // Offset - DAG.getConstant(SrcBits, ExtVT)); // Width - - // Truncate to the original type if necessary. - if (ScalarVT == MVT::i32) - return Ext; - return DAG.getNode(ISD::TRUNCATE, DL, VT, Ext); - } + return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; +} - // For small types, extend to 32-bits first. - if (SrcBits < 32) { - SDLoc DL(Op); - MVT ExtVT = (NElts == 1) ? 
MVT::i32 : MVT::getVectorVT(MVT::i32, NElts); +static bool isI24(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); - SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, Src); - SDValue Ext32 = DAG.getNode(AMDGPUISD::BFE_I32, - DL, - ExtVT, - TruncSrc, // Operand - DAG.getConstant(0, ExtVT), // Offset - DAG.getConstant(SrcBits, ExtVT)); // Width + // In order for this to be a signed 24-bit value, bit 23, must + // be a sign bit. + return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated + // as unsigned 24-bit values. + (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; +} + +static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { + + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = Op.getValueType(); + + APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, true, true); + if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); +} - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Ext32); +template <typename IntTy> +static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, + uint32_t Offset, uint32_t Width) { + if (Width + Offset < 32) { + IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width); + return DAG.getConstant(Result, MVT::i32); } - // For everything else, use the standard bitshift expansion. - return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG); + return DAG.getConstant(Src0 >> Offset, MVT::i32); +} + +SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch(N->getOpcode()) { + default: break; + case ISD::MUL: { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mul; + + // FIXME: Add support for 24-bit multiply with 64-bit output on SI. + if (VT.isVector() || VT.getSizeInBits() > 32) + break; + + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + } else { + break; + } + + // We need to use sext even for MUL_U24, because MUL_U24 is used + // for signed multiply of 8 and 16-bit types. 
+ SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT); + + return Reg; + } + case AMDGPUISD::MUL_I24: + case AMDGPUISD::MUL_U24: { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + simplifyI24(N0, DCI); + simplifyI24(N1, DCI); + return SDValue(); + } + case ISD::SELECT_CC: { + return CombineMinMax(N, DAG); + } + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + assert(!N->getValueType(0).isVector() && + "Vector handling of BFE not implemented"); + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); + if (!Width) + break; + + uint32_t WidthVal = Width->getZExtValue() & 0x1f; + if (WidthVal == 0) + return DAG.getConstant(0, MVT::i32); + + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Offset) + break; + + SDValue BitsFrom = N->getOperand(0); + uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; + + bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; + + if (OffsetVal == 0) { + // This is already sign / zero extended, so try to fold away extra BFEs. + unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); + + unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); + if (OpSignBits >= SignBits) + return BitsFrom; + + EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); + if (Signed) { + // This is a sign_extend_inreg. Replace it to take advantage of existing + // DAG Combines. If not eliminated, we will match back to BFE during + // selection. + + // TODO: The sext_inreg of extended types ends, although we can could + // handle them in a single BFE. + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, + DAG.getValueType(SmallVT)); + } + + return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); + } + + if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (Signed) { + return constantFoldBFE<int32_t>(DAG, + Val->getSExtValue(), + OffsetVal, + WidthVal); + } + + return constantFoldBFE<uint32_t>(DAG, + Val->getZExtValue(), + OffsetVal, + WidthVal); + } + + APInt Demanded = APInt::getBitsSet(32, + OffsetVal, + OffsetVal + WidthVal); + + if ((OffsetVal + WidthVal) >= 32) { + SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32); + return DAG.getNode(Signed ? 
ISD::SRA : ISD::SRL, DL, MVT::i32, + BitsFrom, ShiftVal); + } + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + break; + } + } + return SDValue(); } //===----------------------------------------------------------------------===// @@ -1181,7 +1507,7 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return 0; + default: return nullptr; // AMDIL DAG nodes NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); @@ -1202,6 +1528,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) + NODE_NAME_CASE(MUL_U24) + NODE_NAME_CASE(MUL_I24) + NODE_NAME_CASE(MAD_U24) + NODE_NAME_CASE(MAD_I24) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DOT4) NODE_NAME_CASE(EXPORT) @@ -1219,22 +1549,22 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { } } -static void computeMaskedBitsForMinMax(const SDValue Op0, - const SDValue Op1, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) { +static void computeKnownBitsForMinMax(const SDValue Op0, + const SDValue Op1, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) { APInt Op0Zero, Op0One; APInt Op1Zero, Op1One; - DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth); - DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth); + DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); + DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); KnownZero = Op0Zero & Op1Zero; KnownOne = Op0One & Op1One; } -void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( +void AMDGPUTargetLowering::computeKnownBitsForTargetNode( const SDValue Op, APInt &KnownZero, APInt &KnownOne, @@ -1242,8 +1572,14 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( unsigned Depth) const { KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. + + APInt KnownZero2; + APInt KnownOne2; unsigned Opc = Op.getOpcode(); + switch (Opc) { + default: + break; case ISD::INTRINSIC_WO_CHAIN: { // FIXME: The intrinsic should just use the node. 
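The constantFoldBFE helper and the BFE_I32/BFE_U32 combine above fold a bit-field extract with constant operands using two shifts: move the field to the top of the word, then shift it back down, so a signed extract comes back sign-extended for free. A self-contained sketch of that arithmetic on plain 32-bit integers (the function and test values below are illustrative, not the patch's APIs):

#include <cassert>
#include <cstdint>

// Extract Width bits starting at Offset; for a signed IntTy the right shift
// is arithmetic, so the field is sign-extended (assumes two's complement).
template <typename IntTy>
static IntTy foldBFE(IntTy Src0, uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32) {
    IntTy Hi = static_cast<IntTy>(static_cast<uint32_t>(Src0)
                                  << (32 - Offset - Width));
    return Hi >> (32 - Width);
  }
  // The field reaches bit 31, so a single right shift suffices.
  return Src0 >> Offset;
}

int main() {
  assert(foldBFE<uint32_t>(0xf0u, 4, 4) == 0xf);  // unsigned extract
  assert(foldBFE<int32_t>(0xf0, 4, 4) == -1);     // signed extract of 0b1111
  return 0;
}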
switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { @@ -1251,8 +1587,8 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( case AMDGPUIntrinsic::AMDGPU_umax: case AMDGPUIntrinsic::AMDGPU_imin: case AMDGPUIntrinsic::AMDGPU_umin: - computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2), - KnownZero, KnownOne, DAG, Depth); + computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), + KnownZero, KnownOne, DAG, Depth); break; default: break; @@ -1264,10 +1600,62 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( case AMDGPUISD::UMAX: case AMDGPUISD::SMIN: case AMDGPUISD::UMIN: - computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1), - KnownZero, KnownOne, DAG, Depth); + computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1), + KnownZero, KnownOne, DAG, Depth); break; - default: + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!CWidth) + return; + + unsigned BitWidth = 32; + uint32_t Width = CWidth->getZExtValue() & 0x1f; + if (Width == 0) { + KnownZero = APInt::getAllOnesValue(BitWidth); + KnownOne = APInt::getNullValue(BitWidth); + return; + } + + // FIXME: This could do a lot more. If offset is 0, should be the same as + // sign_extend_inreg implementation, but that involves duplicating it. + if (Opc == AMDGPUISD::BFE_I32) + KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + else + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + break; } + } +} + +unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth) const { + switch (Op.getOpcode()) { + case AMDGPUISD::BFE_I32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!Width) + return 1; + + unsigned SignBits = 32 - Width->getZExtValue() + 1; + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Offset || !Offset->isNullValue()) + return SignBits; + + // TODO: Could probably figure something out with non-0 offsets. + unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + return std::max(SignBits, Op0SignBits); + } + + case AMDGPUISD::BFE_U32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; + } + + default: + return 1; + } } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index a019616..d5d821d 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -29,9 +29,6 @@ protected: const AMDGPUSubtarget *Subtarget; private: - void ExtractVectorElements(SDValue Op, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &Args, - unsigned Start, unsigned Count) const; SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, const SDValue &InitPtr, SDValue Chain, @@ -44,7 +41,7 @@ private: /// of the same bitwidth. SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; /// \brief Split a vector store into multiple scalar stores. - /// \returns The resulting chain. + /// \returns The resulting chain. 
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; @@ -83,62 +80,67 @@ protected: public: AMDGPUTargetLowering(TargetMachine &TM); - virtual bool isFAbsFree(EVT VT) const override; - virtual bool isFNegFree(EVT VT) const override; - virtual bool isTruncateFree(EVT Src, EVT Dest) const override; - virtual bool isTruncateFree(Type *Src, Type *Dest) const override; - - virtual bool isZExtFree(Type *Src, Type *Dest) const override; - virtual bool isZExtFree(EVT Src, EVT Dest) const override; - - virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; - - virtual MVT getVectorIdxTy() const override; - virtual bool isLoadBitCastBeneficial(EVT, EVT) const override; - virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const; - virtual SDValue LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl<SDValue> &InVals) const { - CLI.Callee.dump(); - llvm_unreachable("Undefined function"); - } + bool isFAbsFree(EVT VT) const override; + bool isFNegFree(EVT VT) const override; + bool isTruncateFree(EVT Src, EVT Dest) const override; + bool isTruncateFree(Type *Src, Type *Dest) const override; + + bool isZExtFree(Type *Src, Type *Dest) const override; + bool isZExtFree(EVT Src, EVT Dest) const override; + + bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - virtual void ReplaceNodeResults(SDNode * N, - SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) const override; + MVT getVectorIdxTy() const override; + bool isLoadBitCastBeneficial(EVT, EVT) const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const override; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const; - virtual const char* getTargetNodeName(unsigned Opcode) const; + SDValue CombineMinMax(SDNode *N, SelectionDAG &DAG) const; + const char* getTargetNodeName(unsigned Opcode) const override; - virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const { + virtual SDNode *PostISelFolding(MachineSDNode *N, + SelectionDAG &DAG) const { return N; } /// \brief Determine which of the bits specified in \p Mask are known to be /// either zero or one and return them in the \p KnownZero and \p KnownOne /// bitsets. 
- virtual void computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const override; + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + virtual unsigned ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; // Functions defined in AMDILISelLowering.cpp public: - virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, unsigned Intrinsic) const; + bool getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, unsigned Intrinsic) const override; /// We want to mark f32/f64 floating point values as legal. - bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; /// We don't want to shrink f64/f32 constants. - bool ShouldShrinkFPConstant(EVT VT) const; + bool ShouldShrinkFPConstant(EVT VT) const override; + + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; private: void InitAMDILLowering(); @@ -158,7 +160,6 @@ private: SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; }; namespace AMDGPUISD { @@ -188,6 +189,10 @@ enum { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. + MUL_U24, + MUL_I24, + MAD_U24, + MAD_I24, TEXTURE_FETCH, EXPORT, CONST_ADDRESS, diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp index e32dd9f..1c3361a 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -20,14 +20,13 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +using namespace llvm; + #define GET_INSTRINFO_CTOR_DTOR #define GET_INSTRINFO_NAMED_OPS #define GET_INSTRMAP_INFO #include "AMDGPUGenInstrInfo.inc" -using namespace llvm; - - // Pin the vtable to this file. void AMDGPUInstrInfo::anchor() {} @@ -85,7 +84,7 @@ AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const { // TODO: Implement this function - return NULL; + return nullptr; } bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter, MachineBasicBlock &MBB) const { @@ -176,7 +175,7 @@ AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, const SmallVectorImpl<unsigned> &Ops, int FrameIndex) const { // TODO: Implement this function - return 0; + return nullptr; } MachineInstr* AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, @@ -184,7 +183,7 @@ AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, const SmallVectorImpl<unsigned> &Ops, MachineInstr *LoadMI) const { // TODO: Implement this function - return 0; + return nullptr; } bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, @@ -356,3 +355,14 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3); } } + +// Wrapper for Tablegen'd function. enum Subtarget is not defined in any +// header files, so we need to wrap it in a function that takes unsigned +// instead. 
+namespace llvm { +namespace AMDGPU { +int getMCOpcode(uint16_t Opcode, unsigned Gen) { + return getMCOpcode(Opcode); +} +} +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index 426910c..74baf6b 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -52,14 +52,15 @@ public: virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const; + unsigned &DstReg, unsigned &SubIdx) const override; - unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const; + int &FrameIndex) const override; bool hasLoadFromStackSlot(const MachineInstr *MI, const MachineMemOperand *&MMO, - int &FrameIndex) const; + int &FrameIndex) const override; unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, int &FrameIndex) const; @@ -70,7 +71,7 @@ public: MachineInstr * convertToThreeAddress(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const; + LiveVariables *LV) const override; virtual void copyPhysReg(MachineBasicBlock &MBB, @@ -78,61 +79,62 @@ public: unsigned DestReg, unsigned SrcReg, bool KillSrc) const = 0; - virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - virtual void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; - virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; protected: MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const; + int FrameIndex) const override; MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, - MachineInstr *LoadMI) const; + MachineInstr *LoadMI) const override; /// \returns the smallest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. - virtual int getIndirectIndexBegin(const MachineFunction &MF) const; + int getIndirectIndexBegin(const MachineFunction &MF) const; /// \returns the largest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. 
- virtual int getIndirectIndexEnd(const MachineFunction &MF) const; + int getIndirectIndexEnd(const MachineFunction &MF) const; public: bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const; + const SmallVectorImpl<unsigned> &Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, bool UnfoldStore, - SmallVectorImpl<MachineInstr *> &NewMIs) const; + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr *> &NewMIs) const override; bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl<SDNode *> &NewNodes) const; + SmallVectorImpl<SDNode *> &NewNodes) const override; unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = 0) const; + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = nullptr) const override; bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, - unsigned NumLoads) const; + unsigned NumLoads) const override; - bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; void insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const; - bool isPredicated(const MachineInstr *MI) const; + MachineBasicBlock::iterator MI) const override; + bool isPredicated(const MachineInstr *MI) const override; bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const; + const SmallVectorImpl<MachineOperand> &Pred2) const override; bool DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const; - bool isPredicable(MachineInstr *MI) const; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + std::vector<MachineOperand> &Pred) const override; + bool isPredicable(MachineInstr *MI) const override; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; // Helper functions that check the opcode for status information bool isLoadInst(llvm::MachineInstr *MI) const; @@ -186,8 +188,7 @@ public: /// \brief Convert the AMDIL MachineInstr to a supported ISA /// MachineInstr - virtual void convertToISA(MachineInstr & MI, MachineFunction &MF, - DebugLoc DL) const; + void convertToISA(MachineInstr & MI, MachineFunction &MF, DebugLoc DL) const; /// \brief Build a MOV instruction. virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 69d8059..f96dbb4 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -92,3 +92,18 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when +// performing the mulitply. The result is a 32-bit value. 
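A plain C++ model of the 24-bit multiply semantics the comment above describes, where only the low 24 bits of each operand participate and the result is a full 32-bit value (helper names invented for the example; the signed variant assumes an arithmetic right shift):

#include <cassert>
#include <cstdint>

static uint32_t mulU24(uint32_t A, uint32_t B) {
  return (A & 0xffffffu) * (B & 0xffffffu);   // low 32 bits of the product
}

static int32_t mulI24(int32_t A, int32_t B) {
  // Sign-extend the low 24 bits of each operand, multiply in 64 bits,
  // and keep the low 32 bits of the result.
  auto SExt24 = [](int32_t V) {
    return static_cast<int32_t>(static_cast<uint32_t>(V) << 8) >> 8;
  };
  return static_cast<int32_t>(int64_t(SExt24(A)) * SExt24(B));
}

int main() {
  assert(mulU24(0xff000003u, 5) == 15);  // high 8 bits of the operand ignored
  assert(mulI24(-2, 3) == -6);           // sign of the low 24 bits preserved
  return 0;
}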
+def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, + [SDNPCommutative] +>; +def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, + [SDNPCommutative] +>; + +def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, + [] +>; +def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, + [] +>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 505fc81..80bdf5b 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -37,6 +37,18 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern> def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; +def u32imm : Operand<i32> { + let PrintMethod = "printU32ImmOperand"; +} + +def u16imm : Operand<i16> { + let PrintMethod = "printU16ImmOperand"; +} + +def u8imm : Operand<i8> { + let PrintMethod = "printU8ImmOperand"; +} + //===----------------------------------------------------------------------===// // PatLeafs for floating-point comparisons //===----------------------------------------------------------------------===// @@ -253,9 +265,6 @@ def FP_ONE : PatLeaf < [{return N->isExactlyValue(1.0);}] >; -def U24 : ComplexPattern<i32, 1, "SelectU24", [], []>; -def I24 : ComplexPattern<i32, 1, "SelectI24", [], []>; - let isCodeGenOnly = 1, isPseudo = 1 in { let usesCustomInserter = 1 in { @@ -414,6 +423,40 @@ class UMUL24Pattern <Instruction UMUL24> : Pat < >; */ +class IMad24Pat<Instruction Inst> : Pat < + (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +class UMad24Pat<Instruction Inst> : Pat < + (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +multiclass Expand24IBitOps<Instruction MulInst, Instruction AddInst> { + def _expand_imad24 : Pat < + (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_imul24 : Pat < + (AMDGPUmul_i24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> { + def _expand_umad24 : Pat < + (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_umul24 : Pat < + (AMDGPUmul_u24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + include "R600Instructions.td" include "R700Instructions.td" include "EvergreenInstructions.td" diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td index c6521d0..9ad5e72 100644 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -49,6 +49,10 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_bfi : 
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp index 2c9909f..b759495 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ b/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -17,6 +17,7 @@ #include "AMDGPUAsmPrinter.h" #include "InstPrinter/AMDGPUInstPrinter.h" #include "R600InstrInfo.h" +#include "SIInstrInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/Constants.h" @@ -31,16 +32,30 @@ using namespace llvm; -AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx): - Ctx(ctx) +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): + Ctx(ctx), ST(st) { } +enum AMDGPUMCInstLower::SISubtarget +AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned) const { + return AMDGPUMCInstLower::SI; +} + +unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const { + + int MCOpcode = AMDGPU::getMCOpcode(MIOpcode, + AMDGPUSubtargetToSISubtarget(ST.getGeneration())); + if (MCOpcode == -1) + MCOpcode = MIOpcode; + + return MCOpcode; +} + void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { - OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + OutMI.setOpcode(getMCOpcode(MI->getOpcode())); + for (const MachineOperand &MO : MI->explicit_operands()) { MCOperand MCOp; switch (MO.getType()) { default: @@ -67,7 +82,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { } void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { - AMDGPUMCInstLower MCInstLowering(OutContext); + AMDGPUMCInstLower MCInstLowering(OutContext, + MF->getTarget().getSubtarget<AMDGPUSubtarget>()); #ifdef _DEBUG StringRef Err; diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h index d7d538e..2b7f1e3 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.h +++ b/lib/Target/R600/AMDGPUMCInstLower.h @@ -13,16 +13,30 @@ namespace llvm { +class AMDGPUSubtarget; class MCInst; class MCContext; class MachineInstr; class AMDGPUMCInstLower { + // This must be kept in sync with the SISubtarget class in SIInstrInfo.td + enum SISubtarget { + SI = 0 + }; + MCContext &Ctx; + const AMDGPUSubtarget &ST; + + /// Convert a member of the AMDGPUSubtarget::Generation enum to the + /// SISubtarget enum. + enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) const; + + /// Get the MC opcode for this MachineInstr. + unsigned getMCOpcode(unsigned MIOpcode) const; public: - AMDGPUMCInstLower(MCContext &ctx); + AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); /// \brief Lower a MachineInstr to an MCInst void lower(const MachineInstr *MI, MCInst &OutMI) const; diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp index 8fbec4e..19927fa 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -27,10 +27,10 @@ AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm) // they are not supported at this time. 
//===----------------------------------------------------------------------===// -const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; +const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; -const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) - const { +const MCPhysReg* +AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return &CalleeSavedReg; } @@ -54,7 +54,7 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { AMDGPU::sub15 }; - assert (Channel < array_lengthof(SubRegs)); + assert(Channel < array_lengthof(SubRegs)); return SubRegs[Channel]; } diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index 688e1a0..a7cba0d 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -30,11 +30,11 @@ class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { TargetMachine &TM; - static const uint16_t CalleeSavedReg; + static const MCPhysReg CalleeSavedReg; AMDGPURegisterInfo(TargetMachine &tm); - virtual BitVector getReservedRegs(const MachineFunction &MF) const { + BitVector getReservedRegs(const MachineFunction &MF) const override { assert(!"Unimplemented"); return BitVector(); } @@ -43,11 +43,11 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { /// \returns The ISA reg class that is equivalent to \p RC. virtual const TargetRegisterClass * getISARegClass( const TargetRegisterClass * RC) const { - assert(!"Unimplemented"); return NULL; + assert(!"Unimplemented"); return nullptr; } virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { - assert(!"Unimplemented"); return NULL; + assert(!"Unimplemented"); return nullptr; } virtual unsigned getHWRegIndex(unsigned Reg) const { @@ -58,11 +58,11 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) unsigned getSubRegFromChannel(unsigned Channel) const; - const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const; + const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const; - unsigned getFrameRegister(const MachineFunction &MF) const; + RegScavenger *RS) const override; + unsigned getFrameRegister(const MachineFunction &MF) const override; unsigned getIndirectSubReg(unsigned IndirectIndex) const; diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index e77ab5e..f3b9932 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -16,6 +16,8 @@ using namespace llvm; +#define DEBUG_TYPE "amdgpu-subtarget" + #define GET_SUBTARGETINFO_ENUM #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR @@ -28,9 +30,6 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : // Default card StringRef GPU = CPU; Is64bit = false; - DefaultSize[0] = 64; - DefaultSize[1] = 1; - DefaultSize[2] = 1; HasVertexCache = false; TexVTXClauseSize = 0; Gen = AMDGPUSubtarget::R600; @@ -106,14 +105,6 @@ bool AMDGPUSubtarget::isTargetELF() const { return false; } -size_t -AMDGPUSubtarget::getDefaultSize(uint32_t dim) const { - if (dim > 2) { - return 1; - } else { - return DefaultSize[dim]; - } -} std::string AMDGPUSubtarget::getDeviceName() const { diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 8874d14..1b041d6 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -38,7 +38,6 @@ public: }; private: - size_t DefaultSize[3]; std::string DevName; bool Is64bit; bool Is32on64bit; @@ -60,7 +59,7 @@ public: AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } - virtual void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); bool is64bit() const; bool hasVertexCache() const; @@ -77,20 +76,28 @@ public: return hasBFE(); } + bool hasMulU24() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasMulI24() const { + return (getGeneration() >= SOUTHERN_ISLANDS || + hasCaymanISA()); + } + bool IsIRStructurizerEnabled() const; bool isIfCvtEnabled() const; unsigned getWavefrontSize() const; unsigned getStackEntrySize() const; bool hasCFAluBug() const; - virtual bool enableMachineScheduler() const { + bool enableMachineScheduler() const override { return getGeneration() <= NORTHERN_ISLANDS; } // Helper functions to simplify if statements bool isTargetELF() const; std::string getDeviceName() const; - virtual size_t getDefaultSize(uint32_t dim) const; bool dumpCode() const { return DumpCode; } bool r600ALUEncoding() const { return R600ALUInst; } diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index b11fce3..174fdca 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -42,7 +42,7 @@ extern "C" void LLVMInitializeR600Target() { } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, new R600SchedStrategy()); + return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>()); } static MachineSchedRegistry @@ -54,7 +54,7 @@ static std::string 
computeDataLayout(const AMDGPUSubtarget &ST) { if (ST.is64bit()) { // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64"; + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; } Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" @@ -103,20 +103,20 @@ public: return getTM<AMDGPUTargetMachine>(); } - virtual ScheduleDAGInstrs * - createMachineScheduler(MachineSchedContext *C) const { + ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) return createR600MachineScheduler(C); - return 0; + return nullptr; } - virtual bool addPreISel(); - virtual bool addInstSelector(); - virtual bool addPreRegAlloc(); - virtual bool addPostRegAlloc(); - virtual bool addPreSched2(); - virtual bool addPreEmitPass(); + bool addPreISel() override; + bool addInstSelector() override; + bool addPreRegAlloc() override; + bool addPostRegAlloc() override; + bool addPreSched2() override; + bool addPreEmitPass() override; }; } // End of anonymous namespace @@ -154,6 +154,7 @@ AMDGPUPassConfig::addPreISel() { bool AMDGPUPassConfig::addInstSelector() { addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + addPass(createSILowerI1CopiesPass()); return false; } diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index f942614..1287e13 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -20,7 +20,6 @@ #include "AMDGPUSubtarget.h" #include "AMDILIntrinsicInfo.h" #include "R600ISelLowering.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/IR/DataLayout.h" namespace llvm { @@ -31,8 +30,8 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { const DataLayout Layout; AMDGPUFrameLowering FrameLowering; AMDGPUIntrinsicInfo IntrinsicInfo; - OwningPtr<AMDGPUInstrInfo> InstrInfo; - OwningPtr<AMDGPUTargetLowering> TLInfo; + std::unique_ptr<AMDGPUInstrInfo> InstrInfo; + std::unique_ptr<AMDGPUTargetLowering> TLInfo; const InstrItineraryData *InstrItins; public: @@ -40,30 +39,32 @@ public: StringRef CPU, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~AMDGPUTargetMachine(); - virtual const AMDGPUFrameLowering *getFrameLowering() const { + const AMDGPUFrameLowering *getFrameLowering() const override { return &FrameLowering; } - virtual const AMDGPUIntrinsicInfo *getIntrinsicInfo() const { + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { return &IntrinsicInfo; } - virtual const AMDGPUInstrInfo *getInstrInfo() const { + const AMDGPUInstrInfo *getInstrInfo() const override { return InstrInfo.get(); } - virtual const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } - virtual const AMDGPURegisterInfo *getRegisterInfo() const { + const AMDGPUSubtarget *getSubtargetImpl() const override { + return &Subtarget; + } + const AMDGPURegisterInfo *getRegisterInfo() const override { return &InstrInfo->getRegisterInfo(); } - virtual AMDGPUTargetLowering *getTargetLowering() const { + AMDGPUTargetLowering *getTargetLowering() const override { return TLInfo.get(); } - virtual const InstrItineraryData *getInstrItineraryData() const { + const InstrItineraryData *getInstrItineraryData() const override { return InstrItins; } - virtual const DataLayout *getDataLayout() const { return &Layout; } - virtual TargetPassConfig 
*createPassConfig(PassManagerBase &PM); + const DataLayout *getDataLayout() const override { return &Layout; } + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; /// \brief Register R600 analysis passes with a pass manager. - virtual void addAnalysisPasses(PassManagerBase &PM); + void addAnalysisPasses(PassManagerBase &PM) override; }; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp index 51225eb..ea78f43 100644 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "AMDGPUtti" #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" #include "llvm/Analysis/LoopInfo.h" @@ -26,6 +25,8 @@ #include "llvm/Target/TargetLowering.h" using namespace llvm; +#define DEBUG_TYPE "AMDGPUtti" + // Declare the pass initialization routine locally as target-specific passes // don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. @@ -45,7 +46,7 @@ class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo { unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - AMDGPUTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { llvm_unreachable("This pass cannot be directly constructed"); } @@ -55,9 +56,9 @@ public: initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry()); } - virtual void initializePass() override { pushTTIStack(this); } + void initializePass() override { pushTTIStack(this); } - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const override { TargetTransformInfo::getAnalysisUsage(AU); } @@ -65,15 +66,16 @@ public: static char ID; /// Provide necessary pointer adjustments for the two base classes. - virtual void *getAdjustedAnalysisPointer(const void *ID) override { + void *getAdjustedAnalysisPointer(const void *ID) override { if (ID == &TargetTransformInfo::ID) return (TargetTransformInfo *)this; return this; } - virtual bool hasBranchDivergence() const override; + bool hasBranchDivergence() const override; - virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const; + void getUnrollingPreferences(Loop *L, + UnrollingPreferences &UP) const override; /// @} }; @@ -109,11 +111,11 @@ void AMDGPUTTI::getUnrollingPreferences(Loop *L, // require us to use indirect addressing, which is slow and prone to // compiler bugs. If this loop does an address calculation on an // alloca ptr, then we want to use a higher than normal loop unroll - // threshold. This will give SROA a better chance to eliminate these - // allocas. - // - // Don't use the maximum allowed value here as it will make some - // programs way too big. + // threshold. This will give SROA a better chance to eliminate these + // allocas. + // + // Don't use the maximum allowed value here as it will make some + // programs way too big. 
UP.Threshold = 500; } } diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp index 21ca560..f3a0391 100644 --- a/lib/Target/R600/AMDILCFGStructurizer.cpp +++ b/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -8,8 +8,6 @@ /// \file //==-----------------------------------------------------------------------===// -#define DEBUG_TYPE "structcfg" - #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "R600InstrInfo.h" @@ -34,6 +32,8 @@ using namespace llvm; +#define DEBUG_TYPE "structcfg" + #define DEFAULT_VEC_SLOTS 8 // TODO: move-begin. @@ -135,15 +135,15 @@ public: static char ID; AMDGPUCFGStructurizer() : - MachineFunctionPass(ID), TII(NULL), TRI(NULL) { + MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) { initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); } - const char *getPassName() const { + const char *getPassName() const override { return "AMDGPU Control Flow Graph structurizer Pass"; } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved<MachineFunctionAnalysis>(); AU.addRequired<MachineFunctionAnalysis>(); AU.addRequired<MachineDominatorTree>(); @@ -159,7 +159,7 @@ public: /// sure all loops have an exit block bool prepare(); - bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); TRI = &TII->getRegisterInfo(); DEBUG(MF.dump();); @@ -168,7 +168,7 @@ public: MLI = &getAnalysis<MachineLoopInfo>(); DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); MDT = &getAnalysis<MachineDominatorTree>(); - DEBUG(MDT->print(dbgs(), (const llvm::Module*)0);); + DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr);); PDT = &getAnalysis<MachinePostDominatorTree>(); DEBUG(PDT->print(dbgs());); prepare(); @@ -334,7 +334,7 @@ protected: MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); void recordSccnum(MachineBasicBlock *MBB, int SCCNum); void retireBlock(MachineBasicBlock *MBB); - void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = NULL); + void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&); /// This is work around solution for findNearestCommonDominator not avaiable @@ -361,7 +361,7 @@ MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) const { LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); if (It == LLInfoMap.end()) - return NULL; + return nullptr; return (*It).second; } @@ -632,7 +632,7 @@ MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( MachineInstr *MI = &*It; if (MI && (isCondBranch(MI) || isUncondBranch(MI))) return MI; - return NULL; + return nullptr; } MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( @@ -648,7 +648,7 @@ MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( break; } } - return NULL; + return nullptr; } MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { @@ -658,7 +658,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { if (instr->getOpcode() == AMDGPU::RETURN) return instr; } - return NULL; + return nullptr; } MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { @@ -668,7 +668,7 @@ MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { if (MI->getOpcode() == AMDGPU::CONTINUE) return MI; } - 
return NULL; + return nullptr; } bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { @@ -819,7 +819,7 @@ bool AMDGPUCFGStructurizer::run() { SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter = It; - MachineBasicBlock *SccBeginMBB = NULL; + MachineBasicBlock *SccBeginMBB = nullptr; int SccNumBlk = 0; // The number of active blocks, init to a // maximum possible number. int SccNumIter; // Number of iteration in this SCC. @@ -874,7 +874,7 @@ bool AMDGPUCFGStructurizer::run() { } if (ContNextScc) - SccBeginMBB = NULL; + SccBeginMBB = nullptr; } //while, "one iteration" over the function. MachineBasicBlock *EntryMBB = @@ -933,7 +933,7 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { MachineBasicBlock *MBB; for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd(); ++It, ++SccNum) { - std::vector<MachineBasicBlock *> &SccNext = *It; + const std::vector<MachineBasicBlock *> &SccNext = *It; for (std::vector<MachineBasicBlock *>::const_iterator blockIter = SccNext.begin(), blockEnd = SccNext.end(); blockIter != blockEnd; ++blockIter) { @@ -1026,7 +1026,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { // Triangle pattern, false is empty LandBlk = FalseMBB; - FalseMBB = NULL; + FalseMBB = nullptr; } else if (FalseMBB->succ_size() == 1 && *FalseMBB->succ_begin() == TrueMBB) { // Triangle pattern, true is empty @@ -1034,7 +1034,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { std::swap(TrueMBB, FalseMBB); reversePredicateSetter(MBB->end()); LandBlk = FalseMBB; - FalseMBB = NULL; + FalseMBB = nullptr; } else if (FalseMBB->succ_size() == 1 && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { LandBlk = *FalseMBB->succ_begin(); @@ -1075,13 +1075,11 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { int AMDGPUCFGStructurizer::loopendPatternMatch() { std::vector<MachineLoop *> NestedLoops; - for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); - It != E; ++It) { - df_iterator<MachineLoop *> LpIt = df_begin(*It), - LpE = df_end(*It); - for (; LpIt != LpE; ++LpIt) - NestedLoops.push_back(*LpIt); - } + for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); It != E; + ++It) + for (MachineLoop *ML : depth_first(*It)) + NestedLoops.push_back(ML); + if (NestedLoops.size() == 0) return 0; @@ -1244,7 +1242,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, DEBUG( dbgs() << " not working\n"; ); - DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : NULL; + DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr; } // walk down the postDomTree return Num; @@ -1723,11 +1721,11 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); if (!LoopHeader || !LoopLatch) - return NULL; + return nullptr; MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch); // Is LoopRep an infinite loop ? 
if (!BranchMI || !isUncondBranch(BranchMI)) - return NULL; + return nullptr; MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); FuncRep->push_back(DummyExitBlk); //insert to function @@ -1860,7 +1858,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); if (!Node1 || !Node2) - return NULL; + return nullptr; Node1 = Node1->getIDom(); while (Node1) { @@ -1869,7 +1867,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, Node1 = Node1->getIDom(); } - return NULL; + return nullptr; } MachineBasicBlock * diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index 0761ff4..7cea803 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -39,61 +39,55 @@ using namespace llvm; // TargetLowering Class Implementation Begins //===----------------------------------------------------------------------===// void AMDGPUTargetLowering::InitAMDILLowering() { - static const int types[] = { - (int)MVT::i8, - (int)MVT::i16, - (int)MVT::i32, - (int)MVT::f32, - (int)MVT::f64, - (int)MVT::i64, - (int)MVT::v2i8, - (int)MVT::v4i8, - (int)MVT::v2i16, - (int)MVT::v4i16, - (int)MVT::v4f32, - (int)MVT::v4i32, - (int)MVT::v2f32, - (int)MVT::v2i32, - (int)MVT::v2f64, - (int)MVT::v2i64 + static const MVT::SimpleValueType types[] = { + MVT::i8, + MVT::i16, + MVT::i32, + MVT::f32, + MVT::f64, + MVT::i64, + MVT::v2i8, + MVT::v4i8, + MVT::v2i16, + MVT::v4i16, + MVT::v4f32, + MVT::v4i32, + MVT::v2f32, + MVT::v2i32, + MVT::v2f64, + MVT::v2i64 }; - static const int IntTypes[] = { - (int)MVT::i8, - (int)MVT::i16, - (int)MVT::i32, - (int)MVT::i64 + static const MVT::SimpleValueType IntTypes[] = { + MVT::i8, + MVT::i16, + MVT::i32, + MVT::i64 }; - static const int FloatTypes[] = { - (int)MVT::f32, - (int)MVT::f64 + static const MVT::SimpleValueType FloatTypes[] = { + MVT::f32, + MVT::f64 }; - static const int VectorTypes[] = { - (int)MVT::v2i8, - (int)MVT::v4i8, - (int)MVT::v2i16, - (int)MVT::v4i16, - (int)MVT::v4f32, - (int)MVT::v4i32, - (int)MVT::v2f32, - (int)MVT::v2i32, - (int)MVT::v2f64, - (int)MVT::v2i64 + static const MVT::SimpleValueType VectorTypes[] = { + MVT::v2i8, + MVT::v4i8, + MVT::v2i16, + MVT::v4i16, + MVT::v4f32, + MVT::v4i32, + MVT::v2f32, + MVT::v2i32, + MVT::v2f64, + MVT::v2i64 }; - const size_t NumTypes = array_lengthof(types); - const size_t NumFloatTypes = array_lengthof(FloatTypes); - const size_t NumIntTypes = array_lengthof(IntTypes); - const size_t NumVectorTypes = array_lengthof(VectorTypes); const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>(); // These are the current register classes that are // supported - for (unsigned int x = 0; x < NumTypes; ++x) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x]; - + for (MVT VT : types) { setOperationAction(ISD::SUBE, VT, Expand); setOperationAction(ISD::SUBC, VT, Expand); setOperationAction(ISD::ADDE, VT, Expand); @@ -109,9 +103,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::SDIV, VT, Custom); } } - for (unsigned int x = 0; x < NumFloatTypes; ++x) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x]; - + for (MVT VT : FloatTypes) { // IL does not have these operations for floating point types setOperationAction(ISD::FP_ROUND_INREG, VT, Expand); setOperationAction(ISD::SETOLT, VT, Expand); @@ -124,9 +116,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() { 
setOperationAction(ISD::SETULE, VT, Expand); } - for (unsigned int x = 0; x < NumIntTypes; ++x) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x]; - + for (MVT VT : IntTypes) { // GPU also does not have divrem function for signed or unsigned setOperationAction(ISD::SDIVREM, VT, Expand); @@ -142,9 +132,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::CTLZ, VT, Expand); } - for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii]; - + for (MVT VT : VectorTypes) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDILIntrinsicInfo.cpp index 762ee39..fab4a3b 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.cpp +++ b/lib/Target/R600/AMDILIntrinsicInfo.cpp @@ -38,7 +38,7 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, }; if (IntrID < Intrinsic::num_intrinsics) { - return 0; + return nullptr; } assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDILIntrinsicInfo.h index 35559e2..924275a 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.h +++ b/lib/Target/R600/AMDILIntrinsicInfo.h @@ -34,13 +34,13 @@ enum ID { class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(TargetMachine *tm); - std::string getName(unsigned int IntrId, Type **Tys = 0, - unsigned int numTys = 0) const; - unsigned int lookupName(const char *Name, unsigned int Len) const; - bool isOverloaded(unsigned int IID) const; + std::string getName(unsigned int IntrId, Type **Tys = nullptr, + unsigned int numTys = 0) const override; + unsigned int lookupName(const char *Name, unsigned int Len) const override; + bool isOverloaded(unsigned int IID) const override; Function *getDeclaration(Module *M, unsigned int ID, - Type **Tys = 0, - unsigned int numTys = 0) const; + Type **Tys = nullptr, + unsigned int numTys = 0) const override; }; } // end namespace llvm diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td index 658deb5..4a3e02e 100644 --- a/lib/Target/R600/AMDILIntrinsics.td +++ b/lib/Target/R600/AMDILIntrinsics.td @@ -92,10 +92,6 @@ let TargetPrefix = "AMDIL", isTarget = 1 in { BinaryIntInt; def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, BinaryIntInt; - def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">, - BinaryIntInt; - def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">, - BinaryIntInt; def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">, BinaryIntInt; def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 93a5117..3c6fa5a 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -45,6 +45,7 @@ add_llvm_target(R600CodeGen SIInstrInfo.cpp SIISelLowering.cpp SILowerControlFlow.cpp + SILowerI1Copies.cpp SIMachineFunctionInfo.cpp SIRegisterInfo.cpp SITypeRewriter.cpp diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td index acd7bde..2630345 100644 --- a/lib/Target/R600/CaymanInstructions.td +++ b/lib/Target/R600/CaymanInstructions.td @@ -21,12 +21,14 @@ def isCayman : Predicate<"Subtarget.hasCaymanISA()">; let Predicates = [isCayman] in { def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", - [(set i32:$dst, 
(add (mul I24:$src0, I24:$src1), i32:$src2))], VecALU + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU >; def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", - [(set i32:$dst, (mul I24:$src0, I24:$src1))], VecALU + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU >; +def : IMad24Pat<MULADD_INT24_cm>; + let isVector = 1 in { def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; @@ -47,6 +49,7 @@ def COS_cm : COS_Common<0x8E>; def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; +defm : Expand24UBitOps<MULLO_UINT_cm, ADD_INT>; // RECIP_UINT emulation for Cayman // The multiplication scales from [0,1] to the unsigned integer range diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td index 6430ca6..2065441 100644 --- a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -75,6 +75,8 @@ def COS_eg : COS_Common<0x8E>; def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; +defm : Expand24IBitOps<MULLO_INT_eg, ADD_INT>; + //===----------------------------------------------------------------------===// // Memory read/write instructions //===----------------------------------------------------------------------===// @@ -273,7 +275,7 @@ def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", VecALU >; -def BFE_INT_eg : R600_3OP <0x4, "BFE_INT", +def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))], VecALU >; @@ -286,6 +288,13 @@ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", VecALU >; +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>; +def : Pat<(i32 (sext_inreg i32:$src, i8)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>; +def : Pat<(i32 (sext_inreg i32:$src, i16)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; + defm : BFIPatterns <BFI_INT_eg>; def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", @@ -294,8 +303,11 @@ def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", >; def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", - [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))], VecALU + [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU >; + +def : UMad24Pat<MULADD_UINT24_eg>; + def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; def : ROTRPattern <BIT_ALIGN_INT_eg>; def MULADD_eg : MULADD_Common<0x14>; @@ -309,7 +321,7 @@ def CNDGE_eg : CNDGE_Common<0x1B>; def MUL_LIT_eg : MUL_LIT_Common<0x1F>; def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", - [(set i32:$dst, (mul U24:$src0, U24:$src1))], VecALU + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU >; def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 7105879..11ae091 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -12,6 +12,8 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; @@ -23,6 +25,21 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, printAnnotation(OS, Annot); } +void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { 
+ O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); +} + void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { switch (reg) { case AMDGPU::VCC: @@ -41,43 +58,78 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { break; } - // It's seems there's no way to use SIRegisterInfo here, and dealing with the - // giant enum of all the different shifted sets of registers is pretty - // unmanagable, so parse the name and reformat it to be prettier. - StringRef Name(getRegisterName(reg)); - - std::pair<StringRef, StringRef> Split = Name.split('_'); - StringRef SubRegName = Split.first; - StringRef Rest = Split.second; + char Type; + unsigned NumRegs; + + if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 3; + } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 16; + } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 16; + } else { + O << getRegisterName(reg); + return; + } - if (SubRegName.size() <= 4) { // Must at least be as long as "SGPR"/"VGPR". - O << Name; + // The low 8 bits encoding value is the register index, for both VGPRs and + // SGPRs. 
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + if (NumRegs == 1) { + O << Type << RegIdx; return; } - unsigned RegIndex; - StringRef RegIndexStr = SubRegName.drop_front(4); + O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; +} - if (RegIndexStr.getAsInteger(10, RegIndex)) { - O << Name; +void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) { + int32_t SImm = static_cast<int32_t>(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; return; } - if (SubRegName.front() == 'V') - O << 'v'; - else if (SubRegName.front() == 'S') - O << 's'; - else { - O << Name; + if (Imm == FloatToBits(1.0f) || + Imm == FloatToBits(-1.0f) || + Imm == FloatToBits(0.5f) || + Imm == FloatToBits(-0.5f) || + Imm == FloatToBits(2.0f) || + Imm == FloatToBits(-2.0f) || + Imm == FloatToBits(4.0f) || + Imm == FloatToBits(-4.0f)) { + O << BitsToFloat(Imm); return; } - if (Rest.empty()) // Only 1 32-bit register - O << RegIndex; - else { - unsigned NumReg = Rest.count('_') + 2; - O << '[' << RegIndex << ':' << (RegIndex + NumReg - 1) << ']'; - } + O << formatHex(static_cast<uint64_t>(Imm)); } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, @@ -95,7 +147,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, break; } } else if (Op.isImm()) { - O << Op.getImm(); + printImmediate(Op.getImm(), O); } else if (Op.isFPImm()) { O << Op.getFPImm(); } else if (Op.isExpr()) { @@ -106,6 +158,18 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); + if (InputModifiers & 0x1) + O << "-"; + if (InputModifiers & 0x2) + O << "|"; + printOperand(MI, OpNo + 1, O); + if (InputModifiers & 0x2) + O << "|"; +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNum).getImm(); diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 1d24680..6ca7170 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -29,13 +29,18 @@ public: void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; private: - static void printRegOperand(unsigned RegNo, raw_ostream &O); - static void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRegOperand(unsigned RegNo, raw_ostream &O); + void printImmediate(uint32_t Imm, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); - static void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm, StringRef Default = ""); static void printAbs(const MCInst *MI, 
unsigned OpNo, raw_ostream &O); diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp index a6bb59f..489cec7 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -23,8 +23,8 @@ namespace { class AMDGPUMCObjectWriter : public MCObjectWriter { public: AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { } - virtual void ExecutePostLayoutBinding(MCAssembler &Asm, - const MCAsmLayout &Layout) { + void ExecutePostLayoutBinding(MCAssembler &Asm, + const MCAsmLayout &Layout) override { //XXX: Implement if necessary. } void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, @@ -34,7 +34,7 @@ public: assert(!"Not implemented"); } - virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout); + void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; }; @@ -43,19 +43,19 @@ public: AMDGPUAsmBackend(const Target &T) : MCAsmBackend() {} - virtual unsigned getNumFixupKinds() const { return 0; }; - virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const; - virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { + unsigned getNumFixupKinds() const override { return 0; }; + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { return false; } - virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const { + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { assert(!"Not implemented"); } - virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; } - virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const { + bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override { return true; } }; @@ -88,7 +88,7 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { public: ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } - MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { return createAMDGPUELFObjectWriter(OS); } }; diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp index aee9bd1..78bbe0a 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -35,7 +35,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() { Data16bitsDirective = ".short\t"; Data32bitsDirective = ".long\t"; Data64bitsDirective = ".quad\t"; - GPRel32Directive = 0; + GPRel32Directive = nullptr; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; @@ -58,5 +58,5 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() { const MCSection* AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const { - return 0; + return nullptr; } diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h index 22afd63..59aebec 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -22,7 +22,7 @@ class StringRef; class 
AMDGPUMCAsmInfo : public MCAsmInfo { public: explicit AMDGPUMCAsmInfo(StringRef &TT); - const MCSection* getNonexecutableStackSection(MCContext &CTX) const; + const MCSection* getNonexecutableStackSection(MCContext &CTX) const override; }; } // namespace llvm #endif // AMDGPUMCASMINFO_H diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 6592b0e..38a2956 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -24,6 +24,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_MC_DESC #include "AMDGPUGenInstrInfo.inc" @@ -33,8 +35,6 @@ #define GET_REGINFO_MC_DESC #include "AMDGPUGenRegisterInfo.inc" -using namespace llvm; - static MCInstrInfo *createAMDGPUMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitAMDGPUMCInstrInfo(X); diff --git a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt index b1beab0..74b8ca0 100644 --- a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; +;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt -------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -19,5 +19,5 @@ type = Library name = R600Desc parent = R600 -required_libraries = R600AsmPrinter R600Info MC +required_libraries = MC R600AsmPrinter R600Info Support add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 286c7d1..5e7cefe 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -41,14 +41,14 @@ public: : MCII(mcii), MRI(mri) { } /// \brief Encode the instruction and write it to the OS. - virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI) const override; /// \returns the encoding for an MCOperand. - virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; private: void EmitByte(unsigned int byte, raw_ostream &OS) const; diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index f42e978..ee02111 100644 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -54,14 +54,14 @@ public: ~SIMCCodeEmitter() { } /// \brief Encode the instruction and write it to the OS. - virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI) const override; /// \returns the encoding for an MCOperand. 
- virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; }; } // End anonymous namespace diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index fde4481..ce17d7c 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -106,3 +106,5 @@ def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>; def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>; def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>; + +def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>; diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp index 3d9015c..92bf0df 100644 --- a/lib/Target/R600/R600ClauseMergePass.cpp +++ b/lib/Target/R600/R600ClauseMergePass.cpp @@ -13,7 +13,6 @@ /// It needs to be called after IfCvt for best results. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "r600mergeclause" #include "AMDGPU.h" #include "R600Defines.h" #include "R600InstrInfo.h" @@ -27,6 +26,8 @@ using namespace llvm; +#define DEBUG_TYPE "r600mergeclause" + namespace { static bool isCFAlu(const MachineInstr *MI) { @@ -62,9 +63,9 @@ private: public: R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const; + const char *getPassName() const override; }; char R600ClauseMergePass::ID = 0; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index f74bef3..d255e96 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -12,7 +12,6 @@ /// computing their address on the fly ; it also sets STACK_SIZE info. 
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "r600cf" #include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "R600Defines.h" @@ -26,6 +25,8 @@ using namespace llvm; +#define DEBUG_TYPE "r600cf" + namespace { struct CFStack { @@ -468,13 +469,13 @@ private: public: R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), - TII (0), TRI(0), + TII (nullptr), TRI(nullptr), ST(tm.getSubtarget<AMDGPUSubtarget>()) { const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>(); MaxFetchInst = ST.getTexVTXClauseSize(); } - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo()); R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); @@ -501,13 +502,13 @@ public: DEBUG(dbgs() << CfCount << ":"; I->dump();); FetchClauses.push_back(MakeFetchClause(MBB, I)); CfCount++; - LastAlu.back() = 0; + LastAlu.back() = nullptr; continue; } MachineBasicBlock::iterator MI = I; if (MI->getOpcode() != AMDGPU::ENDIF) - LastAlu.back() = 0; + LastAlu.back() = nullptr; if (MI->getOpcode() == AMDGPU::CF_ALU) LastAlu.back() = MI; I++; @@ -558,7 +559,7 @@ public: break; } case AMDGPU::IF_PREDICATE_SET: { - LastAlu.push_back(0); + LastAlu.push_back(nullptr); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP)) .addImm(0) @@ -665,7 +666,7 @@ public: return false; } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Control Flow Finalizer Pass"; } }; diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index 5bd793a..38afebe 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -291,12 +291,12 @@ private: public: static char ID; - R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(0), Address(0) { + R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); @@ -315,7 +315,7 @@ public: return false; } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Emit Clause Markers Pass"; } }; diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index ca1189d..732b06d 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -38,11 +38,11 @@ private: public: R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), - TII(0) { } + TII(nullptr) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "R600 Expand special instructions pass"; } }; diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 6405a82..d6c6830 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -82,9 +82,31 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : 
setOperationAction(ISD::SELECT, MVT::i32, Expand); setOperationAction(ISD::SELECT, MVT::f32, Expand); setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v2f32, Expand); setOperationAction(ISD::SELECT, MVT::v4i32, Expand); - setOperationAction(ISD::SELECT, MVT::v4f32, Expand); + + // Expand sign extension of vectors + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); + // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, MVT::i32, Custom); @@ -117,6 +139,11 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + // These should be replaced by UDVIREM, but it does not happen automatically + // during Type Legalization + setOperationAction(ISD::UDIV, MVT::i64, Custom); + setOperationAction(ISD::UREM, MVT::i64, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setBooleanContents(ZeroOrNegativeOneBooleanContent); @@ -538,8 +565,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(2, MVT::i32), // SWZ_Z DAG.getConstant(3, MVT::i32) // SWZ_W }; - return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), - Args, 8); + return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args); } // default for switch(IntrinsicID) @@ -689,7 +715,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const Op.getOperand(9), Op.getOperand(10) }; - return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19); + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); } case AMDGPUIntrinsic::AMDGPU_dp4: { SDValue Args[8] = { @@ -710,7 +736,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), DAG.getConstant(3, MVT::i32)) }; - return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8); + return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); } case Intrinsic::r600_read_ngroups_x: @@ -960,13 +986,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); } - - // Possible Min/Max pattern - SDValue MinMax = LowerMinMax(Op, DAG); - if (MinMax.getNode()) { - return MinMax; - } - // If we make it this for it means we have no native instructions to handle // this SELECT_CC, so we must lower it. 
SDValue HWTrue, HWFalse; @@ -1088,10 +1107,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(0, MVT::i32), Mask }; - SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4); + SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); SDValue Args[3] = { Chain, Input, DWordAddr }; return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, - Op->getVTList(), Args, 3, MemVT, + Op->getVTList(), Args, MemVT, StoreNode->getMemOperand()); } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && Value.getValueType().bitsGE(MVT::i32)) { @@ -1131,7 +1150,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (ValueVT.isVector()) { unsigned NumElemVT = ValueVT.getVectorNumElements(); EVT ElemVT = ValueVT.getVectorElementType(); - SDValue Stores[4]; + SmallVector<SDValue, 4> Stores(NumElemVT); assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " "vector width in load"); @@ -1148,7 +1167,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { Chain, Elem, Ptr, DAG.getTargetConstant(Channel, MVT::i32)); } - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); } else { if (ValueVT == MVT::i8) { Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); @@ -1212,10 +1231,11 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); if (Ret.getNode()) { - SDValue Ops[2]; - Ops[0] = Ret; - Ops[1] = Chain; - return DAG.getMergeValues(Ops, 2, DL); + SDValue Ops[2] = { + Ret, + Chain + }; + return DAG.getMergeValues(Ops, DL); } @@ -1224,7 +1244,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const SplitVectorLoad(Op, DAG), Chain }; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); @@ -1232,8 +1252,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { SDValue Result; - if (isa<ConstantExpr>(LoadNode->getSrcValue()) || - isa<Constant>(LoadNode->getSrcValue()) || + if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) || + isa<Constant>(LoadNode->getMemOperand()->getValue()) || isa<ConstantSDNode>(Ptr)) { SDValue Slots[4]; for (unsigned i = 0; i < 4; i++) { @@ -1252,7 +1272,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const NewVT = VT; NumElements = VT.getVectorNumElements(); } - Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements); + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, + makeArrayRef(Slots, NumElements)); } else { // non-constant ptr can't be folded, keeps it as a v4f32 load Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, @@ -1268,10 +1289,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const } SDValue MergedValues[2] = { - Result, - Chain + Result, + Chain }; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } // For most operations returning SDValue() will result in the node being @@ -1295,7 +1316,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount); SDValue MergedValues[2] = { Sra, 
Chain }; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { @@ -1332,7 +1353,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const Loads[i] = DAG.getUNDEF(ElemVT); } EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); - LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4); + LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); } else { LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, Chain, Ptr, @@ -1340,11 +1361,12 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const Op.getOperand(2)); } - SDValue Ops[2]; - Ops[0] = LoweredLoad; - Ops[1] = Chain; + SDValue Ops[2] = { + LoweredLoad, + Chain + }; - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } /// XXX Only kernel functions are supported, so we can assume for now that @@ -1365,8 +1387,7 @@ SDValue R600TargetLowering::LowerFormalArguments( SmallVector<ISD::InputArg, 8> LocalIns; - getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, - LocalIns); + getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); AnalyzeFormalArguments(CCInfo, LocalIns); @@ -1392,32 +1413,38 @@ SDValue R600TargetLowering::LowerFormalArguments( // The first 36 bytes of the input buffer contains information about // thread group and global sizes. - SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, + + // FIXME: This should really check the extload type, but the handling of + // extload vecto parameters seems to be broken. + //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + ISD::LoadExtType Ext = ISD::SEXTLOAD; + SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain, DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32), MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, false, false, 4); - // 4 is the preferred alignment for - // the CONSTANT memory space. + + // 4 is the preferred alignment for the CONSTANT memory space. 
InVals.push_back(Arg); } return Chain; } EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) return MVT::i32; + if (!VT.isVector()) + return MVT::i32; return VT.changeVectorElementTypeToInteger(); } -static SDValue -CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry, - DenseMap<unsigned, unsigned> &RemapSwizzle) { +static SDValue CompactSwizzlableVector( + SelectionDAG &DAG, SDValue VectorEntry, + DenseMap<unsigned, unsigned> &RemapSwizzle) { assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); assert(RemapSwizzle.empty()); SDValue NewBldVec[4] = { - VectorEntry.getOperand(0), - VectorEntry.getOperand(1), - VectorEntry.getOperand(2), - VectorEntry.getOperand(3) + VectorEntry.getOperand(0), + VectorEntry.getOperand(1), + VectorEntry.getOperand(2), + VectorEntry.getOperand(3) }; for (unsigned i = 0; i < 4; i++) { @@ -1448,7 +1475,7 @@ CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry, } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec, 4); + VectorEntry.getValueType(), NewBldVec); } static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, @@ -1486,7 +1513,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec, 4); + VectorEntry.getValueType(), NewBldVec); } @@ -1524,6 +1551,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) case ISD::FP_ROUND: { SDValue Arg = N->getOperand(0); @@ -1613,8 +1641,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } // Return the new vector - return DAG.getNode(ISD::BUILD_VECTOR, dl, - VT, &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } // Extract_vec (Build_vector) generated by custom lowering @@ -1638,6 +1665,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } case ISD::SELECT_CC: { + // Try common optimizations + SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + if (Ret.getNode()) + return Ret; + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> // selectcc x, y, a, b, inv(cc) // @@ -1697,7 +1729,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, }; SDLoc DL(N); NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG); - return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8); + return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); } case AMDGPUISD::TEXTURE_FETCH: { SDValue Arg = N->getOperand(1); @@ -1727,10 +1759,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, }; NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG); return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(), - NewArgs, 19); + NewArgs); } } - return SDValue(); + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } static bool @@ -1779,8 +1812,7 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) }; std::vector<unsigned> Consts; - for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) { - int OtherSrcIdx = SrcIndices[i]; + for (int OtherSrcIdx : SrcIndices) { int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); if (OtherSrcIdx < 0 || OtherSelIdx < 0) continue; @@ -1791,14 +1823,14 @@ 
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) { if (Reg->getReg() == AMDGPU::ALU_CONST) { - ConstantSDNode *Cst = dyn_cast<ConstantSDNode>( - ParentNode->getOperand(OtherSelIdx)); + ConstantSDNode *Cst + = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx)); Consts.push_back(Cst->getZExtValue()); } } } - ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset); + ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset); Consts.push_back(Cst->getZExtValue()); if (!TII->fitsConstReadLimitations(Consts)) { return false; diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index 22ef728..a8a464f 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -24,21 +24,21 @@ class R600InstrInfo; class R600TargetLowering : public AMDGPUTargetLowering { public: R600TargetLowering(TargetMachine &TM); - virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock * BB) const; - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - virtual void ReplaceNodeResults(SDNode * N, - SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) const override; - virtual SDValue LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; - virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock * BB) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + SDValue LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + EVT getSetCCResultType(LLVMContext &, EVT VT) const override; private: unsigned Gen; /// Each OpenCL kernel has nine implicit parameters that are stored in the @@ -66,7 +66,7 @@ private: void getStackAddress(unsigned StackWidth, unsigned ElemIdx, unsigned &Channel, unsigned &PtrIncr) const; bool isZero(SDValue Op) const; - virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; }; } // End namespace llvm; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 0281dd0..b0d9ae3 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -23,11 +23,11 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +using namespace llvm; + #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenDFAPacketizer.inc" -using namespace llvm; - R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm) : AMDGPUInstrInfo(tm), RI(tm), @@ -677,7 +677,7 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB, return MI; } - return NULL; + return nullptr; } static @@ -797,7 +797,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, DebugLoc DL) const { assert(TBB && "InsertBranch must not be told to insert a fallthrough"); 
- if (FBB == 0) { + if (!FBB) { if (Cond.empty()) { BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); return 1; diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index d5ff4de..b5304a0 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -50,13 +50,13 @@ namespace llvm { explicit R600InstrInfo(AMDGPUTargetMachine &tm); - const R600RegisterInfo &getRegisterInfo() const; - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; + const R600RegisterInfo &getRegisterInfo() const override; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const; + MachineBasicBlock::iterator MBBI) const override; bool isTrig(const MachineInstr &MI) const; bool isPlaceHolderOpcode(unsigned opcode) const; @@ -142,79 +142,79 @@ namespace llvm { /// instruction slots within an instruction group. bool isVector(const MachineInstr &MI) const; - virtual unsigned getIEQOpcode() const; - virtual bool isMov(unsigned Opcode) const; + unsigned getIEQOpcode() const override; + bool isMov(unsigned Opcode) const override; DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM, - const ScheduleDAG *DAG) const; + const ScheduleDAG *DAG) const override; - bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const; + SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - bool isPredicated(const MachineInstr *MI) const; + bool isPredicated(const MachineInstr *MI) const override; - bool isPredicable(MachineInstr *MI) const; + bool isPredicable(MachineInstr *MI) const override; bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) const; + const BranchProbability &Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const ; + const BranchProbability &Probability) const override ; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const; + const BranchProbability &Probability) const override; bool DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const; + std::vector<MachineOperand> &Pred) const override; bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const; + const SmallVectorImpl<MachineOperand> &Pred2) const override; bool 
isProfitableToUnpredicate(MachineBasicBlock &TMBB, - MachineBasicBlock &FMBB) const; + MachineBasicBlock &FMBB) const override; bool PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const; + const SmallVectorImpl<MachineOperand> &Pred) const override; - unsigned int getPredicationCost(const MachineInstr *) const; + unsigned int getPredicationCost(const MachineInstr *) const override; unsigned int getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr *MI, - unsigned *PredCost = 0) const; + unsigned *PredCost = nullptr) const override; - virtual int getInstrLatency(const InstrItineraryData *ItinData, - SDNode *Node) const { return 1;} + int getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const override { return 1;} /// \brief Reserve the registers that may be accesed using indirect addressing. void reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const; - virtual unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const; + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; - virtual const TargetRegisterClass *getIndirectAddrRegClass() const; + const TargetRegisterClass *getIndirectAddrRegClass() const override; - virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const; + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; - virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const; + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; unsigned getMaxAlusPerClause() const; @@ -244,7 +244,7 @@ namespace llvm { MachineInstr *buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const; + unsigned DstReg, unsigned SrcReg) const override; /// \brief Get the index of Op in the MachineInstr. 
/// diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index d2075c0..590fde2 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1625,6 +1625,12 @@ def : DwordAddrPat <i32, R600_Reg32>; } // End isR600toCayman Predicate +let Predicates = [isR600] in { +// Intrinsic patterns +defm : Expand24IBitOps<MULLO_INT_r600, ADD_INT>; +defm : Expand24UBitOps<MULLO_UINT_r600, ADD_INT>; +} // End isR600 + def getLDSNoRetOp : InstrMapping { let FilterClass = "R600_LDS_1A1D"; let RowFields = ["BaseOp"]; diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h index c1bec0a..b0ae22e 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ b/lib/Target/R600/R600MachineFunctionInfo.h @@ -21,7 +21,7 @@ namespace llvm { class R600MachineFunctionInfo : public AMDGPUMachineFunction { - virtual void anchor(); + void anchor() override; public: R600MachineFunctionInfo(const MachineFunction &MF); SmallVector<unsigned, 4> LiveOuts; diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index d3ffb50..d1655d1 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -12,8 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "misched" - #include "R600MachineScheduler.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -23,6 +21,8 @@ using namespace llvm; +#define DEBUG_TYPE "misched" + void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); DAG = static_cast<ScheduleDAGMILive*>(dag); @@ -56,7 +56,7 @@ unsigned getWFCountLimitedByGPR(unsigned GPRCount) { } SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { - SUnit *SU = 0; + SUnit *SU = nullptr; NextInstKind = IDOther; IsTopNode = false; @@ -316,7 +316,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) { SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { if (Q.empty()) - return NULL; + return nullptr; for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend(); It != E; ++It) { SUnit *SU = *It; @@ -331,7 +331,7 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { InstructionsGroupCandidate.pop_back(); } } - return NULL; + return nullptr; } void R600SchedStrategy::LoadAlu() { @@ -448,11 +448,11 @@ SUnit* R600SchedStrategy::pickAlu() { } PrepareNextSlot(); } - return NULL; + return nullptr; } SUnit* R600SchedStrategy::pickOther(int QID) { - SUnit *SU = 0; + SUnit *SU = nullptr; std::vector<SUnit *> &AQ = Available[QID]; if (AQ.empty()) { diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h index b909ff7..fd475af 100644 --- a/lib/Target/R600/R600MachineScheduler.h +++ b/lib/Target/R600/R600MachineScheduler.h @@ -68,17 +68,16 @@ class R600SchedStrategy : public MachineSchedStrategy { public: R600SchedStrategy() : - DAG(0), TII(0), TRI(0), MRI(0) { + DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) { } - virtual ~R600SchedStrategy() { - } + virtual ~R600SchedStrategy() {} - virtual void initialize(ScheduleDAGMI *dag); - virtual SUnit *pickNode(bool &IsTopNode); - virtual void schedNode(SUnit *SU, bool IsTopNode); - virtual void releaseTopNode(SUnit *SU); - virtual void releaseBottomNode(SUnit *SU); + void initialize(ScheduleDAGMI *dag) override; + SUnit *pickNode(bool &IsTopNode) 
override; + void schedNode(SUnit *SU, bool IsTopNode) override; + void releaseTopNode(SUnit *SU) override; + void releaseBottomNode(SUnit *SU) override; private: std::vector<MachineInstr *> InstructionsGroupCandidate; diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp index 767e5e3..2314136 100644 --- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp @@ -27,7 +27,6 @@ /// to reduce MOV count. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "vec-merger" #include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "R600InstrInfo.h" @@ -42,6 +41,8 @@ using namespace llvm; +#define DEBUG_TYPE "vec-merger" + namespace { static bool @@ -107,9 +108,9 @@ private: public: static char ID; R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), - TII(0) { } + TII(nullptr) { } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); @@ -118,11 +119,11 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Vector Registers Merge Pass"; } - bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; }; char R600VectorRegMerger::ID = 0; diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index b7b7610..c2f6c03 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "packets" #include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "R600InstrInfo.h" @@ -28,6 +27,8 @@ using namespace llvm; +#define DEBUG_TYPE "packets" + namespace { class R600Packetizer : public MachineFunctionPass { @@ -36,7 +37,7 @@ public: static char ID; R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); @@ -45,11 +46,11 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Packetizer"; } - bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; }; char R600Packetizer::ID = 0; @@ -155,18 +156,19 @@ public: } // initPacketizerState - initialize some internal flags. - void initPacketizerState() { + void initPacketizerState() override { ConsideredInstUsesAlreadyWrittenVectorElement = false; } // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) { + bool ignorePseudoInstruction(MachineInstr *MI, + MachineBasicBlock *MBB) override { return false; } // isSoloInstruction - return true if instruction MI can not be packetized // with any other instruction, which means that MI itself is a packet. 
- bool isSoloInstruction(MachineInstr *MI) { + bool isSoloInstruction(MachineInstr *MI) override { if (TII->isVector(*MI)) return true; if (!TII->isALUInstr(MI->getOpcode())) @@ -182,7 +184,7 @@ public: // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ // together. - bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { + bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); if (getSlot(MII) == getSlot(MIJ)) ConsideredInstUsesAlreadyWrittenVectorElement = true; @@ -219,7 +221,9 @@ public: // isLegalToPruneDependencies - Is it legal to prune dependece between SUI // and SUJ. - bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {return false;} + bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { + return false; + } void setIsLastBit(MachineInstr *MI, unsigned Bit) const { unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); @@ -288,7 +292,7 @@ public: return true; } - MachineBasicBlock::iterator addToPacket(MachineInstr *MI) { + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { MachineBasicBlock::iterator FirstInBundle = CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front(); const DenseMap<unsigned, unsigned> &PV = diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h index c74c49e..52e1a4b 100644 --- a/lib/Target/R600/R600RegisterInfo.h +++ b/lib/Target/R600/R600RegisterInfo.h @@ -28,27 +28,28 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { R600RegisterInfo(AMDGPUTargetMachine &tm); - virtual BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; /// \param RC is an AMDIL reg class. /// /// \returns the R600 reg class that is equivalent to \p RC. - virtual const TargetRegisterClass *getISARegClass( - const TargetRegisterClass *RC) const; + const TargetRegisterClass *getISARegClass( + const TargetRegisterClass *RC) const override; /// \brief get the HW encoding for a register's channel. unsigned getHWRegChan(unsigned reg) const; - virtual unsigned getHWRegIndex(unsigned Reg) const; + unsigned getHWRegIndex(unsigned Reg) const override; /// \brief get the register class of the specified type to use in the /// CFGStructurizer - virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const; + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const; + const RegClassWeight & + getRegClassWeight(const TargetRegisterClass *RC) const override; // \returns true if \p Reg can be defined in one ALU caluse and used in another. 
- virtual bool isPhysRegLiveAcrossClauses(unsigned Reg) const; + bool isPhysRegLiveAcrossClauses(unsigned Reg) const; }; } // End namespace llvm diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp index 9d24404..419ec8b 100644 --- a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp +++ b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp @@ -209,7 +209,7 @@ public: FunctionPass(ID) { } - virtual bool doInitialization(Module &M) { + bool doInitialization(Module &M) override { LLVMContext &Ctx = M.getContext(); Mod = &M; FloatType = Type::getFloatTy(Ctx); @@ -245,16 +245,16 @@ public: return false; } - virtual bool runOnFunction(Function &F) { + bool runOnFunction(Function &F) override { visit(F); return false; } - virtual const char *getPassName() const { + const char *getPassName() const override { return "R600 Texture Intrinsics Replacer"; } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { } void visitCallInst(CallInst &I) { diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp index f9214a8..d6e4451 100644 --- a/lib/Target/R600/SIAnnotateControlFlow.cpp +++ b/lib/Target/R600/SIAnnotateControlFlow.cpp @@ -12,8 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "si-annotate-control-flow" - #include "AMDGPU.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/IR/Constants.h" @@ -26,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "si-annotate-control-flow" + namespace { // Complex types used in this pass @@ -91,15 +91,15 @@ public: SIAnnotateControlFlow(): FunctionPass(ID) { } - virtual bool doInitialization(Module &M); + bool doInitialization(Module &M) override; - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "SI annotate control flow"; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); FunctionPass::getAnalysisUsage(AU); @@ -118,7 +118,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { Void = Type::getVoidTy(Context); Boolean = Type::getInt1Ty(Context); Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64, (Type *)0); + ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); BoolTrue = ConstantInt::getTrue(Context); BoolFalse = ConstantInt::getFalse(Context); @@ -126,25 +126,25 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { Int64Zero = ConstantInt::get(Int64, 0); If = M.getOrInsertFunction( - IfIntrinsic, ReturnStruct, Boolean, (Type *)0); + IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); Else = M.getOrInsertFunction( - ElseIntrinsic, ReturnStruct, Int64, (Type *)0); + ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); Break = M.getOrInsertFunction( - BreakIntrinsic, Int64, Int64, (Type *)0); + BreakIntrinsic, Int64, Int64, (Type *)nullptr); IfBreak = M.getOrInsertFunction( - IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0); + IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); ElseBreak = M.getOrInsertFunction( - ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0); + ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); Loop = M.getOrInsertFunction( - LoopIntrinsic, Boolean, 
Int64, (Type *)0); + LoopIntrinsic, Boolean, Int64, (Type *)nullptr); EndCf = M.getOrInsertFunction( - EndCfIntrinsic, Void, Int64, (Type *)0); + EndCfIntrinsic, Void, Int64, (Type *)nullptr); return false; } diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp index 402f1f4..5f71453 100644 --- a/lib/Target/R600/SIFixSGPRCopies.cpp +++ b/lib/Target/R600/SIFixSGPRCopies.cpp @@ -65,7 +65,6 @@ /// ultimately led to the creation of an illegal COPY. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sgpr-copies" #include "AMDGPU.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -77,6 +76,8 @@ using namespace llvm; +#define DEBUG_TYPE "sgpr-copies" + namespace { class SIFixSGPRCopies : public MachineFunctionPass { @@ -97,9 +98,9 @@ private: public: SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "SI Fix SGPR copies"; } @@ -184,7 +185,8 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, const TargetRegisterClass *SrcRC; if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - DstRC == &AMDGPU::M0RegRegClass) + DstRC == &AMDGPU::M0RegRegClass || + MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) return false; SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); @@ -256,6 +258,19 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { TII->moveToVALU(MI); break; } + case AMDGPU::INSERT_SUBREG: { + const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; + DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); + Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); + Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); + if (TRI->isSGPRClass(DstRC) && + (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { + DEBUG(dbgs() << " Fixing INSERT_SUBREG:\n"); + DEBUG(MI.print(dbgs())); + TII->moveToVALU(MI); + } + break; + } } } } diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 0b55411..c9e247c 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -29,22 +29,21 @@ using namespace llvm; SITargetLowering::SITargetLowering(TargetMachine &TM) : AMDGPUTargetLowering(TM) { - addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass); + addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); - addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass); + addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); - addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass); + addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); - addRegisterClass(MVT::i128, 
&AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); @@ -78,8 +77,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); - setOperationAction(ISD::BITCAST, MVT::i128, Legal); - // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); @@ -99,10 +96,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::i1, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i64, Custom); - setOperationAction(ISD::STORE, MVT::i128, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::SELECT, MVT::f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); @@ -119,6 +117,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); @@ -126,39 +140,48 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setTruncStoreAction(MVT::i32, MVT::i8, Custom); setTruncStoreAction(MVT::i32, MVT::i16, Custom); setTruncStoreAction(MVT::f64, MVT::f32, Expand); 
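The SIGN_EXTEND_INREG and extending-load settings in this hunk all revolve around the same operation: re-interpreting the low i1/i8/i16 bits of a 32-bit register as a signed value. As a point of reference (not part of the patch), a minimal C++ sketch of what that computes; the scalar forms are later mapped onto V_BFE_I32 in SIInstrInfo.cpp:

    #include <cstdint>

    // Sign-extend the low FromBits bits of a 32-bit value: the effect of
    // ISD::SIGN_EXTEND_INREG, and of V_BFE_I32 with offset 0 and width FromBits
    // once S_SEXT_I32_I8/I16 are moved to the VALU later in this patch.
    static int32_t signExtendInReg(uint32_t Value, unsigned FromBits) {
      unsigned Shift = 32 - FromBits;
      // Relies on arithmetic right shift of signed values, as on LLVM host platforms.
      return static_cast<int32_t>(Value << Shift) >> Shift;
    }

    // signExtendInReg(0xFF, 8) == -1, signExtendInReg(0x7F, 8) == 127
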
setTruncStoreAction(MVT::i64, MVT::i32, Expand); - setTruncStoreAction(MVT::i128, MVT::i64, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + // These should use UDIVREM, so set them to expand + setOperationAction(ISD::UDIV, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. MVT VecTypes[] = { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32 }; - const size_t NumVecTypes = array_lengthof(VecTypes); - for (unsigned Type = 0; Type < NumVecTypes; ++Type) { + for (MVT VT : VecTypes) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -172,7 +195,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : case ISD::EXTRACT_SUBVECTOR: break; default: - setOperationAction(Op, VecTypes[Type], Expand); + setOperationAction(Op, VT, Expand); break; } } @@ -189,6 +212,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); } setTargetDAGCombine(ISD::SELECT_CC); @@ -204,10 +228,40 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace, bool *IsFast) const { + if (IsFast) + *IsFast = false; + // XXX: This depends on the address space and also we may want to revist // the alignment values we specify in the DataLayout. + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. if (!VT.isSimple() || VT == MVT::Other) return false; + + // XXX - CI changes say "Support for unaligned memory accesses" but I don't + // see what for specifically. The wording everywhere else seems to be the + // same. + + // 3.6.4 - Operations using pairs of VGPRs (for example: double-floats) have + // no alignment restrictions. + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { + // Using any pair of GPRs should be the same as any other pair. + if (IsFast) + *IsFast = true; + return VT.bitsGE(MVT::i64); + } + + // XXX - The only mention I see of this in the ISA manual is for LDS direct + // reads the "byte address and must be dword aligned". Is it also true for the + // normal loads and stores? + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) + return false; + + // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the + // byte-address are ignored, thus forcing Dword alignment. 
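The address-space comments above are easier to follow when the whole decision is stated in one place. A distilled restatement of what allowsUnalignedMemoryAccesses now returns (the helper name is illustrative; the address-space constants are the patch's own from AMDGPU.h); the closing lines of the function, just below, handle the final fast-path return:

    // Private (scratch): pairs of GPRs have no alignment restriction, so 64-bit
    // and wider is fine.  Local (LDS): unaligned access is rejected outright.
    // Everything else: the 2 LSBs of the byte address are ignored, so accesses
    // wider than a dword are effectively dword-aligned and reported as fast.
    static bool unalignedAccessOK(unsigned AddrSpace, unsigned SizeInBits) {
      if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
        return SizeInBits >= 64;
      if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS)
        return false;
      return SizeInBits > 32;
    }
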
+ if (IsFast) + *IsFast = true; return VT.bitsGT(MVT::i32); } @@ -224,7 +278,7 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, SDValue Chain, - unsigned Offset) const { + unsigned Offset, bool Signed) const { MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), AMDGPUAS::CONSTANT_ADDRESS); @@ -232,7 +286,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64); SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, MVT::i64)); - return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr, + return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr, MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, false, false, MemVT.getSizeInBits() >> 3); @@ -340,7 +394,8 @@ SDValue SITargetLowering::LowerFormalArguments( // The first 36 bytes of the input buffer contains information about // thread group and global sizes. SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), - 36 + VA.getLocMemOffset()); + 36 + VA.getLocMemOffset(), + Ins[i].Flags.isSExt()); InVals.push_back(Arg); continue; } @@ -381,8 +436,7 @@ SDValue SITargetLowering::LowerFormalArguments( for (unsigned j = 0; j != NumElements; ++j) Regs.push_back(DAG.getUNDEF(VT)); - InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, - Regs.data(), Regs.size())); + InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); continue; } @@ -395,15 +449,15 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned SuperReg = MI->getOperand(0).getReg(); unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); @@ -428,9 +482,7 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI->eraseFromParent(); break; } - case AMDGPU::V_SUB_F64: { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + case AMDGPU::V_SUB_F64: BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) @@ -442,11 +494,9 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( .addImm(2); /* NEG */ MI->eraseFromParent(); break; - } + case AMDGPU::SI_RegisterStorePseudo: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); MachineInstrBuilder MIB = BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), @@ -455,6 +505,50 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MIB.addOperand(MI->getOperand(i)); 
MI->eraseFromParent(); + break; + } + case AMDGPU::FABS_SI: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), + Reg) + .addImm(0x7fffffff); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addReg(Reg); + MI->eraseFromParent(); + break; + } + case AMDGPU::FNEG_SI: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), + Reg) + .addImm(0x80000000); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addReg(Reg); + MI->eraseFromParent(); + break; + } + case AMDGPU::FCLAMP_SI: { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64), + MI->getOperand(0).getReg()) + .addImm(0) // SRC0 modifiers + .addOperand(MI->getOperand(1)) + .addImm(0) // SRC1 modifiers + .addImm(0) // SRC1 + .addImm(1) // CLAMP + .addImm(0); // OMOD + MI->eraseFromParent(); } } return BB; @@ -510,7 +604,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { SplitVectorLoad(Op, DAG), Load->getChain() }; - return DAG.getMergeValues(MergedValues, 2, SDLoc(Op)); + return DAG.getMergeValues(MergedValues, SDLoc(Op)); } else { return LowerLOAD(Op, DAG); } @@ -533,23 +627,23 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (IntrinsicID) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case Intrinsic::r600_read_ngroups_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false); case Intrinsic::r600_read_ngroups_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false); case Intrinsic::r600_read_ngroups_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false); case Intrinsic::r600_read_global_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false); case Intrinsic::r600_read_global_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false); case Intrinsic::r600_read_global_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 
32); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT); @@ -570,7 +664,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { AMDGPU::VGPR2, VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops [] = { - ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(1), Op.getOperand(2) }; @@ -579,7 +673,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, VT.getSizeInBits() / 8, 4); return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, - Op->getVTList(), Ops, 2, VT, MMO); + Op->getVTList(), Ops, VT, MMO); } case AMDGPUIntrinsic::SI_sample: return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); @@ -591,7 +685,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, - ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } @@ -606,7 +700,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Ops [] = { Chain, - ResourceDescriptorToi128(Op.getOperand(2), DAG), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), @@ -627,8 +721,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { MachineMemOperand::MOStore, VT.getSizeInBits() / 8, 4); return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, - sizeof(Ops)/sizeof(Ops[0]), VT, MMO); + Op->getVTList(), Ops, VT, MMO); } default: break; @@ -650,7 +743,7 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { if (I->getOpcode() == Opcode) return *I; } - return 0; + return nullptr; } /// This transforms the control flow intrinsics to get the branch destination as @@ -662,7 +755,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SDNode *Intr = BRCOND.getOperand(1).getNode(); SDValue Target = BRCOND.getOperand(2); - SDNode *BR = 0; + SDNode *BR = nullptr; if (Intr->getOpcode() == ISD::SETCC) { // As long as we negate the condition everything is fine @@ -695,7 +788,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, // build the new intrinsic call SDNode *Result = DAG.getNode( Res.size() > 1 ? 
ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, - DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode(); + DAG.getVTList(Res), Ops).getNode(); if (BR) { // Give the branch instruction our target @@ -703,7 +796,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, BR->getOperand(0), BRCOND.getOperand(2) }; - DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2); + DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops); } SDValue Chain = SDValue(Result, Result->getNumValues() - 1); @@ -739,7 +832,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { MergedValues[1] = Load->getChain(); if (Ret.getNode()) { MergedValues[0] = Ret; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { @@ -770,30 +863,16 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } MergedValues[0] = Ret; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } -SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op, - SelectionDAG &DAG) const { - - if (Op.getValueType() == MVT::i128) { - return Op; - } - - assert(Op.getOpcode() == ISD::UNDEF); - - return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128, - DAG.getConstant(0, MVT::i64), - DAG.getConstant(0, MVT::i64)); -} - SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, SelectionDAG &DAG) const { return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2), - ResourceDescriptorToi128(Op.getOperand(3), DAG), + Op.getOperand(3), Op.getOperand(4)); } @@ -833,12 +912,6 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); - // Possible Min/Max pattern - SDValue MinMax = LowerMinMax(Op, DAG); - if (MinMax.getNode()) { - return MinMax; - } - SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); } @@ -948,8 +1021,12 @@ SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op, return SDValue(); } - return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); + SDValue Src = Op.getOperand(0); + if (Src.getValueType() != MVT::i32) + Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); + + SDValue Zero = DAG.getConstant(0, MVT::i32); + return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero); } //===----------------------------------------------------------------------===// @@ -963,7 +1040,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, EVT VT = N->getValueType(0); switch (N->getOpcode()) { - default: break; + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); case ISD::SELECT_CC: { ConstantSDNode *True, *False; // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) @@ -982,7 +1059,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, SDValue Arg0 = N->getOperand(0); SDValue Arg1 = N->getOperand(1); SDValue CC = N->getOperand(2); - ConstantSDNode * C = NULL; + ConstantSDNode * C = nullptr; ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get(); // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) @@ -998,7 +1075,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } } - return SDValue(); + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } /// \brief Test if RegClass is one of the VSrc classes @@ -1029,9 +1107,11 @@ int32_t 
SITargetLowering::analyzeImmediate(const SDNode *N) const { return -1; } Imm.I = Node->getSExtValue(); - } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) + } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { + if (N->getValueType(0) != MVT::f32) + return -1; Imm.F = Node->getValueAPF().convertToFloat(); - else + } else return -1; // It isn't an immediate if ((Imm.I >= -16 && Imm.I <= 64) || @@ -1051,7 +1131,7 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode())) + if (!Mov || !TII->isMov(Mov->getMachineOpcode())) return false; const SDValue &Op = Mov->getOperand(0); @@ -1098,7 +1178,7 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode( } return TRI.getPhysRegClass(Reg); } - default: return NULL; + default: return nullptr; } } const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode()); @@ -1202,17 +1282,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, // Commuted opcode if available int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1; - const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev); + const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev); assert(!DescRev || DescRev->getNumDefs() == NumDefs); assert(!DescRev || DescRev->getNumOperands() == NumOps); // e64 version if available, -1 otherwise int OpcodeE64 = AMDGPU::getVOPe64(Opcode); - const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64); + const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64); + int InputModifiers[3] = {0}; assert(!DescE64 || DescE64->getNumDefs() == NumDefs); - assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4)); int32_t Immediate = Desc->getSize() == 4 ? 
0 : -1; bool HaveVSrc = false, HaveSSrc = false; @@ -1279,17 +1359,18 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, fitsRegClass(DAG, Ops[1], OtherRegClass))) { // Swap commutable operands - SDValue Tmp = Ops[1]; - Ops[1] = Ops[0]; - Ops[0] = Tmp; + std::swap(Ops[0], Ops[1]); Desc = DescRev; - DescRev = 0; + DescRev = nullptr; continue; } } - if (DescE64 && !Immediate) { + if (Immediate) + continue; + + if (DescE64) { // Test if it makes sense to switch to e64 encoding unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; @@ -1305,14 +1386,46 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, Immediate = -1; Promote2e64 = true; Desc = DescE64; - DescE64 = 0; + DescE64 = nullptr; } } + + if (!DescE64 && !Promote2e64) + continue; + if (!Operand.isMachineOpcode()) + continue; + if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) { + Ops.pop_back(); + Ops.push_back(Operand.getOperand(0)); + InputModifiers[i] = 1; + Promote2e64 = true; + if (!DescE64) + continue; + Desc = DescE64; + DescE64 = 0; + } + else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) { + Ops.pop_back(); + Ops.push_back(Operand.getOperand(0)); + InputModifiers[i] = 2; + Promote2e64 = true; + if (!DescE64) + continue; + Desc = DescE64; + DescE64 = 0; + } } if (Promote2e64) { + std::vector<SDValue> OldOps(Ops); + Ops.clear(); + for (unsigned i = 0; i < OldOps.size(); ++i) { + // src_modifier + Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32)); + Ops.push_back(OldOps[i]); + } // Add the modifier flags while promoting - for (unsigned i = 0; i < 4; ++i) + for (unsigned i = 0; i < 2; ++i) Ops.push_back(DAG.getTargetConstant(0, MVT::i32)); } @@ -1390,7 +1503,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32)); for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) Ops.push_back(Node->getOperand(i)); - Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); + Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); // If we only got one lane, replace it with a copy // (if NewDmask has only one bit set...) 
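Both the FABS_SI/FNEG_SI expansions earlier in this file and the foldOperands() change that folds those nodes into VOP3 source-modifier operands rest on the same fact: for IEEE-754 single precision, absolute value and negation only touch the sign bit. A small self-contained sketch of that equivalence (helper names are illustrative, not the patch's):

    #include <cstdint>
    #include <cstring>

    static uint32_t toBits(float F)      { uint32_t B; std::memcpy(&B, &F, 4); return B; }
    static float    fromBits(uint32_t B) { float F; std::memcpy(&F, &B, 4); return F; }

    // What the FABS_SI expansion computes with V_AND_B32 and 0x7fffffff.
    static float fabsViaMask(float F) { return fromBits(toBits(F) & 0x7fffffffu); }

    // What the FNEG_SI expansion computes with V_XOR_B32 and 0x80000000.
    static float fnegViaMask(float F) { return fromBits(toBits(F) ^ 0x80000000u); }

Folding these two operations into the new src*_modifiers operands (see the VOP3 encoding changes in SIInstrFormats.td further down) lets the hardware apply neg/abs to a source operand directly instead of spending a separate VALU instruction on it.
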
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index ca73f53..c6eaa81 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -22,7 +22,7 @@ namespace llvm { class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, - SDValue Chain, unsigned Offset) const; + SDValue Chain, unsigned Offset, bool Signed) const; SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; @@ -33,7 +33,6 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue ResourceDescriptorToi128(SDValue Op, SelectionDAG &DAG) const; bool foldImm(SDValue &Operand, int32_t &Immediate, bool &ScalarSlotUsed) const; const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG, @@ -49,32 +48,33 @@ class SITargetLowering : public AMDGPUTargetLowering { public: SITargetLowering(TargetMachine &tm); - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, bool *IsFast) const; - virtual bool shouldSplitVectorType(EVT VT) const override; + bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, + bool *IsFast) const override; + bool shouldSplitVectorType(EVT VT) const override; - virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; + SmallVectorImpl<SDValue> &InVals) const override; - virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, - MachineBasicBlock * BB) const; - virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; - virtual MVT getScalarShiftAmountTy(EVT VT) const; - virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const; - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const; - virtual void AdjustInstrPostInstrSelection(MachineInstr *MI, - SDNode *Node) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, + MachineBasicBlock * BB) const override; + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + MVT getScalarShiftAmountTy(EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; + void AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const override; int32_t analyzeImmediate(const SDNode *N) const; SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; + unsigned Reg, EVT VT) const override; }; } // End namespace llvm diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 695ec40..a17fed7 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -97,13 +97,13 @@ private: public: SIInsertWaits(TargetMachine &tm) : 
MachineFunctionPass(ID), - TII(0), - TRI(0), + TII(nullptr), + TRI(nullptr), ExpInstrTypesSeen(0) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "SI insert wait instructions"; } diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index aa2c22c..168eff2 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : - AMDGPUInst<outs, ins, asm, pattern> { + AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { field bits<1> VM_CNT = 0; field bits<1> EXP_CNT = 0; @@ -210,16 +210,19 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : Enc64 <outs, ins, asm, pattern> { bits<8> dst; + bits<2> src0_modifiers; bits<9> src0; + bits<2> src1_modifiers; bits<9> src1; + bits<2> src2_modifiers; bits<9> src2; - bits<3> abs; bits<1> clamp; bits<2> omod; - bits<3> neg; let Inst{7-0} = dst; - let Inst{10-8} = abs; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; let Inst{11} = clamp; let Inst{25-17} = op; let Inst{31-26} = 0x34; //encoding @@ -227,7 +230,9 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{49-41} = src1; let Inst{58-50} = src2; let Inst{60-59} = omod; - let Inst{63-61} = neg; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; let mayLoad = 0; let mayStore = 0; @@ -240,12 +245,14 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : Enc64 <outs, ins, asm, pattern> { bits<8> dst; + bits<2> src0_modifiers; bits<9> src0; + bits<2> src1_modifiers; bits<9> src1; + bits<2> src2_modifiers; bits<9> src2; bits<7> sdst; bits<2> omod; - bits<3> neg; let Inst{7-0} = dst; let Inst{14-8} = sdst; @@ -255,7 +262,9 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{49-41} = src1; let Inst{58-50} = src2; let Inst{60-59} = omod; - let Inst{63-61} = neg; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; let mayLoad = 0; let mayStore = 0; diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index ab2fe09..4a9e346 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -187,27 +187,45 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned KillFlag = isKill ? 
RegState::Kill : 0; + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) { - unsigned Lane = MFI->SpillTracker.getNextLane(MRI); - BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), - MFI->SpillTracker.LaneVGPR) + unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent()); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), MFI->SpillTracker.LaneVGPR) .addReg(SrcReg, KillFlag) .addImm(Lane); + MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane); + } else if (RI.isSGPRClass(RC)) { + // We are only allowed to create one new instruction when spilling + // registers, so we need to use pseudo instruction for vector + // registers. + // + // Reserve a spot in the spill tracker for each sub-register of + // the vector register. + unsigned NumSubRegs = RC->getSize() / 4; + unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent(), + NumSubRegs); MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, - Lane); - } else { - for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) { - unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), get(AMDGPU::COPY), SubReg) - .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); - storeRegToStackSlot(MBB, MI, SubReg, isKill, FrameIndex + i, - &AMDGPU::SReg_32RegClass, TRI); + FirstLane); + + unsigned Opcode; + switch (RC->getSize() * 8) { + case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; + default: llvm_unreachable("Cannot spill register class"); } + + BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR) + .addReg(SrcReg) + .addImm(FrameIndex); + } else { + llvm_unreachable("VGPR spilling not supported"); } } @@ -216,30 +234,125 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); DebugLoc DL = MBB.findDebugLoc(MI); if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) { - SIMachineFunctionInfo::SpilledReg Spill = + SIMachineFunctionInfo::SpilledReg Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); assert(Spill.VGPR); BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg) .addReg(Spill.VGPR) .addImm(Spill.Lane); + insertNOPs(MI, 3); + } else if (RI.isSGPRClass(RC)){ + unsigned Opcode; + switch(RC->getSize() * 8) { + case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; + default: llvm_unreachable("Cannot spill register class"); + } + + SIMachineFunctionInfo::SpilledReg Spill = + MFI->SpillTracker.getSpilledReg(FrameIndex); + + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addReg(Spill.VGPR) + .addImm(FrameIndex); + insertNOPs(MI, 3); } else { - for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) { - unsigned Flags = RegState::Define; - if (i == 0) { - Flags |= RegState::Undef; - } - unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - loadRegFromStackSlot(MBB, MI, SubReg, FrameIndex + i, - &AMDGPU::SReg_32RegClass, TRI); - BuildMI(MBB, 
MI, DL, get(AMDGPU::COPY)) - .addReg(DestReg, Flags, RI.getSubRegFromChannel(i)) - .addReg(SubReg); + llvm_unreachable("VGPR spilling not supported"); + } +} + +static unsigned getNumSubRegsForSpillOp(unsigned Op) { + + switch (Op) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S512_RESTORE: + return 16; + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S256_RESTORE: + return 8; + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S128_RESTORE: + return 4; + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S64_RESTORE: + return 2; + default: llvm_unreachable("Invalid spill opcode"); + } +} + +void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, + int Count) const { + while (Count > 0) { + int Arg; + if (Count >= 8) + Arg = 7; + else + Arg = Count - 1; + Count -= 8; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) + .addImm(Arg); + } +} + +bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + SIMachineFunctionInfo *MFI = + MI->getParent()->getParent()->getInfo<SIMachineFunctionInfo>(); + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MBB.findDebugLoc(MI); + switch (MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + + // SGPR register spill + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned FrameIndex = MI->getOperand(2).getImm(); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + SIMachineFunctionInfo::SpilledReg Spill; + unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(), + &AMDGPU::SGPR_32RegClass, i); + Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), + MI->getOperand(0).getReg()) + .addReg(SubReg) + .addImm(Spill.Lane + i); + } + MI->eraseFromParent(); + break; + } + + // SGPR register restore + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + SIMachineFunctionInfo::SpilledReg Spill; + unsigned FrameIndex = MI->getOperand(2).getImm(); + unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg) + .addReg(MI->getOperand(1).getReg()) + .addImm(Spill.Lane + i); } + MI->eraseFromParent(); + break; } + } + return true; } MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, @@ -247,18 +360,18 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg()) - return 0; + return nullptr; // Cannot commute VOP2 if src0 is SGPR. if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() && RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg()))) - return 0; + return nullptr; if (!MI->getOperand(2).isReg()) { // XXX: Commute instructions with FPImm operands if (NewMI || MI->getOperand(2).isFPImm() || (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { - return 0; + return nullptr; } // XXX: Commute VOP3 instructions with abs and neg set. 
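The spill rework above replaces the old per-subregister COPY loop with SI_SPILL_S*_SAVE/RESTORE pseudos that are expanded after register allocation into V_WRITELANE_B32/V_READLANE_B32 pairs: every 32-bit scalar sub-register is parked in one lane of a reserved VGPR. A toy model of that bookkeeping (types and names here are illustrative, not the patch's):

    #include <cstdint>

    // One reserved lane VGPR holds 64 lanes of 32 bits (one per wavefront lane).
    struct LaneVGPR {
      uint32_t Lane[64] = {};
    };

    // An SGPR tuple of N bytes needs N/4 consecutive lanes, mirroring
    // getNumSubRegsForSpillOp() in the patch (S64 -> 2, S128 -> 4, ...).
    static unsigned lanesFor(unsigned RegClassSizeInBytes) {
      return RegClassSizeInBytes / 4;
    }

    // v_writelane_b32 equivalent: save one 32-bit sub-register into a lane.
    static void writeLane(LaneVGPR &V, unsigned Lane, uint32_t Val) { V.Lane[Lane] = Val; }

    // v_readlane_b32 equivalent: recover it on restore.
    static uint32_t readLane(const LaneVGPR &V, unsigned Lane) { return V.Lane[Lane]; }

The restore path also inserts a few S_NOPs after the readlane (insertNOPs(MI, 3) above), apparently to cover the latency before the reloaded SGPR can be consumed.
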
@@ -267,7 +380,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, AMDGPU::OpName::abs)).getImm() || MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::neg)).getImm())) - return 0; + return nullptr; unsigned Reg = MI->getOperand(1).getReg(); unsigned SubReg = MI->getOperand(1).getSubReg(); @@ -516,6 +629,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; case AMDGPU::COPY: return AMDGPU::COPY; case AMDGPU::PHI: return AMDGPU::PHI; + case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::S_MOV_B32: return MI.getOperand(1).isReg() ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; @@ -536,6 +650,23 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; + case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; + case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; + case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; + case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; + case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; + case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; + case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; + case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; } } @@ -559,6 +690,8 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::REG_SEQUENCE: + case AMDGPU::PHI: + case AMDGPU::INSERT_SUBREG: return RI.hasVGPRs(getOpRegClass(MI, 0)); default: return RI.hasVGPRs(getOpRegClass(MI, OpNo)); @@ -737,11 +870,12 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } - // Legalize REG_SEQUENCE + // Legalize REG_SEQUENCE and PHI // The register class of the operands much be the same type as the register // class of the output. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { - const TargetRegisterClass *RC = NULL, *SRC = NULL, *VRC = NULL; + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || + MI->getOpcode() == AMDGPU::PHI) { + const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { if (!MI->getOperand(i).isReg() || !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) @@ -774,13 +908,40 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) continue; unsigned DstReg = MRI.createVirtualRegister(RC); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + MachineBasicBlock *InsertBB; + MachineBasicBlock::iterator Insert; + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { + InsertBB = MI->getParent(); + Insert = MI; + } else { + // MI is a PHI instruction. 
+ InsertBB = MI->getOperand(i + 1).getMBB(); + Insert = InsertBB->getFirstTerminator(); + } + BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) .addOperand(MI->getOperand(i)); MI->getOperand(i).setReg(DstReg); } } + // Legalize INSERT_SUBREG + // src0 must have the same register class as dst + if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned Src0 = MI->getOperand(1).getReg(); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); + if (DstRC != Src0RC) { + MachineBasicBlock &MBB = *MI->getParent(); + unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0) + .addReg(Src0); + MI->getOperand(1).setReg(NewSrc0); + } + return; + } + // Legalize MUBUF* instructions // FIXME: If we start using the non-addr64 instructions for compute, we // may need to legalize them here. @@ -886,6 +1047,72 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { + MachineBasicBlock *MBB = MI->getParent(); + switch (MI->getOpcode()) { + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: + unsigned NewOpcode = getVALUOp(*MI); + unsigned RegOffset; + unsigned ImmOffset; + + if (MI->getOperand(2).isReg()) { + RegOffset = MI->getOperand(2).getReg(); + ImmOffset = 0; + } else { + assert(MI->getOperand(2).isImm()); + // SMRD instructions take a dword offsets and MUBUF instructions + // take a byte offset. + ImmOffset = MI->getOperand(2).getImm() << 2; + RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + if (isUInt<12>(ImmOffset)) { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(0); + } else { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(ImmOffset); + ImmOffset = 0; + } + } + + unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + unsigned DWord0 = RegOffset; + unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) + .addImm(0); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) + .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) + .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + MI->setDesc(get(NewOpcode)); + if (MI->getOperand(2).isReg()) { + MI->getOperand(2).setReg(MI->getOperand(1).getReg()); + } else { + MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); + } + MI->getOperand(1).setReg(SRsrc); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + } +} + void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { SmallVector<MachineInstr *, 128> Worklist; Worklist.push_back(&TopInst); @@ -895,8 +1122,16 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const 
{ MachineBasicBlock *MBB = Inst->getParent(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Opcode = Inst->getOpcode(); + unsigned NewOpcode = getVALUOp(*Inst); + // Handle some special cases - switch(Inst->getOpcode()) { + switch (Opcode) { + default: + if (isSMRD(Inst->getOpcode())) { + moveSMRDToVALU(Inst, MRI); + } + break; case AMDGPU::S_MOV_B64: { DebugLoc DL = Inst->getDebugLoc(); @@ -947,7 +1182,6 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { llvm_unreachable("Moving this op to VALU not implemented"); } - unsigned NewOpcode = getVALUOp(*Inst); if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to // legalize its operands instead. @@ -968,27 +1202,52 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst->RemoveOperand(i); } - // Add the implict and explicit register definitions. - if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); - } + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { + // We are converting these to a BFE, so we need to add the missing + // operands for the size and offset. + unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; + Inst->addOperand(Inst->getOperand(1)); + Inst->getOperand(1).ChangeToImmediate(0); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Size)); + + // XXX - Other pointless operands. There are 4, but it seems you only need + // 3 to not hit an assertion later in MCInstLower. + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); } - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } + addDescImplicitUseDef(NewDesc, Inst); + + if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = Inst->getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second operand + // back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + + Inst->RemoveOperand(2); // Remove old immediate. + Inst->addOperand(Inst->getOperand(1)); + Inst->getOperand(1).ChangeToImmediate(0); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Offset)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(BitWidth)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); } - legalizeOperands(Inst); - // Update the destination register class. 
+ const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - switch (Inst->getOpcode()) { + switch (Opcode) { // For target instructions, getOpRegClass just returns the virtual // register class associated with the operand, so we need to find an // equivalent VGPR register class in order to move the instruction to the @@ -996,6 +1255,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { case AMDGPU::COPY: case AMDGPU::PHI: case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: if (RI.hasVGPRs(NewDstRC)) continue; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); @@ -1010,6 +1270,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); + // Legalize the operands + legalizeOperands(Inst); + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), E = MRI.use_end(); I != E; ++I) { MachineInstr &UseMI = *I->getParent(); @@ -1097,6 +1360,24 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist, Worklist.push_back(HiHalf); } +void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, + MachineInstr *Inst) const { + // Add the implict and explicit register definitions. + if (NewDesc.ImplicitUses) { + for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { + unsigned Reg = NewDesc.ImplicitUses[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); + } + } + + if (NewDesc.ImplicitDefs) { + for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { + unsigned Reg = NewDesc.ImplicitDefs[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); + } + } +} + MachineInstrBuilder SIInstrInfo::buildIndirectWrite( MachineBasicBlock *MBB, MachineBasicBlock::iterator I, diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index c537038..7b31a81 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -47,49 +47,52 @@ private: void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist, MachineInstr *Inst, unsigned Opcode) const; + void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; public: explicit SIInstrInfo(AMDGPUTargetMachine &tm); - const SIRegisterInfo &getRegisterInfo() const { + const SIRegisterInfo &getRegisterInfo() const override { return RI; } - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI) const override; + + virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; unsigned commuteOpcode(unsigned Opcode) const; - virtual MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI=false) const; + MachineInstr *commuteInstruction(MachineInstr *MI, + bool NewMI=false) const override; bool isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA = 0) const; + AliasAnalysis *AA = 
nullptr) const; - virtual unsigned getIEQOpcode() const { + unsigned getIEQOpcode() const override { llvm_unreachable("Unimplemented"); } MachineInstr *buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const; - virtual bool isMov(unsigned Opcode) const; + unsigned DstReg, unsigned SrcReg) const override; + bool isMov(unsigned Opcode) const override; - virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; bool isDS(uint16_t Opcode) const; int isMIMG(uint16_t Opcode) const; int isSMRD(uint16_t Opcode) const; @@ -101,8 +104,8 @@ public: bool isInlineConstant(const MachineOperand &MO) const; bool isLiteralConstant(const MachineOperand &MO) const; - virtual bool verifyInstruction(const MachineInstr *MI, - StringRef &ErrInfo) const; + bool verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const override; bool isSALUInstr(const MachineInstr &MI) const; static unsigned getVALUOp(const MachineInstr &MI); @@ -136,32 +139,36 @@ public: /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr *MI) const; + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; + /// \brief Replace this instruction's opcode with the equivalent VALU /// opcode. This function will also move the users of \p MI to the /// VALU if necessary. void moveToVALU(MachineInstr &MI) const; - virtual unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const; + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; - virtual const TargetRegisterClass *getIndirectAddrRegClass() const; + const TargetRegisterClass *getIndirectAddrRegClass() const override; - virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, - unsigned OffsetReg) const; + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; - virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, - unsigned OffsetReg) const; + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; void reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const; void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, unsigned SavReg, unsigned IndexReg) const; + + void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; }; namespace AMDGPU { @@ -169,6 +176,7 @@ namespace AMDGPU { int getVOPe64(uint16_t Opcode); int getCommuteRev(uint16_t Opcode); int getCommuteOrig(uint16_t Opcode); + int getMCOpcode(uint16_t Opcode, unsigned Gen); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index e05ab65..2242e6d 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -7,23 +7,25 @@ // //===----------------------------------------------------------------------===// +// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum +// in AMDGPUMCInstLower.h +def SISubtarget { + int NONE = -1; + int SI = 0; +} + 
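This SISubtarget def, the SIMCInstr class added below, and the getMCOpcode() declaration just added to SIInstrInfo.h together set up a pseudo-to-MC opcode mapping keyed by subtarget generation. How a caller consumes the mapping is not part of this patch, so the wrapper below is only an illustrative sketch (it assumes the usual -1 convention for a missing table entry and requires SIInstrInfo.h):

    // Illustrative only: resolve an opcode to its subtarget-specific MC form.
    // Gen is the subtarget value (SI == 0, per the SISubtarget def above).
    static unsigned resolveMCOpcode(uint16_t Opcode, unsigned Gen) {
      int MCOpcode = AMDGPU::getMCOpcode(Opcode, Gen);
      return MCOpcode == -1 ? Opcode : unsigned(MCOpcode);
    }
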
//===----------------------------------------------------------------------===// // SI DAG Nodes //===----------------------------------------------------------------------===// -// SMRD takes a 64bit memory address and can only add an 32bit offset -def SIadd64bit32bit : SDNode<"ISD::ADD", - SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]> ->; - def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", - SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, i128>, SDTCisVT<2, i32>]>, + SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, [SDNPMayLoad, SDNPMemOperand] >; def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTypeProfile<0, 13, - [SDTCisVT<0, i128>, // rsrc(SGPR) + [SDTCisVT<0, v4i32>, // rsrc(SGPR) SDTCisVT<1, iAny>, // vdata(VGPR) SDTCisVT<2, i32>, // num_channels(imm) SDTCisVT<3, i32>, // vaddr(VGPR) @@ -41,13 +43,13 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", >; def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", - SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, i128>, SDTCisVT<2, i16>, + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>, SDTCisVT<3, i32>]> >; class SDSample<string opcode> : SDNode <opcode, SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>, - SDTCisVT<3, i128>, SDTCisVT<4, i32>]> + SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> >; def SIsample : SDSample<"AMDGPUISD::SAMPLE">; @@ -111,14 +113,17 @@ def IMM16bit : PatLeaf <(imm), [{return isUInt<16>(N->getZExtValue());}] >; +def IMM32bit : PatLeaf <(imm), + [{return isUInt<32>(N->getZExtValue());}] +>; + def mubuf_vaddr_offset : PatFrag< (ops node:$ptr, node:$offset, node:$imm_offset), (add (add node:$ptr, node:$offset), node:$imm_offset) >; class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ - return - (*(const SITargetLowering *)getTargetLowering()).analyzeImmediate(N) == 0; + return isInlineImmediate(N); }]>; class SGPRImm <dag frag> : PatLeaf<frag, [{ @@ -138,7 +143,7 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{ }]>; def FRAMEri32 : Operand<iPTR> { - let MIOperandInfo = (ops SReg_32:$ptr, i32imm:$index); + let MIOperandInfo = (ops i32:$ptr, i32imm:$index); } //===----------------------------------------------------------------------===// @@ -197,15 +202,17 @@ class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 < opName#" $dst, $src0, $src1", pattern >; -class SOPC_32 <bits<7> op, string opName, list<dag> pattern> : SOPC < - op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; -class SOPC_64 <bits<7> op, string opName, list<dag> pattern> : SOPC < - op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), - opName#" $dst, $src0, $src1", pattern ->; +class SOPC_Helper <bits<7> op, RegisterClass rc, ValueType vt, + string opName, PatLeaf cond> : SOPC < + op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), + opName#" $dst, $src0, $src1", []>; + +class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper<op, SSrc_32, i32, opName, cond>; + +class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper<op, SSrc_64, i64, opName, cond>; class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK < op, (outs SReg_32:$dst), (ins i16imm:$src0), @@ -221,7 +228,7 @@ multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass, RegisterClass dstClass> { def _IMM : SMRD < op, 1, (outs dstClass:$dst), - (ins baseClass:$sbase, i32imm:$offset), + (ins 
baseClass:$sbase, u32imm:$offset), asm#" $dst, $sbase, $offset", [] >; @@ -245,6 +252,28 @@ class VOP2_REV <string revOp, bit isOrig> { bit IsOrig = isOrig; } +class SIMCInstr <string pseudo, int subtarget> { + string PseudoInstr = pseudo; + int Subtarget = subtarget; +} + +multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern, + string opName> { + + def "" : InstSI <outs, ins, "", pattern>, VOP <opName>, + SIMCInstr<OpName, SISubtarget.NONE> { + let isPseudo = 1; + } + + def _si : VOP3 <op, outs, ins, asm, []>, SIMCInstr<opName, SISubtarget.SI>; + +} + +// This must always be right before the operand being input modified. +def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> { + let PrintMethod = "printOperandAndMods"; +} + multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src, string opName, list<dag> pattern> { @@ -256,10 +285,8 @@ multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src, def _e64 : VOP3 < {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs drc:$dst), - (ins src:$src0, - i32imm:$abs, i32imm:$clamp, - i32imm:$omod, i32imm:$neg), - opName#"_e64 $dst, $src0, $abs, $clamp, $omod, $neg", [] + (ins InputMods:$src0_modifiers, src:$src0, i32imm:$clamp, i32imm:$omod), + opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", [] >, VOP <opName> { let src1 = SIOperand.ZERO; let src2 = SIOperand.ZERO; @@ -288,10 +315,10 @@ multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc, def _e64 : VOP3 < {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs vrc:$dst), - (ins arc:$src0, arc:$src1, - i32imm:$abs, i32imm:$clamp, - i32imm:$omod, i32imm:$neg), - opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] + (ins InputMods:$src0_modifiers, arc:$src0, + InputMods:$src1_modifiers, arc:$src1, + i32imm:$clamp, i32imm:$omod), + opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [] >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { let src2 = SIOperand.ZERO; } @@ -316,10 +343,10 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern, def _e64 : VOP3b < {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs VReg_32:$dst), - (ins VSrc_32:$src0, VSrc_32:$src1, - i32imm:$abs, i32imm:$clamp, - i32imm:$omod, i32imm:$neg), - opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] + (ins InputMods: $src0_modifiers, VSrc_32:$src0, + InputMods:$src1_modifiers, VSrc_32:$src1, + i32imm:$clamp, i32imm:$omod), + opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [] >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { let src2 = SIOperand.ZERO; /* the VOP2 variant puts the carry out into VCC, the VOP3 variant @@ -340,15 +367,16 @@ multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, def _e64 : VOP3 < {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs SReg_64:$dst), - (ins arc:$src0, arc:$src1, - InstFlag:$abs, InstFlag:$clamp, - InstFlag:$omod, InstFlag:$neg), - opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", + (ins InputMods:$src0_modifiers, arc:$src0, + InputMods:$src1_modifiers, arc:$src1, + InstFlag:$clamp, InstFlag:$omod), + opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", !if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>, [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))] ) >, VOP <opName> { let src2 = SIOperand.ZERO; + let src2_modifiers = 0; } } @@ -360,12 +388,13 @@ multiclass VOPC_64 <bits<8> op, string opName, ValueType vt = 
untyped, PatLeaf cond = COND_NULL> : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>; -class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 < +multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m < op, (outs VReg_32:$dst), - (ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2, - InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), - opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern ->, VOP <opName>; + (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers, + VSrc_32:$src1, InputMods:$src2_modifiers, VSrc_32:$src2, + InstFlag:$clamp, InstFlag:$omod), + opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName +>; class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 < op, (outs VReg_64:$dst), @@ -374,10 +403,9 @@ class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 < >, VOP <opName> { let src2 = SIOperand.ZERO; - let abs = 0; + let src0_modifiers = 0; let clamp = 0; let omod = 0; - let neg = 0; } class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 < @@ -403,7 +431,7 @@ class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> : class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < op, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, i16imm:$offset), + (ins i1imm:$gds, VReg_32:$addr, u16imm:$offset), asm#" $vdst, $addr, $offset, [M0]", []> { let data0 = 0; @@ -415,7 +443,7 @@ class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS < op, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, i8imm:$offset0, i8imm:$offset1), + (ins i1imm:$gds, VReg_32:$addr, u8imm:$offset0, u8imm:$offset1), asm#" $gds, $vdst, $addr, $offset0, $offset1, [M0]", []> { let data0 = 0; @@ -427,7 +455,7 @@ class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS < class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i16imm:$offset), + (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u16imm:$offset), asm#" $addr, $data0, $offset [M0]", []> { let data1 = 0; @@ -439,7 +467,7 @@ class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i8imm:$offset0, i8imm:$offset1), + (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u8imm:$offset0, u8imm:$offset1), asm#" $addr, $data0, $data1, $offset0, $offset1 [M0]", []> { let mayStore = 1; @@ -450,7 +478,7 @@ class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A < op, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i16imm:$offset), + (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, u16imm:$offset), asm#" $vdst, $addr, $data0, $offset, [M0]", []> { @@ -462,7 +490,7 @@ class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A < class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < op, (outs), - (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, + (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, 
SSrc_32:$soffset), asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," @@ -481,7 +509,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> { let offen = 0, idxen = 0 in { def _OFFSET : MUBUF <op, (outs regClass:$vdata), (ins SReg_128:$srsrc, VReg_32:$vaddr, - i16imm:$offset, SSrc_32:$soffset, i1imm:$glc, + u16imm:$offset, SSrc_32:$soffset, i1imm:$glc, i1imm:$slc, i1imm:$tfe), asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; } @@ -497,7 +525,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> { let offen = 0, idxen = 1 in { def _IDXEN : MUBUF <op, (outs regClass:$vdata), (ins SReg_128:$srsrc, VReg_32:$vaddr, - i16imm:$offset, SSrc_32:$soffset, i1imm:$glc, + u16imm:$offset, SSrc_32:$soffset, i1imm:$glc, i1imm:$slc, i1imm:$tfe), asm#" $vdata, $srsrc[$vaddr] + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; } @@ -513,7 +541,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> { let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in { def _ADDR64 : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset), + (ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset), asm#" $vdata, $srsrc + $vaddr + $offset", []>; } } @@ -521,7 +549,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> { class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, - i16imm:$offset), + u16imm:$offset), name#" $vdata, $srsrc + $vaddr + $offset", []> { @@ -542,7 +570,7 @@ class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < op, (outs regClass:$dst), - (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," @@ -677,4 +705,12 @@ def isDS : InstrMapping { let ValueCols = [["8"]]; } +def getMCOpcode : InstrMapping { + let FilterClass = "SIMCInstr"; + let RowFields = ["PseudoInstr"]; + let ColFields = ["Subtarget"]; + let KeyCol = [!cast<string>(SISubtarget.NONE)]; + let ValueCols = [[!cast<string>(SISubtarget.SI)]]; +} + include "SIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 5232139..500fa78 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -32,9 +32,56 @@ def isSI : Predicate<"Subtarget.getGeneration() " def isCI : Predicate<"Subtarget.getGeneration() " ">= AMDGPUSubtarget::SEA_ISLANDS">; +def isCFDepth0 : Predicate<"isCFDepth0()">; + def WAIT_FLAG : InstFlag<"printWaitFlag">; -let Predicates = [isSI] in { +let SubtargetPredicate = isSI in { +let OtherPredicates = [isCFDepth0] in { + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +let mayLoad = 1 in { + +// We are using the SGPR_32 and not the SReg_32 register class for 32-bit +// SMRD instructions, because the SGPR_32 register class does not include M0 +// and writing to M0 from an SMRD instruction will hang the GPU. 
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>; + +defm S_BUFFER_LOAD_DWORD : SMRD_Helper < + 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32 +>; + +defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < + 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64 +>; + +defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < + 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128 +>; + +defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < + 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256 +>; + +defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < + 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512 +>; + +} // mayLoad = 1 + +//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; + +//===----------------------------------------------------------------------===// +// SOP1 Instructions +//===----------------------------------------------------------------------===// let neverHasSideEffects = 1 in { @@ -45,7 +92,10 @@ def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>; def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>; } // End isMoveImm = 1 -def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>; +def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", + [(set i32:$dst, (not i32:$src0))] +>; + def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>; def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>; def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>; @@ -65,8 +115,13 @@ def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; //def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>; def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>; //def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>; -//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>; -//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>; +def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", + [(set i32:$dst, (sext_inreg i32:$src0, i8))] +>; +def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", + [(set i32:$dst, (sext_inreg i32:$src0, i16))] +>; + ////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>; ////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>; ////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>; @@ -99,6 +154,150 @@ def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>; def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>; def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>; def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>; + +//===----------------------------------------------------------------------===// +// SOP2 Instructions +//===----------------------------------------------------------------------===// + +let Defs = [SCC] in { // Carry out goes to SCC +let isCommutable = 1 in { +def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; +def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", + [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] +>; +} // End isCommutable = 1 + +def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; +def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", + [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] +>; + +let Uses = [SCC] in { // Carry in comes from SCC +let isCommutable = 1 in 
{ +def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", + [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End isCommutable = 1 + +def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", + [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End Uses = [SCC] +} // End Defs = [SCC] + +def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", + [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))] +>; +def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", + [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))] +>; +def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", + [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))] +>; +def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", + [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))] +>; + +def S_CSELECT_B32 : SOP2 < + 0x0000000a, (outs SReg_32:$dst), + (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", + [] +>; + +def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; + +def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", + [(set i32:$dst, (and i32:$src0, i32:$src1))] +>; + +def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", + [(set i64:$dst, (and i64:$src0, i64:$src1))] +>; + +def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", + [(set i32:$dst, (or i32:$src0, i32:$src1))] +>; + +def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", + [(set i64:$dst, (or i64:$src0, i64:$src1))] +>; + +def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", + [(set i32:$dst, (xor i32:$src0, i32:$src1))] +>; + +def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", + [(set i64:$dst, (xor i64:$src0, i64:$src1))] +>; +def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; +def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>; +def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>; +def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>; +def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; +def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; +def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; +def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>; +def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; +def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; + +// Use added complexity so these patterns are preferred to the VALU patterns. 
+let AddedComplexity = 1 in { + +def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", + [(set i32:$dst, (shl i32:$src0, i32:$src1))] +>; +def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64", + [(set i64:$dst, (shl i64:$src0, i32:$src1))] +>; +def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", + [(set i32:$dst, (srl i32:$src0, i32:$src1))] +>; +def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64", + [(set i64:$dst, (srl i64:$src0, i32:$src1))] +>; +def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", + [(set i32:$dst, (sra i32:$src0, i32:$src1))] +>; +def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64", + [(set i64:$dst, (sra i64:$src0, i32:$src1))] +>; + +} // End AddedComplexity = 1 + +def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; +def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; +def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; +def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; +def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; +def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; +def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; +//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; +def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; + +//===----------------------------------------------------------------------===// +// SOPC Instructions +//===----------------------------------------------------------------------===// + +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32">; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32">; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32">; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32">; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32">; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32">; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32">; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32">; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32">; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32">; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32">; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32">; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; + +//===----------------------------------------------------------------------===// +// SOPK Instructions +//===----------------------------------------------------------------------===// + def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>; def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>; @@ -147,6 +346,108 @@ def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>; //def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>; //def EXP : EXP_ <0x00000000, "EXP", []>; +} // End let OtherPredicates = [isCFDepth0] + +//===----------------------------------------------------------------------===// +// SOPP Instructions +//===----------------------------------------------------------------------===// + +def S_NOP : SOPP <0x00000000, (ins i16imm:$SIMM16), "S_NOP $SIMM16", []>; + +let isTerminator = 1 in { + +def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", + [(IL_retflag)]> { + let SIMM16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; +} + +let isBranch = 1 in { +def S_BRANCH : SOPP < + 0x00000002, (ins brtarget:$target), 
"S_BRANCH $target", + [(br bb:$target)]> { + let isBarrier = 1; +} + +let DisableEncoding = "$scc" in { +def S_CBRANCH_SCC0 : SOPP < + 0x00000004, (ins brtarget:$target, SCCReg:$scc), + "S_CBRANCH_SCC0 $target", [] +>; +def S_CBRANCH_SCC1 : SOPP < + 0x00000005, (ins brtarget:$target, SCCReg:$scc), + "S_CBRANCH_SCC1 $target", + [] +>; +} // End DisableEncoding = "$scc" + +def S_CBRANCH_VCCZ : SOPP < + 0x00000006, (ins brtarget:$target, VCCReg:$vcc), + "S_CBRANCH_VCCZ $target", + [] +>; +def S_CBRANCH_VCCNZ : SOPP < + 0x00000007, (ins brtarget:$target, VCCReg:$vcc), + "S_CBRANCH_VCCNZ $target", + [] +>; + +let DisableEncoding = "$exec" in { +def S_CBRANCH_EXECZ : SOPP < + 0x00000008, (ins brtarget:$target, EXECReg:$exec), + "S_CBRANCH_EXECZ $target", + [] +>; +def S_CBRANCH_EXECNZ : SOPP < + 0x00000009, (ins brtarget:$target, EXECReg:$exec), + "S_CBRANCH_EXECNZ $target", + [] +>; +} // End DisableEncoding = "$exec" + + +} // End isBranch = 1 +} // End isTerminator = 1 + +let hasSideEffects = 1 in { +def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER", + [(int_AMDGPU_barrier_local)] +> { + let SIMM16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; + let mayLoad = 1; + let mayStore = 1; +} + +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16", + [] +>; +//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; +//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; +//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; + +let Uses = [EXEC] in { + def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16", + [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)] + > { + let DisableEncoding = "$m0"; + } +} // End Uses = [EXEC] + +//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; +//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; +//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; +//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; +//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>; +//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; +} // End hasSideEffects + +//===----------------------------------------------------------------------===// +// VOPC Instructions +//===----------------------------------------------------------------------===// + let isCompare = 1 in { defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">; @@ -403,6 +704,10 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; } // End isCompare = 1 +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// + def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>; def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>; def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>; @@ -427,6 +732,9 @@ def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "DS_READ2_B64", VReg_128>; // TODO: DS_READ2ST64_B32, DS_READ2ST64_B64, // DS_WRITE2ST64_B32, DS_WRITE2ST64_B64 +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// //def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>; //def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>; @@ -499,6 +807,11 @@ def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>; //def 
BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>; //def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>; + +//===----------------------------------------------------------------------===// +// MTBUF Instructions +//===----------------------------------------------------------------------===// + //def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>; //def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>; //def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>; @@ -508,41 +821,10 @@ def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FOR def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>; def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>; -let mayLoad = 1 in { - -// We are using the SGPR_32 and not the SReg_32 register class for 32-bit -// SMRD instructions, because the SGPR_32 register class does not include M0 -// and writing to M0 from an SMRD instruction will hang the GPU. -defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>; - -defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32 ->; - -defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64 ->; - -defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128 ->; - -defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256 ->; - -defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512 ->; - -} // mayLoad = 1 +//===----------------------------------------------------------------------===// +// MIMG Instructions +//===----------------------------------------------------------------------===// -//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">; defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">; //def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>; @@ -638,8 +920,12 @@ defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">; //def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>; //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>; -//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; let neverHasSideEffects = 1, isMoveImm = 1 in { defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>; @@ -691,8 +977,13 @@ defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32", //defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>; //defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>; //defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, 
"V_CVT_F32_UBYTE3", []>; -//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; -//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; +defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64", + [(set i32:$dst, (fp_to_uint f64:$src0))] +>; +defm V_CVT_F64_U32 : VOP1_64_32 <0x00000016, "V_CVT_F64_U32", + [(set f64:$dst, (uint_to_fp i32:$src0))] +>; + defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", [(set f32:$dst, (AMDGPUfract f32:$src0))] >; @@ -756,6 +1047,11 @@ defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>; defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>; defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>; + +//===----------------------------------------------------------------------===// +// VINTRP Instructions +//===----------------------------------------------------------------------===// + def V_INTERP_P1_F32 : VINTRP < 0x00000000, (outs VReg_32:$dst), @@ -786,97 +1082,9 @@ def V_INTERP_MOV_F32 : VINTRP < let DisableEncoding = "$m0"; } -//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>; - -let isTerminator = 1 in { - -def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", - [(IL_retflag)]> { - let SIMM16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; -} - -let isBranch = 1 in { -def S_BRANCH : SOPP < - 0x00000002, (ins brtarget:$target), "S_BRANCH $target", - [(br bb:$target)]> { - let isBarrier = 1; -} - -let DisableEncoding = "$scc" in { -def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC0 $target", [] ->; -def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC1 $target", - [] ->; -} // End DisableEncoding = "$scc" - -def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCZ $target", - [] ->; -def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCNZ $target", - [] ->; - -let DisableEncoding = "$exec" in { -def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECZ $target", - [] ->; -def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECNZ $target", - [] ->; -} // End DisableEncoding = "$exec" - - -} // End isBranch = 1 -} // End isTerminator = 1 - -let hasSideEffects = 1 in { -def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER", - [(int_AMDGPU_barrier_local)] -> { - let SIMM16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; - let mayLoad = 1; - let mayStore = 1; -} - -def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16", - [] ->; -//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; -//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; -//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; - -let Uses = [EXEC] in { - def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16", - [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)] - > { - let DisableEncoding = "$m0"; - } -} // End Uses = [EXEC] - -//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; -//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; -//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; -//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; -//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>; -//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; -} // End hasSideEffects +//===----------------------------------------------------------------------===// +// VOP2 Instructions 
+//===----------------------------------------------------------------------===// def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), @@ -891,18 +1099,11 @@ def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))] ->; - -//f32 pattern for V_CNDMASK_B32_e64 -def : Pat < - (f32 (select i1:$src2, f32:$src1, f32:$src0)), - (V_CNDMASK_B32_e64 $src0, $src1, $src2) ->; - -def : Pat < - (i32 (trunc i64:$val)), - (EXTRACT_SUBREG $val, sub0) ->; +> { + let src0_modifiers = 0; + let src1_modifiers = 0; + let src2_modifiers = 0; +} def V_READLANE_B32 : VOP2 < 0x00000001, @@ -946,11 +1147,11 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", - [(set i32:$dst, (mul I24:$src0, I24:$src1))] + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))] >; //defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", - [(set i32:$dst, (mul U24:$src0, U24:$src1))] + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))] >; //defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>; @@ -965,27 +1166,43 @@ defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; -defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>; -defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>; -defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>; -defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>; +defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", + [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]>; +defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", + [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]>; +defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", + [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]>; +defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", + [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]>; + +defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", + [(set i32:$dst, (srl i32:$src0, i32:$src1))] +>; -defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>; defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">; -defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>; +defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", + [(set i32:$dst, (sra i32:$src0, i32:$src1))] +>; defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">; let hasPostISelHook = 1 in { -defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>; +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", + [(set i32:$dst, (shl i32:$src0, i32:$src1))] +>; } defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">; -defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", []>; -defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", []>; -defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", []>; +defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", + [(set i32:$dst, (and i32:$src0, i32:$src1))]>; +defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", + [(set i32:$dst, (or i32:$src0, i32:$src1))] +>; +defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", + [(set i32:$dst, (xor i32:$src0, i32:$src1))] +>; } // End isCommutable = 1 @@ -1001,14 +1218,18 @@ defm V_MBCNT_HI_U32_B32 : VOP2_32 
<0x00000024, "V_MBCNT_HI_U32_B32", []>; let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. -defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", [], VSrc_32>; -defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", [], VSrc_32>; +defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", + [(set i32:$dst, (add i32:$src0, i32:$src1))], VSrc_32>; +defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", + [(set i32:$dst, (sub i32:$src0, i32:$src1))], VSrc_32>; defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], VSrc_32, "V_SUB_I32">; let Uses = [VCC] in { // Carry-in comes from VCC -defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", [], VReg_32>; -defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", [], VReg_32>; +defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", + [(set i32:$dst, (adde i32:$src0, i32:$src1))], VReg_32>; +defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", + [(set i32:$dst, (sube i32:$src0, i32:$src1))], VReg_32>; defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], VReg_32, "V_SUBB_U32">; } // End Uses = [VCC] @@ -1023,63 +1244,51 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", >; ////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; ////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; -def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>; -def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>; -def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>; -def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>; -def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>; -def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>; -def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>; -def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>; -def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>; -def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>; -def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>; -def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>; -////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>; -////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; -////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; -////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; -//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// let neverHasSideEffects = 1 in { -def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; -def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>; -def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", - [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))] +defm V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; +defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", + [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] +>; +defm V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))] >; -def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", - [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))] +defm V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", + [(set i32:$dst, 
(AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))] >; } // End neverHasSideEffects -def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; -def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; -def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>; -def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; + +defm V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; +defm V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; +defm V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>; +defm V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in { -def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", +defm V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>; -def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", +defm V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>; } -def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", +defm V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))]>; -defm : BFIPatterns <V_BFI_B32>; -def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", +defm V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))] >; def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] >; //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; -def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; -def : ROTRPattern <V_ALIGNBIT_B32>; +defm V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; -def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; -def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; +defm V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; +defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; ////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>; ////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>; ////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>; @@ -1092,9 +1301,9 @@ def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; //def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>; //def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>; //def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; -def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; +defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; -def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; +defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64", @@ -1116,181 +1325,46 @@ def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>; } // isCommutable = 1 -def : Pat < - (fadd f64:$src0, f64:$src1), - (V_ADD_F64 $src0, $src1, (i64 0)) ->; - -def : Pat < - (fmul f64:$src0, f64:$src1), - (V_MUL_F64 $src0, $src1, (i64 0)) ->; - def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; let isCommutable = 1 in { -def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; -def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; -def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; -def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; +defm V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; +defm V_MUL_HI_U32 : VOP3_32 <0x0000016a, 
"V_MUL_HI_U32", []>; +defm V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; +defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; } // isCommutable = 1 -def : Pat < - (mul i32:$src0, i32:$src1), - (V_MUL_LO_I32 $src0, $src1, (i32 0)) ->; - -def : Pat < - (mulhu i32:$src0, i32:$src1), - (V_MUL_HI_U32 $src0, $src1, (i32 0)) ->; - -def : Pat < - (mulhs i32:$src0, i32:$src1), - (V_MUL_HI_I32 $src0, $src1, (i32 0)) ->; - -def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; +defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; -def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; +defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; //def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; -let Defs = [SCC] in { // Carry out goes to SCC -let isCommutable = 1 in { -def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; -def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", - [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] ->; -} // End isCommutable = 1 - -def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; -def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", - [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] ->; - -let Uses = [SCC] in { // Carry in comes from SCC -let isCommutable = 1 in { -def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", - [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End isCommutable = 1 - -def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", - [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End Uses = [SCC] -} // End Defs = [SCC] - -def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", - [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))] ->; -def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", - [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))] ->; -def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", - [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))] ->; -def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", - [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))] ->; - -def S_CSELECT_B32 : SOP2 < - 0x0000000a, (outs SReg_32:$dst), - (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", - [] ->; - -def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; - -def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", - [(set i32:$dst, (and i32:$src0, i32:$src1))] ->; - -def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", - [(set i64:$dst, (and i64:$src0, i64:$src1))] ->; - -def : Pat < - (i1 (and i1:$src0, i1:$src1)), - (S_AND_B64 $src0, $src1) ->; - -def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", - [(set i32:$dst, (or i32:$src0, i32:$src1))] ->; - -def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", - [(set i64:$dst, (or i64:$src0, i64:$src1))] ->; - -def : Pat < - (i1 (or i1:$src0, i1:$src1)), - (S_OR_B64 $src0, $src1) ->; +//===----------------------------------------------------------------------===// +// Pseudo Instructions +//===----------------------------------------------------------------------===// -def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", - [(set i32:$dst, (xor i32:$src0, i32:$src1))] ->; +let isCodeGenOnly = 1, isPseudo = 1 in { -def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", - [(set i1:$dst, (xor i1:$src0, 
i1:$src1))] +def V_MOV_I1 : InstSI < + (outs VReg_1:$dst), + (ins i1imm:$src), + "", [(set i1:$dst, (imm:$src))] >; -def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; -def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>; -def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>; -def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>; -def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; -def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; -def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; -def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>; -def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; -def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; - -// Use added complexity so these patterns are preferred to the VALU patterns. -let AddedComplexity = 1 in { -def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", - [(set i32:$dst, (shl i32:$src0, i32:$src1))] ->; -def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64", - [(set i64:$dst, (shl i64:$src0, i32:$src1))] ->; -def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", - [(set i32:$dst, (srl i32:$src0, i32:$src1))] ->; -def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64", - [(set i64:$dst, (srl i64:$src0, i32:$src1))] ->; -def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", - [(set i32:$dst, (sra i32:$src0, i32:$src1))] ->; -def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64", - [(set i64:$dst, (sra i64:$src0, i32:$src1))] +def V_AND_I1 : InstSI < + (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", + [(set i1:$dst, (and i1:$src0, i1:$src1))] >; -} // End AddedComplexity = 1 - -def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; -def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; -def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; -def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; -def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; -def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; -def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; -//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; -def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; - -let isCodeGenOnly = 1, isPseudo = 1 in { - -def LOAD_CONST : AMDGPUShaderInst < - (outs GPRF32:$dst), - (ins i32imm:$src), - "LOAD_CONST $dst, $src", - [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))] +def V_OR_I1 : InstSI < + (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", + [(set i1:$dst, (or i1:$src0, i1:$src1))] >; // SI pseudo instructions. 
These are used by the CFG structurizer pass @@ -1301,19 +1375,19 @@ let mayLoad = 1, mayStore = 1, hasSideEffects = 1, let isBranch = 1, isTerminator = 1 in { -def SI_IF : InstSI < +def SI_IF: InstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), - "SI_IF $dst, $vcc, $target", + "", [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] >; def SI_ELSE : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), - "SI_ELSE $dst, $src, $target", - [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]> { - + "", + [(set i64:$dst, (int_SI_else i64:$src, bb:$target))] +> { let Constraints = "$src = $dst"; } @@ -1370,7 +1444,7 @@ let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { let UseNamedOperandTable = 1 in { -def SI_RegisterLoad : AMDGPUShaderInst < +def SI_RegisterLoad : InstSI < (outs VReg_32:$dst, SReg_64:$temp), (ins FRAMEri32:$addr, i32imm:$chan), "", [] @@ -1379,7 +1453,7 @@ def SI_RegisterLoad : AMDGPUShaderInst < let mayLoad = 1; } -class SIRegStore<dag outs> : AMDGPUShaderInst < +class SIRegStore<dag outs> : InstSI < outs, (ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan), "", [] @@ -1439,8 +1513,33 @@ def V_SUB_F64 : InstSI < } // end usesCustomInserter +multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { + + def _SAVE : InstSI < + (outs VReg_32:$dst), + (ins sgpr_class:$src, i32imm:$frame_idx), + "", [] + >; + + def _RESTORE : InstSI < + (outs sgpr_class:$dst), + (ins VReg_32:$src, i32imm:$frame_idx), + "", [] + >; + +} + +defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; +defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; +defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; +defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; + } // end IsCodeGenOnly, isPseudo +} // end SubtargetPredicate = SI + +let Predicates = [isSI] in { + def : Pat< (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0)) @@ -1453,7 +1552,7 @@ def : Pat < /* int_SI_vs_load_input */ def : Pat< - (SIload_input i128:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr), + (SIload_input v4i32:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr), (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0) >; @@ -1470,40 +1569,116 @@ def : Pat < (V_SUB_F64 $src0, $src1) >; +//===----------------------------------------------------------------------===// +// SMRD Patterns +//===----------------------------------------------------------------------===// + +multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { + + // 1. Offset as 8bit DWORD immediate + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), + (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) + >; + + // 2. Offset loaded in an 32bit SGPR + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), + (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + >; + + // 3. No offset at all + def : Pat < + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) + >; +} + +defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; +defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>; +defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; +defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; + +// 1. 
Offset as 8bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) +>; + +// 2. Offset loaded in an 32bit SGPR +def : Pat < + (SIload_constant v4i32:$sbase, imm:$offset), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) +>; + +//===----------------------------------------------------------------------===// +// SOP2 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i1 (xor i1:$src0, i1:$src1)), + (S_XOR_B64 $src0, $src1) +>; + +//===----------------------------------------------------------------------===// +// VOP2 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (or i64:$src0, i64:$src1), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub0), + (EXTRACT_SUBREG i64:$src1, sub0)), sub0), + (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub1), + (EXTRACT_SUBREG i64:$src1, sub1)), sub1) +>; + +class SextInReg <ValueType vt, int ShiftAmt> : Pat < + (sext_inreg i32:$src0, vt), + (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0)) +>; + +def : SextInReg <i8, 24>; +def : SextInReg <i16, 16>; + /********** ======================= **********/ /********** Image sampling patterns **********/ /********** ======================= **********/ /* SIsample for simple 1D texture lookup */ def : Pat < - (SIsample i32:$addr, v32i8:$rsrc, i128:$sampler, imm), + (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, imm), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_RECT), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_ARRAY), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleShadowPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleShadowArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW_ARRAY), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; @@ -1692,8 +1867,6 @@ def : BitConvert <i64, v2i32, VReg_64>; def : BitConvert <v4f32, v4i32, VReg_128>; def : BitConvert <v4i32, v4f32, VReg_128>; -def : BitConvert <v4i32, i128, VReg_128>; -def : BitConvert <i128, v4i32, VReg_128>; def : BitConvert <v8f32, v8i32, SReg_256>; def : BitConvert <v8i32, v8f32, SReg_256>; @@ -1711,10 +1884,18 @@ def : BitConvert <v16f32, v16i32, VReg_512>; /********** Src & Dst modifiers **********/ /********** =================== **********/ +def FCLAMP_SI : AMDGPUShaderInst < + (outs VReg_32:$dst), + (ins VSrc_32:$src0), + "FCLAMP_SI $dst, $src0", + [] +> { + let usesCustomInserter = 
1; +} + def : Pat < (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */), - 0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */) + (FCLAMP_SI f32:$src) >; /********** ================================ **********/ @@ -1733,14 +1914,32 @@ def : Pat < (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ >; +def FABS_SI : AMDGPUShaderInst < + (outs VReg_32:$dst), + (ins VSrc_32:$src0), + "FABS_SI $dst, $src0", + [] +> { + let usesCustomInserter = 1; +} + def : Pat < (fabs f32:$src), - (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) /* Clear sign bit */ + (FABS_SI f32:$src) >; +def FNEG_SI : AMDGPUShaderInst < + (outs VReg_32:$dst), + (ins VSrc_32:$src0), + "FNEG_SI $dst, $src0", + [] +> { + let usesCustomInserter = 1; +} + def : Pat < (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Toggle sign bit */ + (FNEG_SI f32:$src) >; /********** ================== **********/ @@ -1768,30 +1967,10 @@ def : Pat < >; def : Pat < - (i1 imm:$imm), - (S_MOV_B64 imm:$imm) ->; - -def : Pat < (i64 InlineImm<i64>:$imm), (S_MOV_B64 InlineImm<i64>:$imm) >; -// i64 immediates aren't supported in hardware, split it into two 32bit values -def : Pat < - (i64 imm:$imm), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_MOV_B32 (i32 (LO32 imm:$imm))), sub0), - (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1) ->; - -def : Pat < - (f64 fpimm:$imm), - (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (V_MOV_B32_e32 (f32 (LO32f fpimm:$imm))), sub0), - (V_MOV_B32_e32 (f32 (HI32f fpimm:$imm))), sub1) ->; - /********** ===================== **********/ /********** Interpolation Paterns **********/ /********** ===================== **********/ @@ -1875,21 +2054,9 @@ class Ext32Pat <SDNode ext> : Pat < def : Ext32Pat <zext>; def : Ext32Pat <anyext>; -// 1. Offset as 8bit DWORD immediate +// Offset in an 32Bit VGPR def : Pat < - (SIload_constant i128:$sbase, IMM8bitDWORD:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) ->; - -// 2. Offset loaded in an 32bit SGPR -def : Pat < - (SIload_constant i128:$sbase, imm:$offset), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) ->; - -// 3. 
Offset in an 32Bit VGPR -def : Pat < - (SIload_constant i128:$sbase, i32:$voff), + (SIload_constant v4i32:$sbase, i32:$voff), (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0) >; @@ -1904,18 +2071,44 @@ def : Pat < def : Pat < (int_SI_tid), (V_MBCNT_HI_U32_B32_e32 0xffffffff, - (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0, 0, 0)) + (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0)) >; -/********** ================== **********/ -/********** VOP3 Patterns **********/ -/********** ================== **********/ +//===----------------------------------------------------------------------===// +// VOP3 Patterns +//===----------------------------------------------------------------------===// + +def : IMad24Pat<V_MAD_I32_I24>; +def : UMad24Pat<V_MAD_U32_U24>; def : Pat < - (f32 (fadd (fmul f32:$src0, f32:$src1), f32:$src2)), - (V_MAD_F32 $src0, $src1, $src2) + (fadd f64:$src0, f64:$src1), + (V_ADD_F64 $src0, $src1, (i64 0)) +>; + +def : Pat < + (fmul f64:$src0, f64:$src1), + (V_MUL_F64 $src0, $src1, (i64 0)) +>; + +def : Pat < + (mul i32:$src0, i32:$src1), + (V_MUL_LO_I32 $src0, $src1, (i32 0)) +>; + +def : Pat < + (mulhu i32:$src0, i32:$src1), + (V_MUL_HI_U32 $src0, $src1, (i32 0)) +>; + +def : Pat < + (mulhs i32:$src0, i32:$src1), + (V_MUL_HI_I32 $src0, $src1, (i32 0)) >; +defm : BFIPatterns <V_BFI_B32>; +def : ROTRPattern <V_ALIGNBIT_B32>; + /********** ======================= **********/ /********** Load/Store Patterns **********/ /********** ======================= **********/ @@ -1962,41 +2155,6 @@ def : Pat <(atomic_load_add_local i32:$ptr, i32:$val), def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val), (DS_SUB_U32_RTN 0, $ptr, $val, 0)>; -/********** ================== **********/ -/********** SMRD Patterns **********/ -/********** ================== **********/ - -multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { - - // 1. Offset as 8bit DWORD immediate - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), - (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) - >; - - // 2. Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (SIadd64bit32bit i64:$sbase, imm:$offset)), - (vt (Instr_SGPR $sbase, (S_MOV_B32 imm:$offset))) - >; - - // 3. 
No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; -defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>; -defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, i128>; -defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; -defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; - //===----------------------------------------------------------------------===// // MUBUF Patterns //===----------------------------------------------------------------------===// @@ -2083,7 +2241,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe MUBUF bothen> { def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 0, 0, imm:$glc, imm:$slc, imm:$tfe)), (offset $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc), @@ -2091,7 +2249,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe >; def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm, 1, 0, imm:$glc, imm:$slc, imm:$tfe)), (offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc), @@ -2099,7 +2257,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe >; def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 0, 1, imm:$glc, imm:$slc, imm:$tfe)), (idxen $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc), @@ -2107,7 +2265,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe >; def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, v2i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, imm, 1, 1, imm:$glc, imm:$slc, imm:$tfe)), (bothen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc), @@ -2128,7 +2286,7 @@ defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_ // TBUFFER_STORE_FORMAT_*, addr64=0 class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat< - (SItbuffer_store i128:$rsrc, vt:$vdata, num_channels, i32:$vaddr, + (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, i32:$soffset, imm:$inst_offset, imm:$dfmt, imm:$nfmt, imm:$offen, imm:$idxen, imm:$glc, imm:$slc, imm:$tfe), @@ -2156,12 +2314,13 @@ defm V_CEIL_F64 : VOP1_64 <0x00000018, "V_CEIL_F64", defm V_FLOOR_F64 : VOP1_64 <0x0000001A, "V_FLOOR_F64", [(set f64:$dst, (ffloor f64:$src0))] >; +defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64", + [(set f64:$dst, (frint f64:$src0))] +>; -defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64", []>; - -def V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>; -def V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>; -def V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>; +defm V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>; +defm V_MQSAD_U16_U8 : VOP3_32 <0x000000172, 
"V_MQSAD_U16_U8", []>; +defm V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>; def V_MAD_U64_U32 : VOP3_64 <0x00000176, "V_MAD_U64_U32", []>; // XXX - Does this set VCC? @@ -2248,17 +2407,43 @@ def : Pat< >; //===----------------------------------------------------------------------===// -// Miscellaneous Patterns +// Conversion Patterns //===----------------------------------------------------------------------===// +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 + +// TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it +// might not be worth the effort, and will need to expand to shifts when +// fixing SGPR copies. + +// Handle sext_inreg in i64 def : Pat < - (i64 (trunc i128:$x)), + (i64 (sext_inreg i64:$src, i1)), (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (i32 (EXTRACT_SUBREG $x, sub0)), sub0), - (i32 (EXTRACT_SUBREG $x, sub1)), sub1) + (S_BFE_I32 (EXTRACT_SUBREG i64:$src, sub0), 65536), sub0), // 0 | 1 << 16 + (S_MOV_B32 -1), sub1) >; def : Pat < + (i64 (sext_inreg i64:$src, i8)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_SEXT_I32_I8 (EXTRACT_SUBREG i64:$src, sub0)), sub0), + (S_MOV_B32 -1), sub1) +>; + +def : Pat < + (i64 (sext_inreg i64:$src, i16)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_SEXT_I32_I16 (EXTRACT_SUBREG i64:$src, sub0)), sub0), + (S_MOV_B32 -1), sub1) +>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Patterns +//===----------------------------------------------------------------------===// + +def : Pat < (i32 (trunc i64:$a)), (EXTRACT_SUBREG $a, sub0) >; diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index c2f8696..6601f2a 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -67,7 +67,7 @@ private: static const unsigned SkipThreshold = 12; static char ID; - const TargetRegisterInfo *TRI; + const SIRegisterInfo *TRI; const SIInstrInfo *TII; bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); @@ -92,11 +92,11 @@ private: public: SILowerControlFlowPass(TargetMachine &tm) : - MachineFunctionPass(ID), TRI(0), TII(0) { } + MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "SI Lower control flow instructions"; } @@ -427,7 +427,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo()); - TRI = MF.getTarget().getRegisterInfo(); + TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo()); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); bool HaveKill = false; diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp new file mode 100644 index 0000000..738c90b --- /dev/null +++ b/lib/Target/R600/SILowerI1Copies.cpp @@ -0,0 +1,148 @@ +//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// i1 values are usually inserted by the CFG Structurize pass and they are +/// unique in that they can be copied from VALU to SALU registers. +/// This is not possible for any other value type. Since there are no +/// MOV instructions for i1, we need to use V_CMP_* and V_CNDMASK to move the i1. +/// +//===----------------------------------------------------------------------===// +// + +#define DEBUG_TYPE "si-i1-copies" +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +namespace { + +class SILowerI1Copies : public MachineFunctionPass { +public: + static char ID; + +public: + SILowerI1Copies() : MachineFunctionPass(ID) { + initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnMachineFunction(MachineFunction &MF) override; + + virtual const char *getPassName() const override { + return "SI Lower i1 Copies"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) + +char SILowerI1Copies::ID = 0; + +char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID; + +FunctionPass *llvm::createSILowerI1CopiesPass() { + return new SILowerI1Copies(); +} + +bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + MF.getTarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + std::vector<unsigned> I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (MI.getOpcode() == AMDGPU::V_MOV_I1) { + I1Defs.push_back(MI.getOperand(0).getReg()); + MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32)); + continue; + } + + if (MI.getOpcode() == AMDGPU::V_AND_I1) { + I1Defs.push_back(MI.getOperand(0).getReg()); + MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32)); + continue; + } + + if (MI.getOpcode() == AMDGPU::V_OR_I1) { + I1Defs.push_back(MI.getOperand(0).getReg()); + MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32)); + continue; + } + + if (MI.getOpcode() != AMDGPU::COPY || + !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) || + !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg())) + continue; + + + const TargetRegisterClass *DstRC = + MRI.getRegClass(MI.getOperand(0).getReg()); + const TargetRegisterClass *SrcRC = + MRI.getRegClass(MI.getOperand(1).getReg()); + + if (DstRC == &AMDGPU::VReg_1RegClass && + TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { + I1Defs.push_back(MI.getOperand(0).getReg()); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64)) + .addOperand(MI.getOperand(0)) +
.addImm(0) + .addImm(-1) + .addOperand(MI.getOperand(1)) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0); + MI.eraseFromParent(); + } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && + SrcRC == &AMDGPU::VReg_1RegClass) { + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) + .addOperand(MI.getOperand(0)) + .addImm(0) + .addOperand(MI.getOperand(1)) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0); + MI.eraseFromParent(); + } + } + } + + for (unsigned Reg : I1Defs) + MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass); + + return false; +} diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp index ea04346..af60995 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -10,8 +10,11 @@ #include "SIMachineFunctionInfo.h" +#include "SIInstrInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" #define MAX_LANES 64 @@ -26,21 +29,57 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) PSInputAddr(0), SpillTracker() { } -static unsigned createLaneVGPR(MachineRegisterInfo &MRI) { - return MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); +static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) { + unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + + // We need to add this register as live out for the function, in order to + // have the live range calculated directly. + // + // When register spilling begins, we have already calculated the live + // intervals for all the registers. Since we are spilling SGPRs to + // VGPRs, we need to update the Lane VGPR's live interval every time we + // spill or restore a register. + // + // Unfortunately, there is no good way to update the live interval as + // the TargetInstrInfo callbacks for spilling and restoring don't give + // us access to the live interval information. + // + // We are lucky, though, because the InlineSpiller calls + // LiveRangeEdit::calculateRegClassAndHint() which iterates through + // all the new registers that have been created when restoring a register + // and calls LiveIntervals::getInterval(), which creates and computes + // the live interval for the newly created register. However, once this + // live interval is created, it doesn't change and since we usually reuse + // the Lane VGPR multiple times, this means any uses after the first aren't + // added to the live interval. + // + // To work around this, we add Lane VGPRs to the function's live-out list, + // so that we can guarantee their live ranges will cover all of their uses.
+ + for (MachineBasicBlock &MBB : *MF) { + if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) { + MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true)); + return VGPR; + } + } + MF->getFunction()->getContext().emitError( + "Could not find S_ENDPGM instruction."); + return VGPR; } -unsigned SIMachineFunctionInfo::RegSpillTracker::getNextLane(MachineRegisterInfo &MRI) { +unsigned SIMachineFunctionInfo::RegSpillTracker::reserveLanes( + MachineRegisterInfo &MRI, MachineFunction *MF, unsigned NumRegs) { + unsigned StartLane = CurrentLane; + CurrentLane += NumRegs; if (!LaneVGPR) { - LaneVGPR = createLaneVGPR(MRI); + LaneVGPR = createLaneVGPR(MRI, MF); } else { - CurrentLane++; - if (CurrentLane == MAX_LANES) { - CurrentLane = 0; - LaneVGPR = createLaneVGPR(MRI); + if (CurrentLane >= MAX_LANES) { + StartLane = CurrentLane = 0; + LaneVGPR = createLaneVGPR(MRI, MF); } } - return CurrentLane; + return StartLane; } void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex, diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h index 8dc82a0..96e619b 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -25,7 +25,7 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { - virtual void anchor(); + void anchor() override; public: struct SpilledReg { @@ -43,7 +43,12 @@ public: public: unsigned LaneVGPR; RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { } - unsigned getNextLane(MachineRegisterInfo &MRI); + /// \p NumRegs The number of consecutive registers that need to be spilled. + /// This function will ensure that all registers are stored in + /// the same VGPR. + /// \returns The lane to be used for storing the first register.
+ unsigned reserveLanes(MachineRegisterInfo &MRI, MachineFunction *MF, + unsigned NumRegs = 1); void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1); const SpilledReg& getSpilledReg(unsigned FrameIndex); bool programSpillsRegisters() { return !SpilledRegisters.empty(); } diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index 6cef195..c72d549 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -71,13 +71,12 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { &AMDGPU::SReg_256RegClass }; - for (unsigned i = 0, e = sizeof(BaseClasses) / - sizeof(const TargetRegisterClass*); i != e; ++i) { - if (BaseClasses[i]->contains(Reg)) { - return BaseClasses[i]; + for (const TargetRegisterClass *BaseClass : BaseClasses) { + if (BaseClass->contains(Reg)) { + return BaseClass; } } - return NULL; + return nullptr; } bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const { @@ -113,7 +112,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { return &AMDGPU::VReg_512RegClass; } - return NULL; + return nullptr; } const TargetRegisterClass *SIRegisterInfo::getSubRegClass( @@ -129,3 +128,10 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return &AMDGPU::VGPR_32RegClass; } } + +unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, + const TargetRegisterClass *SubRC, + unsigned Channel) const { + unsigned Index = getHWRegIndex(Reg); + return SubRC->getRegister(Index + Channel); +} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index 8148f7f..36b4fcd 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -27,22 +27,22 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { SIRegisterInfo(AMDGPUTargetMachine &tm); - virtual BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; - virtual unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; /// \param RC is an AMDIL reg class. /// /// \returns the SI register class that is equivalent to \p RC. - virtual const TargetRegisterClass * - getISARegClass(const TargetRegisterClass *RC) const; + const TargetRegisterClass * + getISARegClass(const TargetRegisterClass *RC) const override; /// \brief get the register class of the specified type to use in the /// CFGStructurizer - virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const; + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - virtual unsigned getHWRegIndex(unsigned Reg) const; + unsigned getHWRegIndex(unsigned Reg) const override; /// \brief Return the 'base' register class for this register. /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc. @@ -63,6 +63,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { /// be returned. const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, unsigned SubIdx) const; + + /// \p Channel This is the register channel (e.g. a value from 0-16), not the + /// SubReg index. + /// \returns The sub-register of Reg that is in Channel. 
+ unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, + unsigned Channel) const; }; } // End namespace llvm diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index 65cf311..f1f01de 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -168,7 +168,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64, (add SGPR_64Regs, VCCReg, EXECReg) >; -def SReg_128 : RegisterClass<"AMDGPU", [i128, v4i32], 128, (add SGPR_128)>; +def SReg_128 : RegisterClass<"AMDGPU", [v4i32], 128, (add SGPR_128)>; def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; @@ -183,14 +183,16 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { let Size = 96; } -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, i128], 128, (add VGPR_128)>; +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>; def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>; + //===----------------------------------------------------------------------===// -// [SV]Src_* register classes, can have either an immediate or an register +// [SV]Src_(32|64) register classes, can have either an immediate or an register //===----------------------------------------------------------------------===// def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>; @@ -201,3 +203,9 @@ def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>; def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; +//===----------------------------------------------------------------------===// +// SGPR and VGPR register classes +//===----------------------------------------------------------------------===// + +def VSrc_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, + (add VReg_128, SReg_128)>; diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp index 9bf2caf..a0b6907 100644 --- a/lib/Target/R600/SITypeRewriter.cpp +++ b/lib/Target/R600/SITypeRewriter.cpp @@ -35,13 +35,13 @@ class SITypeRewriter : public FunctionPass, static char ID; Module *Mod; Type *v16i8; - Type *i128; + Type *v4i32; public: SITypeRewriter() : FunctionPass(ID) { } - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - virtual const char *getPassName() const { + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "SI Type Rewriter"; } void visitLoadInst(LoadInst &I); @@ -56,7 +56,7 @@ char SITypeRewriter::ID = 0; bool SITypeRewriter::doInitialization(Module &M) { Mod = &M; v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); - i128 = Type::getIntNTy(M.getContext(), 128); + v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); return false; } @@ -84,7 +84,8 @@ void SITypeRewriter::visitLoadInst(LoadInst &I) { Type *ElemTy = PtrTy->getPointerElementType(); IRBuilder<> Builder(&I); if (ElemTy == v16i8) { - Value *BitCast = Builder.CreateBitCast(Ptr, Type::getIntNPtrTy(I.getContext(), 128, 2)); + Value *BitCast = Builder.CreateBitCast(Ptr, + PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); LoadInst *Load = Builder.CreateLoad(BitCast); SmallVector <std::pair<unsigned, MDNode*>, 8> MD; I.getAllMetadataOtherThanDebugLoc(MD); @@ -99,6 +100,7 @@ void 
SITypeRewriter::visitLoadInst(LoadInst &I) { void SITypeRewriter::visitCallInst(CallInst &I) { IRBuilder<> Builder(&I); + SmallVector <Value*, 8> Args; SmallVector <Type*, 8> Types; bool NeedToReplace = false; @@ -107,10 +109,10 @@ void SITypeRewriter::visitCallInst(CallInst &I) { for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { Value *Arg = I.getArgOperand(i); if (Arg->getType() == v16i8) { - Args.push_back(Builder.CreateBitCast(Arg, i128)); - Types.push_back(i128); + Args.push_back(Builder.CreateBitCast(Arg, v4i32)); + Types.push_back(v4i32); NeedToReplace = true; - Name = Name + ".i128"; + Name = Name + ".v4i32"; } else if (Arg->getType()->isVectorTy() && Arg->getType()->getVectorNumElements() == 1 && Arg->getType()->getVectorElementType() == @@ -144,12 +146,12 @@ void SITypeRewriter::visitCallInst(CallInst &I) { void SITypeRewriter::visitBitCast(BitCastInst &I) { IRBuilder<> Builder(&I); - if (I.getDestTy() != i128) { + if (I.getDestTy() != v4i32) { return; } if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) { - if (Op->getSrcTy() == i128) { + if (Op->getSrcTy() == v4i32) { I.replaceAllUsesWith(Op->getOperand(0)); I.eraseFromParent(); } |
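The FABS_SI, FNEG_SI and FCLAMP_SI pseudos introduced in the SIInstructions.td hunks above set usesCustomInserter, but the corresponding EmitInstrWithCustomInserter changes are not part of the hunks shown here. Judging from the bitwise patterns the patch removes, the expansion presumably looks roughly like the sketch below; the helper name and the exact BuildMI sequence are assumptions, not code from this commit.

static MachineBasicBlock *expandFAbsSI(MachineInstr *MI, MachineBasicBlock *BB,
                                       const SIInstrInfo *TII) {
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  DebugLoc DL = MI->getDebugLoc();

  // Clear the IEEE-754 sign bit, exactly what the removed
  // (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) pattern did.
  // FNEG_SI would instead toggle the bit with V_XOR_B32_e32 and 0x80000000,
  // and FCLAMP_SI maps back onto V_ADD_F32_e64 with the clamp bit set.
  unsigned Mask = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), Mask)
      .addImm(0x7fffffff);
  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_AND_B32_e32),
          MI->getOperand(0).getReg())
      .addReg(MI->getOperand(1).getReg())
      .addReg(Mask);
  MI->eraseFromParent();
  return BB;
}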
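The sext_inreg conversion patterns added above feed S_BFE_I32 a packed bitfield descriptor as its second operand: the bit offset sits in the low bits and the field width starts at bit 16, which is where the otherwise opaque 65536 comes from. A small compile-time check of that encoding, with a helper name invented purely for illustration:

// Pack an S_BFE_* operand: bit offset in the low bits, field width at bit 16.
constexpr unsigned bfeDescriptor(unsigned Offset, unsigned Width) {
  return Offset | (Width << 16);
}

static_assert(bfeDescriptor(0, 1) == 65536, "i1: 0 | 1 << 16, as used above");
static_assert(bfeDescriptor(0, 8) == 0x80000, "i8 would be 0 | 8 << 16");
static_assert(bfeDescriptor(0, 16) == 0x100000, "i16 would be 0 | 16 << 16");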
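The two new register helpers, SIRegisterInfo::getPhysRegSubReg() and RegSpillTracker::reserveLanes(), are easiest to read with a usage sketch. Everything below is illustrative only: the concrete register names, the SpillTracker member access and the surrounding function are assumptions based on this diff, not verified against the rest of the tree.

static void spillSketch(MachineFunction &MF, const SIRegisterInfo *TRI) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // getPhysRegSubReg: pick one 32-bit channel out of a wider physical register.
  // With Reg = SGPR4_SGPR5 (hardware index 4), SubRC = SGPR_32 and Channel = 1,
  // the expected result is SGPR5, i.e. the register at index 4 + 1.
  unsigned Half = TRI->getPhysRegSubReg(AMDGPU::SGPR4_SGPR5,
                                        &AMDGPU::SGPR_32RegClass, 1);
  (void)Half;

  // reserveLanes: reserve two consecutive lanes of a single lane VGPR so that
  // both 32-bit halves of a 64-bit SGPR spill land in the same VGPR. The
  // return value is the lane used for the first half; the second half goes
  // to StartLane + 1.
  unsigned StartLane = MFI->SpillTracker.reserveLanes(MRI, &MF, /*NumRegs=*/2);
  (void)StartLane;
}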