author     Stephen Hines <srhines@google.com>   2014-07-21 00:45:20 -0700
committer  Stephen Hines <srhines@google.com>   2014-07-21 00:45:20 -0700
commit     c6a4f5e819217e1e12c458aed8e7b122e23a3a58 (patch)
tree       81b7dd2bb4370a392f31d332a566c903b5744764 /lib/Target/R600
parent     19c6fbb3e8aaf74093afa08013134b61fa08f245 (diff)
download   external_llvm-c6a4f5e819217e1e12c458aed8e7b122e23a3a58.zip
           external_llvm-c6a4f5e819217e1e12c458aed8e7b122e23a3a58.tar.gz
           external_llvm-c6a4f5e819217e1e12c458aed8e7b122e23a3a58.tar.bz2
Update LLVM for rebase to r212749.
Includes a cherry-pick of:
r212948 - fixes a small issue with atomic calls
Change-Id: Ib97bd980b59f18142a69506400911a6009d9df18
Diffstat (limited to 'lib/Target/R600')
63 files changed, 3935 insertions, 2284 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 949fdfb..713fc4b 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -17,6 +17,7 @@ namespace llvm { class AMDGPUInstrPrinter; +class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; class MCAsmInfo; @@ -40,6 +41,7 @@ FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); +FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); @@ -47,14 +49,18 @@ void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; // Passes common to R600 and SI +FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); -FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); /// \brief Creates an AMDGPU-specific Target Transformation Info pass. ImmutablePass * createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM); +void initializeSIFixSGPRLiveRangesPass(PassRegistry&); +extern char &SIFixSGPRLiveRangesID; + + extern Target TheAMDGPUTarget; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index 2edc115..6ff9ab7 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -7,8 +7,7 @@ // //==-----------------------------------------------------------------------===// -// Include AMDIL TD files -include "AMDILBase.td" +include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// // Subtarget Features @@ -33,30 +32,25 @@ def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", "false", "Disable the if conversion pass">; -def FeatureFP64 : SubtargetFeature<"fp64", +def FeatureFP64 : SubtargetFeature<"fp64", "FP64", "true", - "Enable 64bit double precision operations">; + "Enable double precision operations">; def Feature64BitPtr : SubtargetFeature<"64BitPtr", "Is64bit", "true", - "Specify if 64bit addressing should be used.">; - -def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr", - "Is32on64bit", - "false", - "Specify if 64bit sized pointers with 32bit addressing should be used.">; + "Specify if 64-bit addressing should be used">; def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", "R600ALUInst", "false", - "Older version of ALU instructions encoding.">; + "Older version of ALU instructions encoding">; def FeatureVertexCache : SubtargetFeature<"HasVertexCache", "HasVertexCache", "true", - "Specify use of dedicated vertex cache.">; + "Specify use of dedicated vertex cache">; def FeatureCaymanISA : SubtargetFeature<"caymanISA", "CaymanISA", @@ -87,28 +81,40 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; +class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< + "localmemorysize"#Value, + "LocalMemorySize", + !cast<string>(Value), + "The size of local memory in bytes">; + class SubtargetFeatureGeneration <string Value, list<SubtargetFeature> Implies> : SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value, Value#" GPU generation", Implies>; +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; +def 
FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; + def FeatureR600 : SubtargetFeatureGeneration<"R600", - [FeatureR600ALUInst, FeatureFetchLimit8]>; + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; def FeatureR700 : SubtargetFeatureGeneration<"R700", - [FeatureFetchLimit16]>; + [FeatureFetchLimit16, FeatureLocalMemorySize0]>; def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", - [FeatureFetchLimit16]>; + [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16, FeatureWavefrontSize64]>; + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] +>; def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - [Feature64BitPtr, FeatureFP64]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>; //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -120,6 +126,10 @@ def AMDGPU : Target { let InstructionSet = AMDGPUInstrInfo; } +// Dummy Instruction itineraries for pseudo instructions +def ALU_NULL : FuncUnit; +def NullALU : InstrItinClass; + //===----------------------------------------------------------------------===// // Predicate helper class //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 170f479..a6e217b 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -19,6 +19,7 @@ #include "AMDGPUAsmPrinter.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" @@ -35,6 +36,24 @@ using namespace llvm; +// TODO: This should get the default rounding mode from the kernel. We just set +// the default here, but this could change if the OpenCL rounding mode pragmas +// are used. +// +// The denormal mode here should match what is reported by the OpenCL runtime +// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but +// can also be override to flush with the -cl-denorms-are-zero compiler flag. +// +// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double +// precision, and leaves single precision to flush all and does not report +// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports +// CL_FP_DENORM for both. 
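As an illustration of the value the getFPMode() helper below computes: a minimal C++ sketch, assuming the usual SI MODE register layout of four 2-bit fields (SP rounding at bit 0, DP rounding at bit 2, SP denorm at bit 4, DP denorm at bit 6). The real shifts come from the FP_ROUND_MODE_* and FP_DENORM_MODE_* macros, so treat the offsets here as assumptions.

#include <cstdint>

// Sketch only: field offsets are assumed, not copied from SIDefines.h.
static uint32_t packFPMode(uint32_t SPRound, uint32_t DPRound,
                           uint32_t SPDenorm, uint32_t DPDenorm) {
  return (SPRound  & 0x3)         // FP_ROUND_MODE_SP
       | ((DPRound  & 0x3) << 2)  // FP_ROUND_MODE_DP
       | ((SPDenorm & 0x3) << 4)  // FP_DENORM_MODE_SP
       | ((DPDenorm & 0x3) << 6); // FP_DENORM_MODE_DP
}
// With round-to-nearest (0) for both sizes and FP_DENORM_FLUSH_NONE (3),
// the packed value would be 0xF0: denormals preserved, default rounding.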
+static uint32_t getFPMode(MachineFunction &) { + return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | + FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | + FP_DENORM_MODE_SP(FP_DENORM_FLUSH_NONE) | + FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE); +} static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, MCStreamer &Streamer) { @@ -92,6 +111,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), false); + OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), + false); + OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), + false); } else { R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); OutStreamer.emitRawComment( @@ -279,16 +302,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (VCCUsed) MaxSGPR += 2; - ProgInfo.CodeLen = CodeSize; - ProgInfo.NumSGPR = MaxSGPR; ProgInfo.NumVGPR = MaxVGPR; + ProgInfo.NumSGPR = MaxSGPR; + + // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode + // register. + ProgInfo.FloatMode = getFPMode(MF); + + // XXX: Not quite sure what this does, but sc seems to unset this. + ProgInfo.IEEEMode = 0; + + // Do not clamp NAN to 0. + ProgInfo.DX10Clamp = 0; + + ProgInfo.CodeLen = CodeSize; } void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, const SIProgramInfo &KernelInfo) { const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + unsigned RsrcReg; switch (MFI->ShaderType) { default: // Fall through @@ -298,25 +332,41 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break; } - OutStreamer.EmitIntValue(RsrcReg, 4); - OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | - S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4); - unsigned LDSAlignShift; if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - // LDS is allocated in 64 dword blocks + // LDS is allocated in 64 dword blocks. LDSAlignShift = 8; } else { - // LDS is allocated in 128 dword blocks + // LDS is allocated in 128 dword blocks. 
LDSAlignShift = 9; } + unsigned LDSBlocks = - RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; if (MFI->ShaderType == ShaderType::COMPUTE) { + OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); + + const uint32_t ComputePGMRSrc1 = + S_00B848_VGPRS(KernelInfo.NumVGPR / 4) | + S_00B848_SGPRS(KernelInfo.NumSGPR / 8) | + S_00B848_PRIORITY(KernelInfo.Priority) | + S_00B848_FLOAT_MODE(KernelInfo.FloatMode) | + S_00B848_PRIV(KernelInfo.Priv) | + S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) | + S_00B848_IEEE_MODE(KernelInfo.DebugMode) | + S_00B848_IEEE_MODE(KernelInfo.IEEEMode); + + OutStreamer.EmitIntValue(ComputePGMRSrc1, 4); + OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4); + } else { + OutStreamer.EmitIntValue(RsrcReg, 4); + OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | + S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4); } + if (MFI->ShaderType == ShaderType::PIXEL) { OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4); diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index 71adc9a..c1acb6e 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -25,13 +25,28 @@ class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { SIProgramInfo() : - CodeLen(0), + NumVGPR(0), NumSGPR(0), - NumVGPR(0) {} + Priority(0), + FloatMode(0), + Priv(0), + DX10Clamp(0), + DebugMode(0), + IEEEMode(0), + CodeLen(0) {} + // Fields set in PGM_RSRC1 pm4 packet. + uint32_t NumVGPR; + uint32_t NumSGPR; + uint32_t Priority; + uint32_t FloatMode; + uint32_t Priv; + uint32_t DX10Clamp; + uint32_t DebugMode; + uint32_t IEEEMode; + + // Bonus information for debugging. uint64_t CodeLen; - unsigned NumSGPR; - unsigned NumVGPR; }; void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const; diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp deleted file mode 100644 index 91aeee2..0000000 --- a/lib/Target/R600/AMDGPUConvertToISA.cpp +++ /dev/null @@ -1,62 +0,0 @@ -//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This pass lowers AMDIL machine instructions to the appropriate -/// hardware instructions. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUInstrInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" - -using namespace llvm; - -namespace { - -class AMDGPUConvertToISAPass : public MachineFunctionPass { - -private: - static char ID; - TargetMachine &TM; - -public: - AMDGPUConvertToISAPass(TargetMachine &tm) : - MachineFunctionPass(ID), TM(tm) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override {return "AMDGPU Convert to ISA";} - -}; - -} // End anonymous namespace - -char AMDGPUConvertToISAPass::ID = 0; - -FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) { - return new AMDGPUConvertToISAPass(tm); -} - -bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) { - const AMDGPUInstrInfo * TII = - static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo()); - - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - TII->convertToISA(MI, MF, MBB.findDebugLoc(I)); - } - } - return false; -} diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp index e7e90d3..9e8302e 100644 --- a/lib/Target/R600/AMDGPUFrameLowering.cpp +++ b/lib/Target/R600/AMDGPUFrameLowering.cpp @@ -83,7 +83,7 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); OffsetBytes += MFI->getObjectSize(i); - // Each regiter holds 4 bytes, so we must always align the offset to at + // Each register holds 4 bytes, so we must always align the offset to at // least 4 bytes, so that 2 frame objects won't share the same register. OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); } diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index f1f0bfa..b4d79e5 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -14,6 +14,7 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" #include "SIISelLowering.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -83,6 +84,11 @@ private: SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset, + SDValue &ImmOffset) const; + + SDNode *SelectADD_SUB_I64(SDNode *N); + SDNode *SelectDIV_SCALE(SDNode *N); // Include the pieces autogenerated from the target description. #include "AMDGPUGenDAGISel.inc" @@ -211,51 +217,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // We are selecting i64 ADD here instead of custom lower it during // DAG legalization, so we can fold some i64 ADDs used for address // calculation into the LOAD and STORE instructions. 
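The replacement SelectADD_SUB_I64 (further down in this file's diff) performs the standard wide-add decomposition: an add of the low halves that produces a carry, then a carry-consuming add of the high halves. A scalar C++ sketch of the same idea, with helper names of my own choosing:

#include <cstdint>

// 64-bit add built from two 32-bit adds, mirroring S_ADD_I32 followed by
// S_ADDC_U32: the low add's unsigned wraparound is the carry into the
// high add.
static uint64_t add64(uint32_t ALo, uint32_t AHi,
                      uint32_t BLo, uint32_t BHi) {
  uint32_t Lo = ALo + BLo;
  uint32_t Carry = Lo < ALo;        // unsigned wrap means carry out
  uint32_t Hi = AHi + BHi + Carry;  // high half consumes the carry
  return ((uint64_t)Hi << 32) | Lo;
}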
- case ISD::ADD: { + case ISD::ADD: + case ISD::SUB: { if (N->getValueType(0) != MVT::i64 || ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; - SDLoc DL(N); - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32); - - SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub0); - SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub1); - - SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub0); - SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub1); - - SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); - - SmallVector<SDValue, 8> AddLoArgs; - AddLoArgs.push_back(SDValue(Lo0, 0)); - AddLoArgs.push_back(SDValue(Lo1, 0)); - - SDNode *AddLo = CurDAG->getMachineNode( - isCFDepth0() ? AMDGPU::S_ADD_I32 : AMDGPU::V_ADD_I32_e32, - DL, VTList, AddLoArgs); - SDValue Carry = SDValue(AddLo, 1); - SDNode *AddHi = CurDAG->getMachineNode( - isCFDepth0() ? AMDGPU::S_ADDC_U32 : AMDGPU::V_ADDC_U32_e32, - DL, MVT::i32, SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); - - SDValue Args[5] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), - SDValue(AddLo,0), - Sub0, - SDValue(AddHi,0), - Sub1, - }; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); + return SelectADD_SUB_I64(N); } + case ISD::SCALAR_TO_VECTOR: + case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { unsigned RegClassID; const AMDGPURegisterInfo *TRI = @@ -264,7 +235,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { static_cast<const SIRegisterInfo*>(TM.getRegisterInfo()); EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); - assert(VT.getVectorElementType().bitsEq(MVT::i32)); + EVT EltVT = VT.getVectorElementType(); + assert(EltVT.bitsEq(MVT::i32)); if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { bool UseVReg = true; for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); @@ -305,7 +277,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // can't be bundled by our scheduler. 
switch(NumVectorElts) { case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; - case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break; + case 4: + if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) + RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + else + RegClassID = AMDGPU::R600_Reg128RegClassID; + break; default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); } } @@ -313,8 +290,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32); if (NumVectorElts == 1) { - return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, - VT.getVectorElementType(), + return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0), RegClass); } @@ -323,11 +299,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // 16 = Max Num Vector Elements // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) // 1 = Vector Register Class - SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(N->getNumOperands() * 2 + 1); + SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32); bool IsRegSeq = true; - for (unsigned i = 0; i < N->getNumOperands(); i++) { + unsigned NOps = N->getNumOperands(); + for (unsigned i = 0; i < NOps; i++) { // XXX: Why is this here? if (dyn_cast<RegisterSDNode>(N->getOperand(i))) { IsRegSeq = false; @@ -337,6 +314,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32); } + + if (NOps != NumVectorElts) { + // Fill in the missing undef elements if this was a scalar_to_vector. + assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); + + MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + SDLoc(N), EltVT); + for (unsigned i = NOps; i < NumVectorElts; ++i) { + RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32); + } + } + if (!IsRegSeq) break; return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), @@ -466,6 +457,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { PackedOffsetWidth); } + case AMDGPUISD::DIV_SCALE: { + return SelectDIV_SCALE(N); + } } return SelectCode(N); } @@ -659,6 +653,129 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } +SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + bool IsAdd = (N->getOpcode() == ISD::ADD); + + SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32); + + SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub0); + SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub1); + + SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub0); + SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub1); + + SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); + SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; + + + unsigned Opc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32; + unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + + if (!isCFDepth0()) { + Opc = IsAdd ? AMDGPU::V_ADD_I32_e32 : AMDGPU::V_SUB_I32_e32; + CarryOpc = IsAdd ? 
AMDGPU::V_ADDC_U32_e32 : AMDGPU::V_SUBB_U32_e32; + } + + SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); + SDValue Carry(AddLo, 1); + SDNode *AddHi + = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, + SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); + + SDValue Args[5] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + SDValue(AddLo,0), + Sub0, + SDValue(AddHi,0), + Sub1, + }; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); +} + +SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { + SDLoc SL(N); + EVT VT = N->getValueType(0); + + assert(VT == MVT::f32 || VT == MVT::f64); + + unsigned Opc + = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; + + const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); + + SDValue Ops[] = { + N->getOperand(0), + N->getOperand(1), + N->getOperand(2), + Zero, + Zero, + Zero, + Zero + }; + + return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); +} + +static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) { + return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32, + Ptr), 0); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, + SDValue &Offset, + SDValue &ImmOffset) const { + SDLoc DL(Addr); + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + + if (isUInt<12>(C1->getZExtValue())) { + + if (N0.getOpcode() == ISD::ADD) { + // (add (add N2, N3), C1) + SDValue N2 = N0.getOperand(0); + SDValue N3 = N0.getOperand(1); + Ptr = wrapAddr64Rsrc(CurDAG, DL, N2); + Offset = N3; + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return true; + } + + // (add N0, C1) + Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64));; + Offset = N0; + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return true; + } + } + if (Addr.getOpcode() == ISD::ADD) { + // (add N0, N1) + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + Ptr = wrapAddr64Rsrc(CurDAG, DL, N0); + Offset = N1; + ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); + return true; + } + + // default case + Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64)); + Offset = Addr; + ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); + return true; +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 6c443ea..0ada7a3 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -16,9 +16,9 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" #include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILIntrinsicInfo.h" #include "R600MachineFunctionInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -84,13 +84,37 @@ static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, #include "AMDGPUGenCallingConv.inc" +// Find a larger type to do a load / store of a vector with. 
+EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, StoreSize); + + assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + +// Type for a vector that will be loaded to. +EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, 32); + + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { Subtarget = &TM.getSubtarget<AMDGPUSubtarget>(); - // Initialize target lowering borrowed from AMDIL - InitAMDILLowering(); + setOperationAction(ISD::Constant, MVT::i32, Legal); + setOperationAction(ISD::Constant, MVT::i64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); // We need to custom lower some of the intrinsics setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -107,9 +131,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - // The hardware supports ROTR, but not ROTL - setOperationAction(ISD::ROTL, MVT::i32, Expand); - // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. setOperationAction(ISD::STORE, MVT::f32, Promote); @@ -118,6 +139,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::v2f32, Promote); AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::i64, Promote); + AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); @@ -161,6 +185,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::v2f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); @@ -202,29 +229,63 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + } - setOperationAction(ISD::FNEG, MVT::v2f32, Expand); - setOperationAction(ISD::FNEG, MVT::v4f32, Expand); + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. 
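To make the BFI remark just below concrete: a bitfield insert implements copysign by splicing one operand's sign bit into the other's magnitude bits. A hypothetical scalar equivalent on f32 bit patterns (helper name is mine):

#include <cstdint>

// copysign on raw f32 bits: keep Mag's exponent and mantissa, take Sign's
// sign bit. On GCN this maps to a single V_BFI_B32 with the sign-bit mask.
static uint32_t copysignBits(uint32_t Mag, uint32_t Sign) {
  const uint32_t SignMask = UINT32_C(1) << 31;
  return (Mag & ~SignMask) | (Sign & SignMask);
}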
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::SUB, MVT::i64, Expand); + // GPU does not have divrem function for signed or unsigned. + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Custom); + + // GPU does not have [S|U]MUL_LOHI functions as a single instruction. + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + // The hardware supports 32-bit ROTR, but not ROTL. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTR, MVT::i64, Expand); + + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); - setOperationAction(ISD::UDIVREM, MVT::i32, Custom); - setOperationAction(ISD::UDIVREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::VSELECT, MVT::v2f32, Expand); - setOperationAction(ISD::VSELECT, MVT::v4f32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - static const MVT::SimpleValueType IntTypes[] = { + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; - for (MVT VT : IntTypes) { - //Expand the following operations for the current type by default + for (MVT VT : VectorIntTypes) { + // Expand the following operations for the current type by default. setOperationAction(ISD::ADD, VT, Expand); setOperationAction(ISD::AND, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); @@ -232,40 +293,93 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::MUL, VT, Expand); setOperationAction(ISD::OR, VT, Expand); setOperationAction(ISD::SHL, VT, Expand); - setOperationAction(ISD::SINT_TO_FP, VT, Expand); - setOperationAction(ISD::SRL, VT, Expand); setOperationAction(ISD::SRA, VT, Expand); + setOperationAction(ISD::SRL, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::SUB, VT, Expand); - setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); + // TODO: Implement custom UREM / SREM routines. 
+ setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::XOR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); } - static const MVT::SimpleValueType FloatTypes[] = { + static const MVT::SimpleValueType FloatVectorTypes[] = { MVT::v2f32, MVT::v4f32 }; - for (MVT VT : FloatTypes) { + for (MVT VT : FloatVectorTypes) { setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FMUL, VT, Expand); + setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); } + setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); + setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT_CC); + + setSchedulingPreference(Sched::RegPressure); + setJumpIsExpensive(true); + + setSelectIsExpensive(false); + PredictableSelectIsExpensive = false; + + // There are no integer divide instructions, and these expand to a pretty + // large sequence of instructions. + setIntDivIsCheap(false); + setPow2DivIsCheap(false); + + // TODO: Investigate this when 64-bit divides are implemented. + addBypassSlowDiv(64, 32); + + // FIXME: Need to really handle these. + MaxStoresPerMemcpy = 4096; + MaxStoresPerMemmove = 4096; + MaxStoresPerMemset = 4096; } //===----------------------------------------------------------------------===// @@ -276,6 +390,23 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const { return MVT::i32; } +bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { + return true; +} + +// The backend supports 32 and 64 bit floating point immediates. +// FIXME: Why are we reporting vectors of FP immediates as legal? 
+bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); +} + +// We don't want to shrink f64 / f32 constants. +bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); +} + bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy) const { if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) @@ -330,6 +461,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { return Src == MVT::i32 && Dest == MVT::i64; } +bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + return isZExtFree(Val.getValueType(), VT2); +} + bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // There aren't really 64-bit registers, but pairs of 32-bit ones and only a // limited number of native 64-bit operations. Shrinking an operation to fit @@ -383,25 +518,28 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, return SDValue(); } -SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) - const { +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: Op.getNode()->dump(); llvm_unreachable("Custom lowering code for this" "instruction is not implemented yet!"); break; - // AMDIL DAG lowering - case ISD::SDIV: return LowerSDIV(Op, DAG); - case ISD::SREM: return LowerSREM(Op, DAG); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); - case ISD::BRCOND: return LowerBRCOND(Op, DAG); - // AMDGPU DAG lowering case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::SREM: return LowerSREM(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); + case ISD::FCEIL: return LowerFCEIL(Op, DAG); + case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); + case ISD::FRINT: return LowerFRINT(Op, DAG); + case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); + case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); } return Op; @@ -419,95 +557,23 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do // nothing here and let the illegal result integer be handled normally. 
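For reference, SIGN_EXTEND_INREG re-sign-extends the low bits of a wider value in place; a minimal sketch of the i8-in-i32 case (helper name is mine):

#include <cstdint>

// sign_extend_inreg(x, i8) inside an i32: bit 7 of the low byte is
// replicated into bits 8..31. The int8_t cast assumes the usual
// two's-complement narrowing behavior.
static int32_t signExtendInReg8(int32_t X) {
  return static_cast<int8_t>(X & 0xFF);
}
// signExtendInReg8(0xFF) == -1, signExtendInReg8(0x7F) == 127.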
return; - case ISD::UDIV: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), - N->getOperand(0), N->getOperand(1)); - Results.push_back(UDIVREM); - break; - } - case ISD::UREM: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), - N->getOperand(0), N->getOperand(1)); - Results.push_back(UDIVREM.getValue(1)); - break; - } - case ISD::UDIVREM: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - - SDValue one = DAG.getConstant(1, HalfVT); - SDValue zero = DAG.getConstant(0, HalfVT); - - //HiLo split - SDValue LHS = N->getOperand(0); - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); - SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); - - SDValue RHS = N->getOperand(1); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); - SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); - - // Get Speculative values - SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); - SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - - SDValue REM_Hi = zero; - SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - - SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); - SDValue DIV_Lo = zero; - - const unsigned halfBitWidth = HalfVT.getSizeInBits(); - - for (unsigned i = 0; i < halfBitWidth; ++i) { - SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); - // Get Value of high bit - SDValue HBit; - if (halfBitWidth == 32 && Subtarget->hasBFE()) { - HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); - } else { - HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); - HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); - } - - SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, - DAG.getConstant(halfBitWidth - 1, HalfVT)); - REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); - REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); - - REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); - REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); - - - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - - SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); - SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); - - DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); - - // Update REM - - SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); - - REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); - REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); - REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); - } + case ISD::LOAD: { + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + if (!Node) + return; - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); - Results.push_back(DIV); - Results.push_back(REM); - break; + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } + case ISD::STORE: { + SDValue Lowered = 
LowerSTORE(SDValue(N, 0), DAG); + if (Lowered.getNode()) + Results.push_back(Lowered); + return; } default: return; @@ -531,12 +597,14 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, SelectionDAG &DAG) const { const DataLayout *TD = getTargetMachine().getDataLayout(); SDLoc DL(InitPtr); + Type *InitTy = Init->getType(); + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) { - EVT VT = EVT::getEVT(CI->getType()); - PointerType *PtrTy = PointerType::get(CI->getType(), 0); - return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, false, - TD->getPrefTypeAlignment(CI->getType())); + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); } if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { @@ -547,7 +615,6 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, TD->getPrefTypeAlignment(CFP->getType())); } - Type *InitTy = Init->getType(); if (StructType *ST = dyn_cast<StructType>(InitTy)) { const StructLayout *SL = TD->getStructLayout(ST); @@ -589,6 +656,14 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } + if (isa<UndefValue>(Init)) { + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); + } + Init->dump(); llvm_unreachable("Unhandled constant initializer"); } @@ -628,11 +703,19 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, unsigned Size = TD->getTypeAllocSize(EltType); unsigned Alignment = TD->getPrefTypeAlignment(EltType); + MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); + MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); + + int FI = FrameInfo->CreateStackObject(Size, Alignment, false); + SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); + const GlobalVariable *Var = cast<GlobalVariable>(GV); + if (!Var->hasInitializer()) { + // This has no use, but bugpoint will hit it. 
+ return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); + } + const Constant *Init = Var->getInitializer(); - int FI = FrameInfo->CreateStackObject(Size, Alignment, false); - SDValue InitPtr = DAG.getFrameIndex(FI, - getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); SmallVector<SDNode*, 8> WorkList; for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), @@ -651,8 +734,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, } DAG.UpdateNodeOperands(*I, Ops); } - return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), - getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); + return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); } } } @@ -688,8 +770,7 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering()); - FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op); - assert(FIN); + FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FIN->getIndex(); unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); @@ -705,26 +786,66 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { default: return Op; - case AMDGPUIntrinsic::AMDIL_abs: + case AMDGPUIntrinsic::AMDGPU_abs: + case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. return LowerIntrinsicIABS(Op, DAG); - case AMDGPUIntrinsic::AMDIL_exp: - return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); case AMDGPUIntrinsic::AMDGPU_lrp: return LowerIntrinsicLRP(Op, DAG); - case AMDGPUIntrinsic::AMDIL_fraction: + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDIL_max: - return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_clamp: + case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. + return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + SDValue Src0 = Param->isAllOnesValue() ? 
Numerator : Denominator; + + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT, + Src0, Denominator, Numerator); + } + + case Intrinsic::AMDGPU_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::AMDGPU_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_legacy_rsq: + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq_clamped: + return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_imax: return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_umax: return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDIL_min: - return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_imin: return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -748,6 +869,18 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_bfe_i32: return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), @@ -771,8 +904,16 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDIL_round_nearest: + case AMDGPUIntrinsic::AMDGPU_brev: + return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. + return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. 
+ return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); } } @@ -863,27 +1004,41 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const { LoadSDNode *Load = dyn_cast<LoadSDNode>(Op); EVT MemEltVT = Load->getMemoryVT().getVectorElementType(); + EVT LoadVT = Op.getValueType(); EVT EltVT = Op.getValueType().getVectorElementType(); EVT PtrVT = Load->getBasePtr().getValueType(); + unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); SmallVector<SDValue, 8> Loads; + SmallVector<SDValue, 8> Chains; + SDLoc SL(Op); for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT)); - Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, - Load->getChain(), Ptr, - MachinePointerInfo(Load->getMemOperand()->getValue()), - MemEltVT, Load->isVolatile(), Load->isNonTemporal(), - Load->getAlignment())); + + SDValue NewLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, + Load->getChain(), Ptr, + MachinePointerInfo(Load->getMemOperand()->getValue()), + MemEltVT, Load->isVolatile(), Load->isNonTemporal(), + Load->getAlignment()); + Loads.push_back(NewLoad.getValue(0)); + Chains.push_back(NewLoad.getValue(1)); } - return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads); + + SDValue Ops[] = { + DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) + }; + + return DAG.getMergeValues(Ops, SL); } SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const { - StoreSDNode *Store = dyn_cast<StoreSDNode>(Op); + StoreSDNode *Store = cast<StoreSDNode>(Op); EVT MemVT = Store->getMemoryVT(); unsigned MemBits = MemVT.getSizeInBits(); @@ -981,7 +1136,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { Load->getBasePtr(), MemVT, Load->getMemOperand()); - return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32); + + SDValue Ops[] = { + DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32), + ExtLoad32.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); } if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { @@ -995,7 +1156,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr, MVT::i8, MMO); - return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); + + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); } // Lower loads constant address space global variable loads @@ -1003,11 +1170,12 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { isa<GlobalVariable>( GetUnderlyingObject(Load->getMemOperand()->getValue()))) { + SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL, getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, MVT::i32)); - return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), Load->getChain(), Ptr, DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2)); } @@ -1034,10 +1202,21 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT MemEltVT = MemVT.getScalarType(); if (ExtType == ISD::SEXTLOAD) { SDValue MemEltVTNode = DAG.getValueType(MemEltVT); - return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, 
MVT::i32, Ret, MemEltVTNode); + + SDValue Ops[] = { + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); } - return DAG.getZeroExtendInReg(Ret, DL, MemEltVT); + SDValue Ops[] = { + DAG.getZeroExtendInReg(Ret, DL, MemEltVT), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); } SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { @@ -1097,6 +1276,251 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT INTTY; + MVT FLTTY; + if (!OVT.isVector()) { + INTTY = MVT::i32; + FLTTY = MVT::f32; + } else if (OVT.getVectorNumElements() == 2) { + INTTY = MVT::v2i32; + FLTTY = MVT::v2f32; + } else if (OVT.getVectorNumElements() == 4) { + INTTY = MVT::v4i32; + FLTTY = MVT::v4f32; + } + unsigned bitsize = OVT.getScalarType().getSizeInBits(); + // char|short jq = ia ^ ib; + SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, INTTY); + + // int ia = (int)LHS; + SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); + + // int ib, (int)RHS; + SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); + + // float fa = (float)ia; + SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); + + // float fb = (float)ib; + SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); + + // float fq = native_divide(fa, fb); + SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY, + fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb)); + + // fq = trunc(fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); + + // float fqneg = -fq; + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); + + // float fr = mad(fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, + DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); + + // int iq = (int)fq; + SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); + + // fr = fabs(fr); + fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); + + // fb = fabs(fb); + fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); + + // int cv = fr >= fb; + SDValue cv; + if (INTTY == MVT::i32) { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } else { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, + DAG.getConstant(0, OVT)); + // dst = iq + jq; + iq = DAG.getSExtOrTrunc(iq, DL, OVT); + iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); + return iq; +} + +SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSDIV32 function generates equivalent to the following IL. 
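A note on the IL listing that follows: it is the classic absolute-value recipe. With the all-ones mask s = x >> 31 for negative x, (x + s) ^ s yields |x|; the division is then done unsigned, and the quotient is conditionally negated when exactly one operand was negative. A scalar sketch, assuming arithmetic right shift of signed values (true on mainstream compilers):

#include <cstdint>

// Signed 32-bit division via unsigned division plus sign fixups,
// mirroring the select/xor/add sequence in the IL below.
static int32_t sdiv32ViaUdiv(int32_t A, int32_t B) {
  uint32_t SA = static_cast<uint32_t>(A >> 31);          // 0 or 0xFFFFFFFF
  uint32_t SB = static_cast<uint32_t>(B >> 31);
  uint32_t AbsA = (static_cast<uint32_t>(A) + SA) ^ SA;  // |A|
  uint32_t AbsB = (static_cast<uint32_t>(B) + SB) ^ SB;  // |B|
  uint32_t Q = AbsA / AbsB;
  uint32_t SQ = SA ^ SB;                        // sign of the quotient
  return static_cast<int32_t>((Q + SQ) ^ SQ);   // conditional negate
}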
+ // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r0, r0, r1 + // ixor r10, r10, r11 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getSelectCC(DL, + r0, DAG.getConstant(0, OVT), + DAG.getConstant(-1, OVT), + DAG.getConstant(0, OVT), + ISD::SETLT); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getSelectCC(DL, + r1, DAG.getConstant(0, OVT), + DAG.getConstant(-1, OVT), + DAG.getConstant(0, OVT), + ISD::SETLT); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r0, r0, r1 + r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); + + // ixor r10, r10, r11 + r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(Op.getNode(), 0); +} + +SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { + EVT OVT = Op.getValueType().getScalarType(); + + if (OVT == MVT::i64) + return LowerSDIV64(Op, DAG); + + if (OVT.getScalarType() == MVT::i32) + return LowerSDIV32(Op, DAG); + + if (OVT == MVT::i16 || OVT == MVT::i8) { + // FIXME: We should be checking for the masked bits. This isn't reached + // because i8 and i16 are not legal types. + return LowerSDIV24(Op, DAG); + } + + return SDValue(Op.getNode(), 0); +} + +SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSREM32 function generates equivalent to the following IL. 
+ // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r20, r0, r1 + // umul r20, r20, r1 + // sub r0, r0, r20 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r20, r0, r1 + SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1); + + // umul r20, r20, r1 + r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1); + + // sub r0, r0, r20 + r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(Op.getNode(), 0); +} + +SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const { + EVT OVT = Op.getValueType(); + + if (OVT.getScalarType() == MVT::i64) + return LowerSREM64(Op, DAG); + + if (OVT.getScalarType() == MVT::i32) + return LowerSREM32(Op, DAG); + + return SDValue(Op.getNode(), 0); +} + SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -1201,6 +1625,177 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, return DAG.getMergeValues(Ops, DL); } +SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Zero = DAG.getConstant(0, VT); + SDValue NegOne = DAG.getConstant(-1, VT); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); + SDValue RSign = LHSign; // Remainder sign is the same as LHS + + LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); + + LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); + + SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); + SDValue Rem = Div.getValue(1); + + Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); + + Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); + + SDValue Res[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Res, DL); +} + +SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src) + // if (src > 0.0 && src != result) + // result += 1.0 + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64); + const SDValue One = DAG.getConstantFP(1.0, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + + SDValue Lt0 = 
DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
+  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
+
+  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
+  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
+
+SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  assert(Op.getValueType() == MVT::f64);
+
+  const SDValue Zero = DAG.getConstant(0, MVT::i32);
+  const SDValue One = DAG.getConstant(1, MVT::i32);
+
+  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+  // Extract the upper half, since this is where we will find the sign and
+  // exponent.
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
+
+  const unsigned FractBits = 52;
+  const unsigned ExpBits = 11;
+
+  // Extract the exponent.
+  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32,
+                                Hi,
+                                DAG.getConstant(FractBits - 32, MVT::i32),
+                                DAG.getConstant(ExpBits, MVT::i32));
+  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+                            DAG.getConstant(1023, MVT::i32));
+
+  // Extract the sign bit.
+  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
+  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
+
+  // Extend back to 64 bits.
+  SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                  Zero, SignBit);
+  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
+
+  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
+  const SDValue FractMask
+    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64);
+
+  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
+  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
+  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+
+  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32);
+
+  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+
+  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
+  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
+
+  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
+}
+
+SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  assert(Op.getValueType() == MVT::f64);
+
+  APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
+  SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64);
+  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
+
+  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
+  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
+
+  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
+
+  APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
+  SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
+
+  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
+}
+
+SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
+  // FNEARBYINT and FRINT are the same, except in their handling of FP
+  // exceptions.
Those aren't really meaningful for us, and OpenCL only has + // rint, so just treat them as equivalent. + return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); +} + +SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src); + // if (src < 0.0 && src != result) + // result += -1.0. + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64); + const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + + SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); + SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); + SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); + + SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); + return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); +} + SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue S0 = Op.getOperand(0); @@ -1218,7 +1813,6 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32 return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); - } SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op, @@ -1303,6 +1897,37 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, return DAG.getConstant(Src0 >> Offset, MVT::i32); } +SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + + if (VT.isVector() || VT.getSizeInBits() > 32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mul; + + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + } else { + return SDValue(); + } + + // We need to use sext even for MUL_U24, because MUL_U24 is used + // for signed multiply of 8 and 16-bit types. + return DAG.getSExtOrTrunc(Mul, DL, VT); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -1310,34 +1935,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, switch(N->getOpcode()) { default: break; - case ISD::MUL: { - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDValue Mul; - - // FIXME: Add support for 24-bit multiply with 64-bit output on SI. 
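(Aside: the ISD::MUL case deleted in this hunk is the same combine now factored into performMulCombine above. Its effect, roughly: when both operands of a 32-bit-or-narrower multiply are known to fit in 24 bits, (i16 mul a, b) becomes a sext/trunc of (MUL_U24 (zext a), (zext b)); the sign-extend is used even in the unsigned case because MUL_U24 also serves signed 8- and 16-bit multiplies.)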
- if (VT.isVector() || VT.getSizeInBits() > 32) - break; - - if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { - N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); - } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { - N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); - } else { - break; - } - - // We need to use sext even for MUL_U24, because MUL_U24 is used - // for signed multiply of 8 and 16-bit types. - SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT); - - return Reg; - } + case ISD::MUL: + return performMulCombine(N, DCI); case AMDGPUISD::MUL_I24: case AMDGPUISD::MUL_U24: { SDValue N0 = N->getOperand(0); @@ -1511,29 +2110,38 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { // AMDIL DAG nodes NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); - NODE_NAME_CASE(DIV_INF); NODE_NAME_CASE(RET_FLAG); NODE_NAME_CASE(BRANCH_COND); // AMDGPU DAG nodes NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(CLAMP) NODE_NAME_CASE(FMAX) NODE_NAME_CASE(SMAX) NODE_NAME_CASE(UMAX) NODE_NAME_CASE(FMIN) NODE_NAME_CASE(SMIN) NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(DIV_SCALE) + NODE_NAME_CASE(DIV_FMAS) + NODE_NAME_CASE(DIV_FIXUP) + NODE_NAME_CASE(TRIG_PREOP) + NODE_NAME_CASE(RCP) + NODE_NAME_CASE(RSQ) + NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(RSQ_CLAMPED) + NODE_NAME_CASE(DOT4) NODE_NAME_CASE(BFE_U32) NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) + NODE_NAME_CASE(BREV) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DOT4) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) @@ -1544,6 +2152,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(SAMPLEB) NODE_NAME_CASE(SAMPLED) NODE_NAME_CASE(SAMPLEL) + NODE_NAME_CASE(CVT_F32_UBYTE0) + NODE_NAME_CASE(CVT_F32_UBYTE1) + NODE_NAME_CASE(CVT_F32_UBYTE2) + NODE_NAME_CASE(CVT_F32_UBYTE3) + NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index d5d821d..98a92ad 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -42,10 +42,33 @@ private: SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; /// \brief Split a vector store into multiple scalar stores. /// \returns The resulting chain. 
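(The SDIV/SREM and floating-point rounding helpers declared just below are the former AMDILISelLowering entry points, folded into AMDGPUTargetLowering proper; a later hunk in this file removes their old private section.)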
+ + SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue ExpandSIGN_EXTEND_INREG(SDValue Op, + unsigned BitsDiff, + SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + + SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + protected: + static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); + static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); /// \brief Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. @@ -61,6 +84,7 @@ protected: SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; bool isHWTrueValue(SDValue Op) const; bool isHWFalseValue(SDValue Op) const; @@ -87,10 +111,16 @@ public: bool isZExtFree(Type *Src, Type *Dest) const override; bool isZExtFree(EVT Src, EVT Dest) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; MVT getVectorIdxTy() const override; + bool isSelectSupported(SelectSupportKind) const override; + + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool ShouldShrinkFPConstant(EVT VT) const override; + bool isLoadBitCastBeneficial(EVT, EVT) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -101,6 +131,7 @@ public: SmallVectorImpl<SDValue> &InVals) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; @@ -128,38 +159,6 @@ public: SDValue Op, const SelectionDAG &DAG, unsigned Depth = 0) const override; - -// Functions defined in AMDILISelLowering.cpp -public: - bool getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, unsigned Intrinsic) const override; - - /// We want to mark f32/f64 floating point values as legal. - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - - /// We don't want to shrink f64/f32 constants. 
- bool ShouldShrinkFPConstant(EVT VT) const override; - - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - -private: - void InitAMDILLowering(); - SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; - - SDValue ExpandSIGN_EXTEND_INREG(SDValue Op, - unsigned BitsDiff, - SelectionDAG &DAG) const; - SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const; - SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; }; namespace AMDGPUISD { @@ -169,12 +168,15 @@ enum { FIRST_NUMBER = ISD::BUILTIN_OP_END, CALL, // Function call based on a single integer UMUL, // 32bit unsigned multiplication - DIV_INF, // Divide with infinity returned on zero divisor RET_FLAG, BRANCH_COND, // End AMDIL ISD Opcodes DWORDADDR, FRACT, + CLAMP, + + // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. + // Denormals handled on some parts. COS_HW, SIN_HW, FMAX, @@ -184,11 +186,23 @@ enum { SMIN, UMIN, URECIP, + DIV_SCALE, + DIV_FMAS, + DIV_FIXUP, + TRIG_PREOP, // 1 ULP max error for f64 + + // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. + // For f64, max error 2^29 ULP, handles denormals. + RCP, + RSQ, + RSQ_LEGACY, + RSQ_CLAMPED, DOT4, BFE_U32, // Extract range of bits with zero extension to 32-bits. BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. + BREV, // Reverse bits. MUL_U24, MUL_I24, MAD_U24, @@ -203,6 +217,21 @@ enum { SAMPLEB, SAMPLED, SAMPLEL, + + // These cvt_f32_ubyte* nodes need to remain consecutive and in order. + CVT_F32_UBYTE0, + CVT_F32_UBYTE1, + CVT_F32_UBYTE2, + CVT_F32_UBYTE3, + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel. + /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp index 1c3361a..fef5b8c 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -30,8 +30,8 @@ using namespace llvm; // Pin the vtable to this file. 
void AMDGPUInstrInfo::anchor() {} -AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm) - : AMDGPUGenInstrInfo(-1,-1), RI(tm), TM(tm) { } +AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUGenInstrInfo(-1,-1), RI(st), ST(st) { } const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { return RI; @@ -320,33 +320,11 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1); + Offset = MF.getTarget().getFrameLowering()->getFrameIndexOffset(MF, -1); return getIndirectIndexBegin(MF) + Offset; } - -void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF, - DebugLoc DL) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const AMDGPURegisterInfo & RI = getRegisterInfo(); - - for (unsigned i = 0; i < MI.getNumOperands(); i++) { - MachineOperand &MO = MI.getOperand(i); - // Convert dst regclass to one that is supported by the ISA - if (MO.isReg() && MO.isDef()) { - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg()); - const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass); - - assert(newRegClass); - - MRI.setRegClass(MO.getReg(), newRegClass); - } - } - } -} - int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { switch (Channels) { default: return Opcode; diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index 74baf6b..95dc8c1 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -33,7 +33,7 @@ namespace llvm { -class AMDGPUTargetMachine; +class AMDGPUSubtarget; class MachineFunction; class MachineInstr; class MachineInstrBuilder; @@ -45,9 +45,9 @@ private: MachineBasicBlock &MBB) const; virtual void anchor(); protected: - TargetMachine &TM; + const AMDGPUSubtarget &ST; public: - explicit AMDGPUInstrInfo(TargetMachine &tm); + explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; @@ -137,14 +137,6 @@ public: bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; // Helper functions that check the opcode for status information - bool isLoadInst(llvm::MachineInstr *MI) const; - bool isExtLoadInst(llvm::MachineInstr *MI) const; - bool isSWSExtLoadInst(llvm::MachineInstr *MI) const; - bool isSExtLoadInst(llvm::MachineInstr *MI) const; - bool isZExtLoadInst(llvm::MachineInstr *MI) const; - bool isAExtLoadInst(llvm::MachineInstr *MI) const; - bool isStoreInst(llvm::MachineInstr *MI) const; - bool isTruncStoreInst(llvm::MachineInstr *MI) const; bool isRegisterStore(const MachineInstr &MI) const; bool isRegisterLoad(const MachineInstr &MI) const; @@ -185,11 +177,6 @@ public: unsigned ValueReg, unsigned Address, unsigned OffsetReg) const = 0; - - /// \brief Convert the AMDIL MachineInstr to a supported ISA - /// MachineInstr - void convertToISA(MachineInstr & MI, MachineFunction &MF, DebugLoc DL) const; - /// \brief Build a MOV instruction. 
virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index f96dbb4..934d59d 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -19,6 +19,14 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
]>;
+def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
+>;
+
+def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
+  [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
+>;
+
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -29,11 +37,25 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
// out = a - floor(a)
def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
+// out = 1.0 / a
+def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a) result clamped to +/- max_float.
+def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
+
// out = max(a, b) a and b are floats
def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
  [SDNPCommutative, SDNPAssociative]
>;
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
+
// out = max(a, b) a and b are signed ints
def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
  [SDNPCommutative, SDNPAssociative]
@@ -59,12 +81,38 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
  [SDNPCommutative, SDNPAssociative]
>;
+
+def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
+  SDTIntToFPOp, []>;
+
+
// urecip - This operation is a helper for integer division; it returns the
// result of 1 / a as a fractional unsigned integer.
// out = (2^32 / a) + e
// e is rounding error
def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
+// Special case divide preop and flags.
+def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
+
+// Special case divide FMA with scale and flags (src0 = Quotient,
+// src1 = Denominator, src2 = Numerator).
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
+
+// Single or double precision division fixup.
+// Special case divide fixup and flags (src0 = Quotient, src1 =
+// Denominator, src2 = Numerator).
+def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
+
+// Look up 2.0 / pi src0 with segment select src1[4:0]
+def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
+
def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
                                 SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
                                 [SDNPHasChain, SDNPMayLoad]>;
@@ -92,6 +140,8 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
+def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>;
+
// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored when
// performing the multiply.
The result is a 32-bit value. def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, @@ -107,3 +157,22 @@ def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, [] >; + +//===----------------------------------------------------------------------===// +// Flow Control Profile Types +//===----------------------------------------------------------------------===// +// Branch instruction where second and third are basic blocks +def SDTIL_BRCond : SDTypeProfile<0, 2, [ + SDTCisVT<0, OtherVT> + ]>; + +//===----------------------------------------------------------------------===// +// Flow Control DAG Nodes +//===----------------------------------------------------------------------===// +def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// Call/Return DAG Nodes +//===----------------------------------------------------------------------===// +def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 80bdf5b..b86b781 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -49,6 +49,11 @@ def u8imm : Operand<i8> { let PrintMethod = "printU8ImmOperand"; } +//===--------------------------------------------------------------------===// +// Custom Operands +//===--------------------------------------------------------------------===// +def brtarget : Operand<OtherVT>; + //===----------------------------------------------------------------------===// // PatLeafs for floating-point comparisons //===----------------------------------------------------------------------===// @@ -127,6 +132,21 @@ def COND_NULL : PatLeaf < // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// +def global_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +// Global address space loads +def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +// Constant address space loads +def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{ LoadSDNode *L = cast<LoadSDNode>(N); return L->getExtensionType() == ISD::ZEXTLOAD || @@ -232,26 +252,55 @@ def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isLocalLoad(dyn_cast<LoadSDNode>(N)); }]>; -def atomic_load_add_local : PatFrag<(ops node:$ptr, node:$value), - (atomic_load_add node:$ptr, node:$value), [{ - return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; -def atomic_load_sub_local : PatFrag<(ops node:$ptr, node:$value), - (atomic_load_sub node:$ptr, node:$value), [{ - return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +class local_binary_atomic_op<SDNode atomic_op> : + PatFrag<(ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; + +def atomic_swap_local : local_binary_atomic_op<atomic_swap>; +def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>; +def atomic_load_sub_local : 
local_binary_atomic_op<atomic_load_sub>; +def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>; +def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>; +def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>; +def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>; +def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>; +def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>; +def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>; +def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>; + def mskor_global : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; +def atomic_cmp_swap_32_local : + PatFrag<(ops node:$ptr, node:$cmp, node:$swap), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getMemoryVT() == MVT::i32 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + +def atomic_cmp_swap_64_local : + PatFrag<(ops node:$ptr, node:$cmp, node:$swap), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getMemoryVT() == MVT::i64 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + + class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding +int FP32_NEG_ONE = 0xbf800000; +int FP32_ONE = 0x3f800000; } def CONST : Constants; @@ -273,7 +322,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "CLAMP $dst, $src0", - [(set f32:$dst, (int_AMDIL_clamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] + [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] >; class FABS <RegisterClass rc> : AMDGPUShaderInst < @@ -363,7 +412,7 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat < // BFI_INT patterns -multiclass BFIPatterns <Instruction BFI_INT> { +multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> { // Definition from ISA doc: // (y & x) | (z & ~x) @@ -379,6 +428,19 @@ multiclass BFIPatterns <Instruction BFI_INT> { (BFI_INT $x, $y, $z) >; + def : Pat < + (fcopysign f32:$src0, f32:$src1), + (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) + >; + + def : Pat < + (f64 (fcopysign f64:$src0, f64:$src1)), + (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0), + (BFI_INT (LoadImm32 0x7fffffff), + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) + >; } // SHA-256 Ma patterns @@ -457,6 +519,23 @@ multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> { >; } +class RcpPat<Instruction RcpInst, ValueType vt> : Pat < + (fdiv FP_ONE, vt:$src), + (RcpInst $src) +>; + +multiclass RsqPat<Instruction RsqInst, ValueType vt> { + def : Pat < + (fdiv FP_ONE, (fsqrt vt:$src)), + (RsqInst $src) + >; + + def : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) + >; +} + include "R600Instructions.td" include "R700Instructions.td" include "EvergreenInstructions.td" diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp index fab4a3b..58916a9 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.cpp +++ b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp @@ -1,4 +1,4 @@ -//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===// +//===- 
AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -12,7 +12,7 @@ // //===-----------------------------------------------------------------------===// -#include "AMDILIntrinsicInfo.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Intrinsics.h" @@ -24,14 +24,12 @@ using namespace llvm; #include "AMDGPUGenIntrinsics.inc" #undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN -AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) - : TargetIntrinsicInfo() { -} +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) + : TargetIntrinsicInfo() {} -std::string -AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, - unsigned int numTys) const { - static const char* const names[] = { +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned numTys) const { + static const char *const names[] = { #define GET_INTRINSIC_NAME_TABLE #include "AMDGPUGenIntrinsics.inc" #undef GET_INTRINSIC_NAME_TABLE @@ -40,23 +38,23 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, if (IntrID < Intrinsic::num_intrinsics) { return nullptr; } - assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics - && "Invalid intrinsic ID"); + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && + "Invalid intrinsic ID"); std::string Result(names[IntrID - Intrinsic::num_intrinsics]); return Result; } -unsigned int -AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const { +unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, + unsigned Len) const { if (!StringRef(Name, Len).startswith("llvm.")) return 0; // All intrinsics start with 'llvm.' #define GET_FUNCTION_RECOGNIZER #include "AMDGPUGenIntrinsics.inc" #undef GET_FUNCTION_RECOGNIZER - AMDGPUIntrinsic::ID IntrinsicID - = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; + AMDGPUIntrinsic::ID IntrinsicID = + (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { @@ -65,17 +63,15 @@ AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const { return 0; } -bool -AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { - // Overload Table +bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { +// Overload Table #define GET_INTRINSIC_OVERLOAD_TABLE #include "AMDGPUGenIntrinsics.inc" #undef GET_INTRINSIC_OVERLOAD_TABLE } -Function* -AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, - Type **Tys, - unsigned numTys) const { +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + Type **Tys, + unsigned numTys) const { llvm_unreachable("Not implemented"); } diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDGPUIntrinsicInfo.h index 924275a..5be68a2 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.h +++ b/lib/Target/R600/AMDGPUIntrinsicInfo.h @@ -1,4 +1,4 @@ -//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// +//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,8 +11,8 @@ /// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. 
// //===-----------------------------------------------------------------------===// -#ifndef AMDIL_INTRINSICS_H -#define AMDIL_INTRINSICS_H +#ifndef AMDGPU_INTRINSICINFO_H +#define AMDGPU_INTRINSICINFO_H #include "llvm/IR/Intrinsics.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -34,16 +34,15 @@ enum ID { class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(TargetMachine *tm); - std::string getName(unsigned int IntrId, Type **Tys = nullptr, - unsigned int numTys = 0) const override; - unsigned int lookupName(const char *Name, unsigned int Len) const override; - bool isOverloaded(unsigned int IID) const override; - Function *getDeclaration(Module *M, unsigned int ID, + std::string getName(unsigned IntrId, Type **Tys = nullptr, + unsigned numTys = 0) const override; + unsigned lookupName(const char *Name, unsigned Len) const override; + bool isOverloaded(unsigned IID) const override; + Function *getDeclaration(Module *M, unsigned ID, Type **Tys = nullptr, - unsigned int numTys = 0) const override; + unsigned numTys = 0) const override; }; } // end namespace llvm -#endif // AMDIL_INTRINSICS_H - +#endif // AMDGPU_INTRINSICINFO_H diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td index 9ad5e72..d934676 100644 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -18,18 +18,26 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; - + def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + + // This is named backwards (instead of rsq_legacy) so we don't have + // to define it with the public builtins intrinsics. This is a + // workaround for how intrinsic names are parsed. If the name is + // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant + // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. 
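(Concretely: an intrinsic spelled llvm.AMDGPU.rsq.legacy would be parsed as an overload of llvm.AMDGPU.rsq with ".legacy" taken for a type suffix, so the reversed llvm.AMDGPU.legacy.rsq spelling sidesteps the mangler.)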
+ def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; def int_AMDGPU_kilp : Intrinsic<[], [], []>; def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; @@ -53,12 +61,27 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; +} + +// Legacy names for compatibility. 
+let TargetPrefix = "AMDIL", isTarget = 1 in { + def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; } let TargetPrefix = "TGSI", isTarget = 1 in { diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp index b759495..ac82e88 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ b/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -15,6 +15,7 @@ #include "AMDGPUMCInstLower.h" #include "AMDGPUAsmPrinter.h" +#include "AMDGPUTargetMachine.h" #include "InstPrinter/AMDGPUInstPrinter.h" #include "R600InstrInfo.h" #include "SIInstrInfo.h" diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h index 2b7f1e3..58fe34d 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.h +++ b/lib/Target/R600/AMDGPUMCInstLower.h @@ -14,9 +14,9 @@ namespace llvm { class AMDGPUSubtarget; -class MCInst; -class MCContext; class MachineInstr; +class MCContext; +class MCInst; class AMDGPUMCInstLower { diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp new file mode 100644 index 0000000..218750d --- /dev/null +++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp @@ -0,0 +1,387 @@ +//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass eliminates allocas by either converting them into vectors or +// by migrating them to local address space. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "amdgpu-promote-alloca" + +using namespace llvm; + +namespace { + +class AMDGPUPromoteAlloca : public FunctionPass, + public InstVisitor<AMDGPUPromoteAlloca> { + + static char ID; + Module *Mod; + const AMDGPUSubtarget &ST; + int LocalMemAvailable; + +public: + AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), + LocalMemAvailable(0) { } + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + virtual const char *getPassName() const { + return "AMDGPU Promote Alloca"; + } + void visitAlloca(AllocaInst &I); +}; + +} // End anonymous namespace + +char AMDGPUPromoteAlloca::ID = 0; + +bool AMDGPUPromoteAlloca::doInitialization(Module &M) { + Mod = &M; + return false; +} + +bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + + const FunctionType *FTy = F.getFunctionType(); + + LocalMemAvailable = ST.getLocalMemorySize(); + + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. 
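(For instance, an OpenCL kernel such as kernel void f(local float *lds), a hypothetical signature used here only for illustration, may address an arbitrary caller-chosen slice of LDS, so the pass conservatively treats all of local memory as taken.)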
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { + const Type *ParamTy = FTy->getParamType(i); + if (ParamTy->isPointerTy() && + ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LocalMemAvailable = 0; + DEBUG(dbgs() << "Function has local memory argument. Promoting to " + "local memory disabled.\n"); + break; + } + } + + if (LocalMemAvailable > 0) { + // Check how much local memory is being used by global objects + for (Module::global_iterator I = Mod->global_begin(), + E = Mod->global_end(); I != E; ++I) { + GlobalVariable *GV = I; + PointerType *GVTy = GV->getType(); + if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + continue; + for (Value::use_iterator U = GV->use_begin(), + UE = GV->use_end(); U != UE; ++U) { + Instruction *Use = dyn_cast<Instruction>(*U); + if (!Use) + continue; + if (Use->getParent()->getParent() == &F) + LocalMemAvailable -= + Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType()); + } + } + } + + LocalMemAvailable = std::max(0, LocalMemAvailable); + DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); + + visit(F); + + return false; +} + +static VectorType *arrayTypeToVecType(const Type *ArrayTy) { + return VectorType::get(ArrayTy->getArrayElementType(), + ArrayTy->getArrayNumElements()); +} + +static Value* calculateVectorIndex(Value *Ptr, + std::map<GetElementPtrInst*, Value*> GEPIdx) { + if (isa<AllocaInst>(Ptr)) + return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); + + GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); + + return GEPIdx[GEP]; +} + +static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { + // FIXME we only support simple cases + if (GEP->getNumOperands() != 3) + return NULL; + + ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1)); + if (!I0 || !I0->isZero()) + return NULL; + + return GEP->getOperand(2); +} + +// Not an instruction handled below to turn into a vector. +// +// TODO: Check isTriviallyVectorizable for calls and handle other +// instructions. +static bool canVectorizeInst(Instruction *Inst) { + switch (Inst->getOpcode()) { + case Instruction::Load: + case Instruction::Store: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + return true; + default: + return false; + } +} + +static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { + Type *AllocaTy = Alloca->getAllocatedType(); + + DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); + + // FIXME: There is no reason why we can't support larger arrays, we + // are just being conservative for now. + if (!AllocaTy->isArrayTy() || + AllocaTy->getArrayElementType()->isVectorTy() || + AllocaTy->getArrayNumElements() > 4) { + + DEBUG(dbgs() << " Cannot convert type to vector"); + return false; + } + + std::map<GetElementPtrInst*, Value*> GEPVectorIdx; + std::vector<Value*> WorkList; + for (User *AllocaUser : Alloca->users()) { + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser); + if (!GEP) { + if (!canVectorizeInst(cast<Instruction>(AllocaUser))) + return false; + + WorkList.push_back(AllocaUser); + continue; + } + + Value *Index = GEPToVectorIndex(GEP); + + // If we can't compute a vector index from this GEP, then we can't + // promote this alloca to vector. 
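(Per GEPToVectorIndex above, only the simple two-index form is accepted, e.g. getelementptr [4 x i32]* %alloca, i32 0, i32 %idx, where the constant-zero first index selects the array itself and the second index becomes the extractelement/insertelement lane.)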
+ if (!Index) { + DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); + return false; + } + + GEPVectorIdx[GEP] = Index; + for (User *GEPUser : AllocaUser->users()) { + if (!canVectorizeInst(cast<Instruction>(GEPUser))) + return false; + + WorkList.push_back(GEPUser); + } + } + + VectorType *VectorTy = arrayTypeToVecType(AllocaTy); + + DEBUG(dbgs() << " Converting alloca to vector " + << *AllocaTy << " -> " << *VectorTy << '\n'); + + for (std::vector<Value*>::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + Instruction *Inst = cast<Instruction>(*I); + IRBuilder<> Builder(Inst); + switch (Inst->getOpcode()) { + case Instruction::Load: { + Value *Ptr = Inst->getOperand(0); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); + Inst->replaceAllUsesWith(ExtractElement); + Inst->eraseFromParent(); + break; + } + case Instruction::Store: { + Value *Ptr = Inst->getOperand(1); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *NewVecValue = Builder.CreateInsertElement(VecValue, + Inst->getOperand(0), + Index); + Builder.CreateStore(NewVecValue, BitCast); + Inst->eraseFromParent(); + break; + } + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + break; + + default: + Inst->dump(); + llvm_unreachable("Inconsistency in instructions promotable to vector"); + } + } + return true; +} + +static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { + for (User *User : Val->users()) { + if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) + continue; + if (isa<CallInst>(User)) { + WorkList.push_back(User); + continue; + } + if (!User->getType()->isPointerTy()) + continue; + WorkList.push_back(User); + collectUsesWithPtrTypes(User, WorkList); + } +} + +void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + IRBuilder<> Builder(&I); + + // First try to replace the alloca with a vector + Type *AllocaTy = I.getAllocatedType(); + + DEBUG(dbgs() << "Trying to promote " << I << '\n'); + + if (tryPromoteAllocaToVector(&I)) + return; + + DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); + + // FIXME: This is the maximum work group size. We should try to get + // value from the reqd_work_group_size function attribute if it is + // available. 
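(Cost sketch: with the hard-coded WorkGroupSize of 256 below, promoting an alloca of type [4 x i32] charges 256 * 16 = 4096 bytes against LocalMemAvailable, one copy of the array per work item.)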
+ unsigned WorkGroupSize = 256; + int AllocaSize = WorkGroupSize * + Mod->getDataLayout()->getTypeAllocSize(AllocaTy); + + if (AllocaSize > LocalMemAvailable) { + DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); + return; + } + + DEBUG(dbgs() << "Promoting alloca to local memory\n"); + LocalMemAvailable -= AllocaSize; + + GlobalVariable *GV = new GlobalVariable( + *Mod, ArrayType::get(I.getAllocatedType(), 256), false, + GlobalValue::ExternalLinkage, 0, I.getName(), 0, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + + FunctionType *FTy = FunctionType::get( + Type::getInt32Ty(Mod->getContext()), false); + AttributeSet AttrSet; + AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); + + Value *ReadLocalSizeY = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.y", FTy, AttrSet); + Value *ReadLocalSizeZ = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.z", FTy, AttrSet); + Value *ReadTIDIGX = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.x", FTy, AttrSet); + Value *ReadTIDIGY = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.y", FTy, AttrSet); + Value *ReadTIDIGZ = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.z", FTy, AttrSet); + + + Value *TCntY = Builder.CreateCall(ReadLocalSizeY); + Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ); + Value *TIdX = Builder.CreateCall(ReadTIDIGX); + Value *TIdY = Builder.CreateCall(ReadTIDIGY); + Value *TIdZ = Builder.CreateCall(ReadTIDIGZ); + + Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); + Tmp0 = Builder.CreateMul(Tmp0, TIdX); + Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); + Value *TID = Builder.CreateAdd(Tmp0, Tmp1); + TID = Builder.CreateAdd(TID, TIdZ); + + std::vector<Value*> Indices; + Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); + Indices.push_back(TID); + + Value *Offset = Builder.CreateGEP(GV, Indices); + I.mutateType(Offset->getType()); + I.replaceAllUsesWith(Offset); + I.eraseFromParent(); + + std::vector<Value*> WorkList; + + collectUsesWithPtrTypes(Offset, WorkList); + + for (std::vector<Value*>::iterator i = WorkList.begin(), + e = WorkList.end(); i != e; ++i) { + Value *V = *i; + CallInst *Call = dyn_cast<CallInst>(V); + if (!Call) { + Type *EltTy = V->getType()->getPointerElementType(); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + V->mutateType(NewTy); + continue; + } + + IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call); + if (!Intr) { + std::vector<Type*> ArgTypes; + for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); + ArgIdx != ArgEnd; ++ArgIdx) { + ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); + } + Function *F = Call->getCalledFunction(); + FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, + F->isVarArg()); + Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType, + F->getAttributes()); + Function *NewF = cast<Function>(C); + Call->setCalledFunction(NewF); + continue; + } + + Builder.SetInsertPoint(Intr); + switch (Intr->getIntrinsicID()) { + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // These intrinsics are for address space 0 only + Intr->eraseFromParent(); + continue; + case Intrinsic::memcpy: { + MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); + Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), + MemCpy->getLength(), MemCpy->getAlignment(), + MemCpy->isVolatile()); + Intr->eraseFromParent(); + continue; + } + case Intrinsic::memset: { + MemSetInst *MemSet = cast<MemSetInst>(Intr); + 
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), + MemSet->getLength(), MemSet->getAlignment(), + MemSet->isVolatile()); + Intr->eraseFromParent(); + continue; + } + default: + Intr->dump(); + llvm_unreachable("Don't know how to promote alloca intrinsic use."); + } + } +} + +FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { + return new AMDGPUPromoteAlloca(ST); +} diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp index 19927fa..3433280 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -17,9 +17,9 @@ using namespace llvm; -AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm) +AMDGPURegisterInfo::AMDGPURegisterInfo(const AMDGPUSubtarget &st) : AMDGPUGenRegisterInfo(0), - TM(tm) + ST(st) { } //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index a7cba0d..4731595 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -25,27 +25,19 @@ namespace llvm { -class AMDGPUTargetMachine; +class AMDGPUSubtarget; class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { - TargetMachine &TM; static const MCPhysReg CalleeSavedReg; + const AMDGPUSubtarget &ST; - AMDGPURegisterInfo(TargetMachine &tm); + AMDGPURegisterInfo(const AMDGPUSubtarget &st); BitVector getReservedRegs(const MachineFunction &MF) const override { assert(!"Unimplemented"); return BitVector(); } - /// \param RC is an AMDIL reg class. - /// - /// \returns The ISA reg class that is equivalent to \p RC. - virtual const TargetRegisterClass * getISARegClass( - const TargetRegisterClass * RC) const { - assert(!"Unimplemented"); return nullptr; - } - virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { assert(!"Unimplemented"); return nullptr; } diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index f3b9932..b83c290 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -13,6 +13,8 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "SIInstrInfo.h" using namespace llvm; @@ -23,90 +25,42 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "AMDGPUGenSubtargetInfo.inc" -AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : - AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) { - InstrItins = getInstrItineraryForCPU(CPU); - - // Default card - StringRef GPU = CPU; - Is64bit = false; - HasVertexCache = false; - TexVTXClauseSize = 0; - Gen = AMDGPUSubtarget::R600; - FP64 = false; - CaymanISA = false; - EnableIRStructurizer = true; - EnableIfCvt = true; - WavefrontSize = 0; - CFALUBug = false; +AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) : + AMDGPUGenSubtargetInfo(TT, GPU, FS), + DevName(GPU), + Is64bit(false), + DumpCode(false), + R600ALUInst(false), + HasVertexCache(false), + TexVTXClauseSize(0), + Gen(AMDGPUSubtarget::R600), + FP64(false), + CaymanISA(false), + EnableIRStructurizer(true), + EnableIfCvt(true), + WavefrontSize(0), + CFALUBug(false), + LocalMemorySize(0), + InstrItins(getInstrItineraryForCPU(GPU)) { ParseSubtargetFeatures(GPU, FS); - DevName = GPU; -} -bool -AMDGPUSubtarget::is64bit() const { - return Is64bit; -} -bool -AMDGPUSubtarget::hasVertexCache() 
const { - return HasVertexCache; -} -short -AMDGPUSubtarget::getTexVTXClauseSize() const { - return TexVTXClauseSize; -} -enum AMDGPUSubtarget::Generation -AMDGPUSubtarget::getGeneration() const { - return Gen; -} -bool -AMDGPUSubtarget::hasHWFP64() const { - return FP64; -} -bool -AMDGPUSubtarget::hasCaymanISA() const { - return CaymanISA; -} -bool -AMDGPUSubtarget::IsIRStructurizerEnabled() const { - return EnableIRStructurizer; -} -bool -AMDGPUSubtarget::isIfCvtEnabled() const { - return EnableIfCvt; -} -unsigned -AMDGPUSubtarget::getWavefrontSize() const { - return WavefrontSize; + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + InstrInfo.reset(new R600InstrInfo(*this)); + } else { + InstrInfo.reset(new SIInstrInfo(*this)); + } } -unsigned -AMDGPUSubtarget::getStackEntrySize() const { + +unsigned AMDGPUSubtarget::getStackEntrySize() const { assert(getGeneration() <= NORTHERN_ISLANDS); switch(getWavefrontSize()) { case 16: return 8; case 32: - if (hasCaymanISA()) - return 4; - else - return 8; + return hasCaymanISA() ? 4 : 8; case 64: return 4; default: llvm_unreachable("Illegal wavefront size."); } } -bool -AMDGPUSubtarget::hasCFAluBug() const { - assert(getGeneration() <= NORTHERN_ISLANDS); - return CFALUBug; -} -bool -AMDGPUSubtarget::isTargetELF() const { - return false; -} - -std::string -AMDGPUSubtarget::getDeviceName() const { - return DevName; -} diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 1b041d6..0c388b3 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -15,6 +15,7 @@ #ifndef AMDGPUSUBTARGET_H #define AMDGPUSUBTARGET_H #include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -27,6 +28,9 @@ namespace llvm { class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { + + std::unique_ptr<AMDGPUInstrInfo> InstrInfo; + public: enum Generation { R600 = 0, @@ -40,42 +44,78 @@ public: private: std::string DevName; bool Is64bit; - bool Is32on64bit; bool DumpCode; bool R600ALUInst; bool HasVertexCache; short TexVTXClauseSize; - enum Generation Gen; + Generation Gen; bool FP64; bool CaymanISA; bool EnableIRStructurizer; bool EnableIfCvt; unsigned WavefrontSize; bool CFALUBug; + int LocalMemorySize; InstrItineraryData InstrItins; public: AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); - const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + const AMDGPUInstrInfo *getInstrInfo() const { + return InstrInfo.get(); + } + + const InstrItineraryData &getInstrItineraryData() const { + return InstrItins; + } + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - bool is64bit() const; - bool hasVertexCache() const; - short getTexVTXClauseSize() const; - enum Generation getGeneration() const; - bool hasHWFP64() const; - bool hasCaymanISA() const; + bool is64bit() const { + return Is64bit; + } + + bool hasVertexCache() const { + return HasVertexCache; + } + + short getTexVTXClauseSize() const { + return TexVTXClauseSize; + } + + Generation getGeneration() const { + return Gen; + } + + bool hasHWFP64() const { + return FP64; + } + + bool hasCaymanISA() const { + return CaymanISA; + } bool hasBFE() const { return (getGeneration() >= EVERGREEN); } + bool hasBFI() const { + return (getGeneration() >= EVERGREEN); + } + bool hasBFM() const { return hasBFE(); } + bool hasBCNT(unsigned Size) const { + if (Size == 32) + return (getGeneration() >= EVERGREEN); + + 
assert(Size == 64); + return (getGeneration() >= SOUTHERN_ISLANDS); + } + bool hasMulU24() const { return (getGeneration() >= EVERGREEN); } @@ -85,22 +125,48 @@ public: hasCaymanISA()); } - bool IsIRStructurizerEnabled() const; - bool isIfCvtEnabled() const; - unsigned getWavefrontSize() const; + bool IsIRStructurizerEnabled() const { + return EnableIRStructurizer; + } + + bool isIfCvtEnabled() const { + return EnableIfCvt; + } + + unsigned getWavefrontSize() const { + return WavefrontSize; + } + unsigned getStackEntrySize() const; - bool hasCFAluBug() const; + + bool hasCFAluBug() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + return CFALUBug; + } + + int getLocalMemorySize() const { + return LocalMemorySize; + } bool enableMachineScheduler() const override { return getGeneration() <= NORTHERN_ISLANDS; } // Helper functions to simplify if statements - bool isTargetELF() const; - std::string getDeviceName() const; - bool dumpCode() const { return DumpCode; } - bool r600ALUEncoding() const { return R600ALUInst; } + bool isTargetELF() const { + return false; + } + StringRef getDeviceName() const { + return DevName; + } + + bool dumpCode() const { + return DumpCode; + } + bool r600ALUEncoding() const { + return R600ALUInst; + } }; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 174fdca..8aab944 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -80,10 +80,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, InstrItins(&Subtarget.getInstrItineraryData()) { // TLInfo uses InstrInfo so it must be initialized after. if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - InstrInfo.reset(new R600InstrInfo(*this)); TLInfo.reset(new R600TargetLowering(*this)); } else { - InstrInfo.reset(new SIInstrInfo(*this)); TLInfo.reset(new SITargetLowering(*this)); } setRequiresStructuredCFG(true); @@ -111,6 +109,7 @@ public: return nullptr; } + virtual void addCodeGenPrepare(); bool addPreISel() override; bool addInstSelector() override; bool addPreRegAlloc() override; @@ -136,6 +135,13 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) { PM.add(createAMDGPUTargetTransformInfoPass(this)); } +void AMDGPUPassConfig::addCodeGenPrepare() { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + addPass(createAMDGPUPromoteAlloca(ST)); + addPass(createSROAPass()); + TargetPassConfig::addCodeGenPrepare(); +} + bool AMDGPUPassConfig::addPreISel() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); @@ -159,7 +165,6 @@ bool AMDGPUPassConfig::addInstSelector() { } bool AMDGPUPassConfig::addPreRegAlloc() { - addPass(createAMDGPUConvertToISAPass(*TM)); const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { @@ -169,6 +174,8 @@ bool AMDGPUPassConfig::addPreRegAlloc() { // SIFixSGPRCopies can generate a lot of duplicate instructions, // so we need to run MachineCSE afterwards. 
addPass(&MachineCSEID); + initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); + insertPass(&RegisterCoalescerID, &SIFixSGPRLiveRangesID); } return false; } diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index 1287e13..3bb15be 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -17,8 +17,8 @@ #include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILIntrinsicInfo.h" #include "R600ISelLowering.h" #include "llvm/IR/DataLayout.h" @@ -30,7 +30,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { const DataLayout Layout; AMDGPUFrameLowering FrameLowering; AMDGPUIntrinsicInfo IntrinsicInfo; - std::unique_ptr<AMDGPUInstrInfo> InstrInfo; std::unique_ptr<AMDGPUTargetLowering> TLInfo; const InstrItineraryData *InstrItins; @@ -46,13 +45,13 @@ public: return &IntrinsicInfo; } const AMDGPUInstrInfo *getInstrInfo() const override { - return InstrInfo.get(); + return getSubtargetImpl()->getInstrInfo(); } const AMDGPUSubtarget *getSubtargetImpl() const override { return &Subtarget; } const AMDGPURegisterInfo *getRegisterInfo() const override { - return &InstrInfo->getRegisterInfo(); + return &getInstrInfo()->getRegisterInfo(); } AMDGPUTargetLowering *getTargetLowering() const override { return TLInfo.get(); diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td deleted file mode 100644 index 5dcd478..0000000 --- a/lib/Target/R600/AMDILBase.td +++ /dev/null @@ -1,25 +0,0 @@ -//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Target-independent interfaces which we are implementing -//===----------------------------------------------------------------------===// - -include "llvm/Target/Target.td" - -// Dummy Instruction itineraries for pseudo instructions -def ALU_NULL : FuncUnit; -def NullALU : InstrItinClass; - -//===----------------------------------------------------------------------===// -// Register File, Calling Conv, Instruction Descriptions -//===----------------------------------------------------------------------===// - - -include "AMDILRegisterInfo.td" -include "AMDILInstrInfo.td" - diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp deleted file mode 100644 index 7cea803..0000000 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ /dev/null @@ -1,560 +0,0 @@ -//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief TargetLowering functions borrowed from AMDIL. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPUISelLowering.h" -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "AMDILIntrinsicInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetOptions.h" - -using namespace llvm; -//===----------------------------------------------------------------------===// -// TargetLowering Implementation Help Functions End -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// TargetLowering Class Implementation Begins -//===----------------------------------------------------------------------===// -void AMDGPUTargetLowering::InitAMDILLowering() { - static const MVT::SimpleValueType types[] = { - MVT::i8, - MVT::i16, - MVT::i32, - MVT::f32, - MVT::f64, - MVT::i64, - MVT::v2i8, - MVT::v4i8, - MVT::v2i16, - MVT::v4i16, - MVT::v4f32, - MVT::v4i32, - MVT::v2f32, - MVT::v2i32, - MVT::v2f64, - MVT::v2i64 - }; - - static const MVT::SimpleValueType IntTypes[] = { - MVT::i8, - MVT::i16, - MVT::i32, - MVT::i64 - }; - - static const MVT::SimpleValueType FloatTypes[] = { - MVT::f32, - MVT::f64 - }; - - static const MVT::SimpleValueType VectorTypes[] = { - MVT::v2i8, - MVT::v4i8, - MVT::v2i16, - MVT::v4i16, - MVT::v4f32, - MVT::v4i32, - MVT::v2f32, - MVT::v2i32, - MVT::v2f64, - MVT::v2i64 - }; - - const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>(); - // These are the current register classes that are - // supported - - for (MVT VT : types) { - setOperationAction(ISD::SUBE, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::BRCOND, VT, Custom); - setOperationAction(ISD::BR_JT, VT, Expand); - setOperationAction(ISD::BRIND, VT, Expand); - // TODO: Implement custom UREM/SREM routines - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - if (VT != MVT::i64 && VT != MVT::v2i64) { - setOperationAction(ISD::SDIV, VT, Custom); - } - } - for (MVT VT : FloatTypes) { - // IL does not have these operations for floating point types - setOperationAction(ISD::FP_ROUND_INREG, VT, Expand); - setOperationAction(ISD::SETOLT, VT, Expand); - setOperationAction(ISD::SETOGE, VT, Expand); - setOperationAction(ISD::SETOGT, VT, Expand); - setOperationAction(ISD::SETOLE, VT, Expand); - setOperationAction(ISD::SETULT, VT, Expand); - setOperationAction(ISD::SETUGE, VT, Expand); - setOperationAction(ISD::SETUGT, VT, Expand); - setOperationAction(ISD::SETULE, VT, Expand); - } - - for (MVT VT : IntTypes) { - // GPU also does not have divrem function for signed or unsigned - setOperationAction(ISD::SDIVREM, VT, Expand); - - // GPU does not have [S|U]MUL_LOHI functions as a single instruction - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - - 
setOperationAction(ISD::BSWAP, VT, Expand); - - // GPU doesn't have any counting operators - setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); - } - - for (MVT VT : VectorTypes) { - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - // setOperationAction(ISD::VSETCC, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - - } - setOperationAction(ISD::MULHU, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::v2i64, Expand); - setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::MULHS, MVT::v2i64, Expand); - setOperationAction(ISD::ADD, MVT::v2i64, Expand); - setOperationAction(ISD::SREM, MVT::v2i64, Expand); - setOperationAction(ISD::Constant , MVT::i64 , Legal); - setOperationAction(ISD::SDIV, MVT::v2i64, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand); - if (STM.hasHWFP64()) { - // we support loading/storing v2f64 but not operations on the type - setOperationAction(ISD::FADD, MVT::v2f64, Expand); - setOperationAction(ISD::FSUB, MVT::v2f64, Expand); - setOperationAction(ISD::FMUL, MVT::v2f64, Expand); - setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); - setOperationAction(ISD::ConstantFP , MVT::f64 , Legal); - // We want to expand vector conversions into their scalar - // counterparts. - setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand); - setOperationAction(ISD::FABS, MVT::f64, Expand); - setOperationAction(ISD::FABS, MVT::v2f64, Expand); - } - // TODO: Fix the UDIV24 algorithm so it works for these - // types correctly. This needs vector comparisons - // for this to work correctly. - setOperationAction(ISD::UDIV, MVT::v2i8, Expand); - setOperationAction(ISD::UDIV, MVT::v4i8, Expand); - setOperationAction(ISD::UDIV, MVT::v2i16, Expand); - setOperationAction(ISD::UDIV, MVT::v4i16, Expand); - setOperationAction(ISD::SUBC, MVT::Other, Expand); - setOperationAction(ISD::ADDE, MVT::Other, Expand); - setOperationAction(ISD::ADDC, MVT::Other, Expand); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); - - - // Use the default implementation. 
- setOperationAction(ISD::ConstantFP , MVT::f32 , Legal); - setOperationAction(ISD::Constant , MVT::i32 , Legal); - - setSchedulingPreference(Sched::RegPressure); - setPow2DivIsCheap(false); - setSelectIsExpensive(true); - setJumpIsExpensive(true); - - MaxStoresPerMemcpy = 4096; - MaxStoresPerMemmove = 4096; - MaxStoresPerMemset = 4096; - -} - -bool -AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, unsigned Intrinsic) const { - return false; -} - -// The backend supports 32 and 64 bit floating point immediates -bool -AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 - || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { - return true; - } else { - return false; - } -} - -bool -AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { - if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 - || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { - return false; - } else { - return true; - } -} - - -// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to -// be zero. Op is expected to be a target specific node. Used by DAG -// combiner. - -//===----------------------------------------------------------------------===// -// Other Lowering Hooks -//===----------------------------------------------------------------------===// - -SDValue -AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { - EVT OVT = Op.getValueType(); - SDValue DST; - if (OVT.getScalarType() == MVT::i64) { - DST = LowerSDIV64(Op, DAG); - } else if (OVT.getScalarType() == MVT::i32) { - DST = LowerSDIV32(Op, DAG); - } else if (OVT.getScalarType() == MVT::i16 - || OVT.getScalarType() == MVT::i8) { - DST = LowerSDIV24(Op, DAG); - } else { - DST = SDValue(Op.getNode(), 0); - } - return DST; -} - -SDValue -AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const { - EVT OVT = Op.getValueType(); - SDValue DST; - if (OVT.getScalarType() == MVT::i64) { - DST = LowerSREM64(Op, DAG); - } else if (OVT.getScalarType() == MVT::i32) { - DST = LowerSREM32(Op, DAG); - } else if (OVT.getScalarType() == MVT::i16) { - DST = LowerSREM16(Op, DAG); - } else if (OVT.getScalarType() == MVT::i8) { - DST = LowerSREM8(Op, DAG); - } else { - DST = SDValue(Op.getNode(), 0); - } - return DST; -} - -EVT -AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const { - int iSize = (size * numEle); - int vEle = (iSize >> ((size == 64) ? 
6 : 5)); - if (!vEle) { - vEle = 1; - } - if (size == 64) { - if (vEle == 1) { - return EVT(MVT::i64); - } else { - return EVT(MVT::getVectorVT(MVT::i64, vEle)); - } - } else { - if (vEle == 1) { - return EVT(MVT::i32); - } else { - return EVT(MVT::getVectorVT(MVT::i32, vEle)); - } - } -} - -SDValue -AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - SDValue Cond = Op.getOperand(1); - SDValue Jump = Op.getOperand(2); - SDValue Result; - Result = DAG.getNode( - AMDGPUISD::BRANCH_COND, - SDLoc(Op), - Op.getValueType(), - Chain, Jump, Cond); - return Result; -} - -SDValue -AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - MVT INTTY; - MVT FLTTY; - if (!OVT.isVector()) { - INTTY = MVT::i32; - FLTTY = MVT::f32; - } else if (OVT.getVectorNumElements() == 2) { - INTTY = MVT::v2i32; - FLTTY = MVT::v2f32; - } else if (OVT.getVectorNumElements() == 4) { - INTTY = MVT::v4i32; - FLTTY = MVT::v4f32; - } - unsigned bitsize = OVT.getScalarType().getSizeInBits(); - // char|short jq = ia ^ ib; - SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); - - // jq = jq >> (bitsize - 2) - jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); - - // jq = jq | 0x1 - jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); - - // jq = (int)jq - jq = DAG.getSExtOrTrunc(jq, DL, INTTY); - - // int ia = (int)LHS; - SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); - - // int ib, (int)RHS; - SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); - - // float fa = (float)ia; - SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); - - // float fb = (float)ib; - SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); - - // float fq = native_divide(fa, fb); - SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb); - - // fq = trunc(fq); - fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); - - // float fqneg = -fq; - SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); - - // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, - DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); - - // int iq = (int)fq; - SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); - - // fr = fabs(fr); - fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); - - // fb = fabs(fb); - fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); - - // int cv = fr >= fb; - SDValue cv; - if (INTTY == MVT::i32) { - cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); - } else { - cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); - } - // jq = (cv ? jq : 0); - jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, - DAG.getConstant(0, OVT)); - // dst = iq + jq; - iq = DAG.getSExtOrTrunc(iq, DL, OVT); - iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); - return iq; -} - -SDValue -AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - // The LowerSDIV32 function generates equivalent to the following IL. 
- // mov r0, LHS - // mov r1, RHS - // ilt r10, r0, 0 - // ilt r11, r1, 0 - // iadd r0, r0, r10 - // iadd r1, r1, r11 - // ixor r0, r0, r10 - // ixor r1, r1, r11 - // udiv r0, r0, r1 - // ixor r10, r10, r11 - // iadd r0, r0, r10 - // ixor DST, r0, r10 - - // mov r0, LHS - SDValue r0 = LHS; - - // mov r1, RHS - SDValue r1 = RHS; - - // ilt r10, r0, 0 - SDValue r10 = DAG.getSelectCC(DL, - r0, DAG.getConstant(0, OVT), - DAG.getConstant(-1, MVT::i32), - DAG.getConstant(0, MVT::i32), - ISD::SETLT); - - // ilt r11, r1, 0 - SDValue r11 = DAG.getSelectCC(DL, - r1, DAG.getConstant(0, OVT), - DAG.getConstant(-1, MVT::i32), - DAG.getConstant(0, MVT::i32), - ISD::SETLT); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // iadd r1, r1, r11 - r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); - - // ixor r0, r0, r10 - r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - - // ixor r1, r1, r11 - r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); - - // udiv r0, r0, r1 - r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); - - // ixor r10, r10, r11 - r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // ixor DST, r0, r10 - SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - return DST; -} - -SDValue -AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(Op.getNode(), 0); -} - -SDValue -AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - MVT INTTY = MVT::i32; - if (OVT == MVT::v2i8) { - INTTY = MVT::v2i32; - } else if (OVT == MVT::v4i8) { - INTTY = MVT::v4i32; - } - SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); - SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); - LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); - LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); - return LHS; -} - -SDValue -AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - MVT INTTY = MVT::i32; - if (OVT == MVT::v2i16) { - INTTY = MVT::v2i32; - } else if (OVT == MVT::v4i16) { - INTTY = MVT::v4i32; - } - SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); - SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); - LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); - LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); - return LHS; -} - -SDValue -AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - // The LowerSREM32 function generates equivalent to the following IL. 
- // mov r0, LHS - // mov r1, RHS - // ilt r10, r0, 0 - // ilt r11, r1, 0 - // iadd r0, r0, r10 - // iadd r1, r1, r11 - // ixor r0, r0, r10 - // ixor r1, r1, r11 - // udiv r20, r0, r1 - // umul r20, r20, r1 - // sub r0, r0, r20 - // iadd r0, r0, r10 - // ixor DST, r0, r10 - - // mov r0, LHS - SDValue r0 = LHS; - - // mov r1, RHS - SDValue r1 = RHS; - - // ilt r10, r0, 0 - SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT); - - // ilt r11, r1, 0 - SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // iadd r1, r1, r11 - r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); - - // ixor r0, r0, r10 - r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - - // ixor r1, r1, r11 - r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); - - // udiv r20, r0, r1 - SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1); - - // umul r20, r20, r1 - r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1); - - // sub r0, r0, r20 - r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // ixor DST, r0, r10 - SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - return DST; -} - -SDValue -AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(Op.getNode(), 0); -} diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td deleted file mode 100644 index 0f0c88d..0000000 --- a/lib/Target/R600/AMDILInstrInfo.td +++ /dev/null @@ -1,150 +0,0 @@ -//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -// This file describes the AMDIL instructions in TableGen format. 
-// -//===----------------------------------------------------------------------===// -//===--------------------------------------------------------------------===// -// Custom Operands -//===--------------------------------------------------------------------===// -def brtarget : Operand<OtherVT>; - -//===--------------------------------------------------------------------===// -// Custom Selection DAG Type Profiles -//===--------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Generic Profile Types -//===----------------------------------------------------------------------===// - -def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [ - SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> - ]>; -def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [ - SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3> - ]>; -def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [ - SDTCisEltOfVec<1, 0> - ]>; - -//===----------------------------------------------------------------------===// -// Flow Control Profile Types -//===----------------------------------------------------------------------===// -// Branch instruction where second and third are basic blocks -def SDTIL_BRCond : SDTypeProfile<0, 2, [ - SDTCisVT<0, OtherVT> - ]>; - -//===--------------------------------------------------------------------===// -// Custom Selection DAG Nodes -//===--------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Flow Control DAG Nodes -//===----------------------------------------------------------------------===// -def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; - -//===----------------------------------------------------------------------===// -// Call/Return DAG Nodes -//===----------------------------------------------------------------------===// -def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; - -//===--------------------------------------------------------------------===// -// Instructions -//===--------------------------------------------------------------------===// -// Floating point math functions -def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>; - -//===----------------------------------------------------------------------===// -// Integer functions -//===----------------------------------------------------------------------===// -def IL_umul : SDNode<"AMDGPUISD::UMUL" , SDTIntBinOp, - [SDNPCommutative, SDNPAssociative]>; - -//===--------------------------------------------------------------------===// -// Custom Pattern DAG Nodes -//===--------------------------------------------------------------------===// -def global_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast<StoreSDNode>(N)); -}]>; - -//===----------------------------------------------------------------------===// -// Load pattern fragments -//===----------------------------------------------------------------------===// -// Global address space loads -def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isGlobalLoad(dyn_cast<LoadSDNode>(N)); -}]>; -// Constant address space loads -def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); -}]>; - -//===----------------------------------------------------------------------===// -// 
Complex addressing mode patterns -//===----------------------------------------------------------------------===// -def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>; -def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>; -def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>; -def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>; - -//===----------------------------------------------------------------------===// -// Instruction format classes -//===----------------------------------------------------------------------===// -class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> -: Instruction { - - let Namespace = "AMDGPU"; - dag OutOperandList = outs; - dag InOperandList = ins; - let Pattern = pattern; - let AsmString = !strconcat(asmstr, "\n"); - let isPseudo = 1; - let Itinerary = NullALU; - bit hasIEEEFlag = 0; - bit hasZeroOpFlag = 0; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; -} - -//===--------------------------------------------------------------------===// -// Multiclass Instruction formats -//===--------------------------------------------------------------------===// -// Multiclass that handles branch instructions -multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> { - def _i32 : ILFormat<(outs), - (ins brtarget:$target, rci:$src0), - "; i32 Pseudo branch instruction", - [(Op bb:$target, (i32 rci:$src0))]>; - def _f32 : ILFormat<(outs), - (ins brtarget:$target, rcf:$src0), - "; f32 Pseudo branch instruction", - [(Op bb:$target, (f32 rcf:$src0))]>; -} - -// Only scalar types should generate flow control -multiclass BranchInstr<string name> { - def _i32 : ILFormat<(outs), (ins GPRI32:$src), - !strconcat(name, " $src"), []>; - def _f32 : ILFormat<(outs), (ins GPRF32:$src), - !strconcat(name, " $src"), []>; -} -// Only scalar types should generate flow control -multiclass BranchInstr2<string name> { - def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1), - !strconcat(name, " $src0, $src1"), []>; - def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1), - !strconcat(name, " $src0, $src1"), []>; -} - -//===--------------------------------------------------------------------===// -// Intrinsics support -//===--------------------------------------------------------------------===// -include "AMDILIntrinsics.td" diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td deleted file mode 100644 index 4a3e02e..0000000 --- a/lib/Target/R600/AMDILIntrinsics.td +++ /dev/null @@ -1,224 +0,0 @@ -//===- AMDILIntrinsics.td - Defines AMDIL Intrinscs -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -// This file defines all of the amdil-specific intrinsics -// -//===---------------------------------------------------------------===// -//===--------------------------------------------------------------------===// -// Intrinsic classes -// Generic versions of the above classes but for Target specific intrinsics -// instead of SDNode patterns. 
-//===--------------------------------------------------------------------===// -let TargetPrefix = "AMDIL", isTarget = 1 in { - class VoidIntLong : - Intrinsic<[llvm_i64_ty], [], []>; - class VoidIntInt : - Intrinsic<[llvm_i32_ty], [], []>; - class VoidIntBool : - Intrinsic<[llvm_i32_ty], [], []>; - class UnaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class UnaryIntFloat : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class ConvertIntFTOI : - Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; - class ConvertIntITOF : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>; - class UnaryIntNoRetInt : - Intrinsic<[], [llvm_anyint_ty], []>; - class UnaryIntNoRetFloat : - Intrinsic<[], [llvm_anyfloat_ty], []>; - class BinaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class BinaryIntFloat : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class BinaryIntNoRetInt : - Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>; - class BinaryIntNoRetFloat : - Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>; - class TernaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class TernaryIntFloat : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class QuaternaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class UnaryAtomicInt : - Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class BinaryAtomicInt : - Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class TernaryAtomicInt : - Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - class UnaryAtomicIntNoRet : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class BinaryAtomicIntNoRet : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class TernaryAtomicIntNoRet : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; -} - -let TargetPrefix = "AMDIL", isTarget = 1 in { - def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt; - - def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">, - UnaryIntInt; - def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">, - UnaryIntInt; - def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">, - UnaryIntInt; - def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">, - UnaryIntInt; - def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">, - UnaryIntInt; - def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">, - TernaryIntInt; - def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">, - TernaryIntInt; - def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">, - QuaternaryIntInt; - def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">, - TernaryIntInt; - def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">, - BinaryIntInt; - def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">, - BinaryIntInt; - def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, - BinaryIntInt; - def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">, - BinaryIntInt; - def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, - BinaryIntInt; - def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">, - BinaryIntInt; - 
def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">, - BinaryIntInt; - def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">, - BinaryIntInt; - def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">, - BinaryIntInt; - def int_AMDIL_min : GCCBuiltin<"__amdil_min">, - BinaryIntFloat; - def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">, - BinaryIntInt; - def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">, - BinaryIntInt; - def int_AMDIL_max : GCCBuiltin<"__amdil_max">, - BinaryIntFloat; - def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">, - TernaryIntInt; - def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">, - TernaryIntInt; - def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">, - TernaryIntInt; - def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">, - UnaryIntFloat; - def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">, - TernaryIntFloat; - def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">, - UnaryIntFloat; - def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">, - UnaryIntFloat; - def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">, - UnaryIntFloat; - def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">, - UnaryIntFloat; - def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">, - UnaryIntFloat; - def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">, - UnaryIntFloat; - def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">, - UnaryIntFloat; - def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">, - UnaryIntFloat; - def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">, - UnaryIntFloat; - def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">, - UnaryIntFloat; - def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">, - UnaryIntFloat; - def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">, - UnaryIntFloat; - def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat; - def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat; - def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt; - def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">, - UnaryIntFloat; - def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">, - UnaryIntFloat; - def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">, - UnaryIntFloat; - def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">, - UnaryIntFloat; - def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">, - UnaryIntFloat; - def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">, - UnaryIntFloat; - def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">, - UnaryIntFloat; - def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">, - UnaryIntFloat; - def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">, - TernaryIntFloat; - def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">, - UnaryIntFloat; - def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">, - UnaryIntFloat; - def int_AMDIL_length : GCCBuiltin<"__amdil_length">, - UnaryIntFloat; - def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">, - TernaryIntFloat; - def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">, - Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, - llvm_v4i32_ty, llvm_i32_ty], []>; - - def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">, - Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>; - def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">, - Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>; - def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">, - Intrinsic<[llvm_double_ty], [llvm_double_ty], []>; - def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">, - ConvertIntITOF; - def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_i32_rpi : 
GCCBuiltin<"__amdil_float_to_int_rpi">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">, - ConvertIntFTOI; - def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">, - Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>; - def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">, - ConvertIntITOF; - def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">, - ConvertIntITOF; - def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">, - ConvertIntITOF; - def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">, - ConvertIntITOF; - def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">, - Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, - llvm_v2f32_ty, llvm_float_ty], []>; - def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">, - Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, - llvm_v2f32_ty], []>; - def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">, - Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, - llvm_v4f32_ty], []>; - def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">, - Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, - llvm_v4f32_ty], []>; -} diff --git a/lib/Target/R600/AMDILRegisterInfo.td b/lib/Target/R600/AMDILRegisterInfo.td deleted file mode 100644 index b9d0334..0000000 --- a/lib/Target/R600/AMDILRegisterInfo.td +++ /dev/null @@ -1,107 +0,0 @@ -//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//==-----------------------------------------------------------------------===// -// -// Declarations that describe the AMDIL register file -// -//===----------------------------------------------------------------------===// - -class AMDILReg<bits<16> num, string n> : Register<n> { - field bits<16> Value; - let Value = num; - let Namespace = "AMDGPU"; -} - -// We will start with 8 registers for each class before expanding to more -// Since the swizzle is added based on the register class, we can leave it -// off here and just specify different registers for different register classes -def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>; -def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>; -def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>; -def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>; -def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>; -def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>; -def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>; -def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>; -def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>; -def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>; -def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>; -def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>; -def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>; -def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>; -def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>; -def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>; -def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>; -def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>; -def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>; -def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>; - -// All registers between 1000 and 1024 are reserved and cannot be used -// unless commented in this section -// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's -// r1020 is used to hold the frame index for local arrays -// r1019 is used to hold the dynamic stack allocation pointer -// r1018 is used as a temporary register for handwritten code -// r1017 is used as a temporary register for handwritten code -// r1016 is used as a temporary register for load/store code -// r1015 is used as a temporary register for data segment offset -// r1014 is used as a temporary register for store code -// r1013 is used as the section data pointer register -// r1012-r1010 and r1001-r1008 are used for temporary I/O registers -// r1009 is used as the frame pointer register -// r999 is used as the mem register. -// r998 is used as the return address register. 
-//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>; -//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>; -//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>; -//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>; -//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>; -//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>; -def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>; -def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>; -def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>; -def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>; -def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>; -def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>; -def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>; -def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>; -def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>; -def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>; -def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>; -def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>; -def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>; -def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>; -def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>; -def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>; -def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>; -def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>; -def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>; -def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>; -def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>; -def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>; -def GPRI16 : RegisterClass<"AMDGPU", [i16], 16, - (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { - let AltOrders = [(add (sequence "R%u", 1, 20))]; - let AltOrderSelect = [{ - return 1; - }]; - } -def GPRI32 : RegisterClass<"AMDGPU", [i32], 32, - (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { - let AltOrders = [(add (sequence "R%u", 1, 20))]; - let AltOrderSelect = [{ - return 1; - }]; - } -def GPRF32 : RegisterClass<"AMDGPU", [f32], 32, - (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { - let AltOrders = [(add (sequence "R%u", 1, 20))]; - let AltOrderSelect = [{ - return 1; - }]; - } diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 3c6fa5a..4d16082 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -13,10 +13,9 @@ add_public_tablegen_target(AMDGPUCommonTableGen) add_llvm_target(R600CodeGen AMDILCFGStructurizer.cpp - AMDILIntrinsicInfo.cpp - AMDILISelLowering.cpp AMDGPUAsmPrinter.cpp AMDGPUFrameLowering.cpp + AMDGPUIntrinsicInfo.cpp AMDGPUISelDAGToDAG.cpp AMDGPUMCInstLower.cpp AMDGPUMachineFunction.cpp @@ -24,8 +23,8 @@ add_llvm_target(R600CodeGen AMDGPUTargetMachine.cpp AMDGPUTargetTransformInfo.cpp AMDGPUISelLowering.cpp - AMDGPUConvertToISA.cpp AMDGPUInstrInfo.cpp + AMDGPUPromoteAlloca.cpp AMDGPURegisterInfo.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp @@ -41,6 +40,7 @@ add_llvm_target(R600CodeGen R600TextureIntrinsicsReplacer.cpp SIAnnotateControlFlow.cpp SIFixSGPRCopies.cpp + SIFixSGPRLiveRanges.cpp SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp diff --git a/lib/Target/R600/EvergreenInstructions.td 
b/lib/Target/R600/EvergreenInstructions.td index 2065441..dcb7e98 100644 --- a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -295,7 +295,7 @@ def : Pat<(i32 (sext_inreg i32:$src, i8)), def : Pat<(i32 (sext_inreg i32:$src, i16)), (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; -defm : BFIPatterns <BFI_INT_eg>; +defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32>; def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], @@ -326,6 +326,8 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; + let hasSideEffects = 1 in { def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; } @@ -346,7 +348,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; def GROUP_BARRIER : InstR600 < - (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local)], AnyALU>, + (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, R600ALU_Word0, R600ALU_Word1_OP2 <0x54> { @@ -375,6 +377,11 @@ def GROUP_BARRIER : InstR600 < let ALUInst = 1; } +def : Pat < + (int_AMDGPU_barrier_global), + (GROUP_BARRIER) +>; + //===----------------------------------------------------------------------===// // LDS Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 11ae091..0927040 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -99,9 +99,9 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { return; } - // The low 8 bits encoding value is the register index, for both VGPRs and - // SGPRs. - unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + // The low 8 bits of the encoding value is the register index, for both VGPRs + // and SGPRs. 
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); if (NumRegs == 1) { O << Type << RegIdx; return; @@ -216,13 +216,8 @@ void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - union Literal { - float f; - int32_t i; - } L; - - L.i = MI->getOperand(OpNo).getImm(); - O << L.i << "(" << L.f << ")"; + int32_t Imm = MI->getOperand(OpNo).getImm(); + O << Imm << '(' << BitsToFloat(Imm) << ')'; } void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 5e7cefe..dc1344f 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -172,17 +172,13 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixup, const MCSubtargetInfo &STI) const { if (MO.isReg()) { - if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) { + if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) return MRI.getEncodingValue(MO.getReg()); - } else { - return getHWReg(MO.getReg()); - } - } else if (MO.isImm()) { - return MO.getImm(); - } else { - assert(0); - return 0; + return getHWReg(MO.getReg()); } + + assert(MO.isImm()); + return MO.getImm(); } #include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index d255e96..d98a6db 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/Debug.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index d6c6830..7f3560a 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -13,6 +13,9 @@ //===----------------------------------------------------------------------===// #include "R600ISelLowering.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" @@ -65,6 +68,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::BR_CC, MVT::i32, Expand); setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::FSUB, MVT::f32, Expand); @@ -133,19 +137,47 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::SELECT_CC); 
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setOperationAction(ISD::SUB, MVT::i64, Expand); + // These should be replaced by UDVIREM, but it does not happen automatically // during Type Legalization setOperationAction(ISD::UDIV, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); + setOperationAction(ISD::SDIV, MVT::i64, Custom); + setOperationAction(ISD::SREM, MVT::i64, Custom); + + // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 + // to be Legal/Custom in order to avoid library calls. + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); + } + setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); setSchedulingPreference(Sched::Source); @@ -537,11 +569,24 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); case ISD::FCOS: case ISD::FSIN: return LowerTrig(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::LOAD: { + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; + } + + case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); @@ -776,6 +821,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T0_Z, VT); + case Intrinsic::AMDGPU_rsq: + // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. 
+ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); } // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) break; @@ -793,20 +841,172 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); return; - case ISD::LOAD: { - SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); - Results.push_back(SDValue(Node, 0)); - Results.push_back(SDValue(Node, 1)); - // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode - // function - DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); - return; + case ISD::UDIV: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM); + break; } - case ISD::STORE: - SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode(); - Results.push_back(SDValue(Node, 0)); - return; + case ISD::UREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM.getValue(1)); + break; + } + case ISD::SDIV: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(SDIVREM); + break; + } + case ISD::SREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(SDIVREM.getValue(1)); + break; + } + case ISD::SDIVREM: { + SDValue Op = SDValue(N, 1); + SDValue RES = LowerSDIVREM(Op, DAG); + Results.push_back(RES); + Results.push_back(RES.getValue(1)); + break; + } + case ISD::UDIVREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + SDValue one = DAG.getConstant(1, HalfVT); + SDValue zero = DAG.getConstant(0, HalfVT); + + //HiLo split + SDValue LHS = N->getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + + SDValue RHS = N->getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); + + SDValue REM_Hi = zero; + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; + + const unsigned halfBitWidth = HalfVT.getSizeInBits(); + + for (unsigned i = 0; i < halfBitWidth; ++i) { + SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); + // Get Value of high bit + SDValue HBit; + if (halfBitWidth == 32 && Subtarget->hasBFE()) { + HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); + } else { + HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + } + + SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, + 
DAG.getConstant(halfBitWidth - 1, HalfVT)); + REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); + REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); + + REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); + REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); + + + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + + SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); + + DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); + + // Update REM + + SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); + + REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); + REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); + REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); + } + + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + Results.push_back(DIV); + Results.push_back(REM); + break; } + } +} + +SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, + SDValue Vector) const { + + SDLoc DL(Vector); + EVT VecVT = Vector.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + SmallVector<SDValue, 8> Args; + + for (unsigned i = 0, e = VecVT.getVectorNumElements(); + i != e; ++i) { + Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, + Vector, DAG.getConstant(i, getVectorIdxTy()))); + } + + return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); +} + +SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Index = Op.getOperand(1); + + if (isa<ConstantSDNode>(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Index); +} + +SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + + if (isa<ConstantSDNode>(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Value, Index); + return vectorToVerticalVector(DAG, Insert); } SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { @@ -840,6 +1040,80 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { DAG.getConstantFP(3.14159265359, MVT::f32)); } +SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, VT); + SDValue One = DAG.getConstant(1, VT); + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. 
So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); + Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); + HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); + SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); + + SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); + SDValue LoBig = Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + +SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, VT); + SDValue One = DAG.getConstant(1, VT); + + const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); + Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); + SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); + LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); + + SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); + SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode( ISD::SETCC, @@ -1369,6 +1643,15 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const return DAG.getMergeValues(Ops, DL); } +SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Jump = Op.getOperand(2); + + return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), + Chain, Jump, Cond); +} + /// XXX Only kernel functions are supported, so we can assume for now that /// every function is a kernel function, but in the future we should use /// separate calling conventions for kernel and non-kernel functions. 
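// As a reading aid for LowerSHLParts/LowerSRXParts above, here is a minimal
// scalar model of the two-step shift trick, assuming 32-bit parts and
// Shift < 64 as the *_PARTS contract requires; shl64 is an illustrative
// name, not something defined by this patch. Shifting Lo right by
// (31 - Shift) and then by one more stands in for a single shift by
// (32 - Shift), which would be out of range when Shift == 0.
#include <cstdint>

uint64_t shl64(uint32_t Lo, uint32_t Hi, uint32_t Shift) {
  uint32_t HiRes, LoRes;
  if (Shift < 32) { // the DAG code uses selects (SETULT) instead of a branch
    uint32_t Overflow = (Lo >> (31 - Shift)) >> 1; // never a shift by 32
    HiRes = (Hi << Shift) | Overflow;
    LoRes = Lo << Shift;
  } else {          // "big" shift: the low part moves entirely into Hi
    HiRes = Lo << (Shift - 32);
    LoRes = 0;
  }
  return (uint64_t(HiRes) << 32) | LoRes;
}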
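// Similarly, the ISD::UDIVREM expansion in ReplaceNodeResults further above
// is a bit-serial restoring division. A compact scalar model of the same
// algorithm, assuming a nonzero divisor (udivrem64, Quot and Rem are
// illustrative names): divide the high half speculatively, then feed the
// low half's bits into the remainder one at a time, subtracting RHS
// whenever it fits. The DAG version keeps the remainder split into HalfVT
// pieces and uses BFE_U32 to extract each bit when the subtarget has it.
#include <cstdint>
#include <utility>

std::pair<uint64_t, uint64_t> udivrem64(uint64_t LHS, uint64_t RHS) {
  uint32_t LHSLo = uint32_t(LHS);
  uint32_t LHSHi = uint32_t(LHS >> 32);
  uint32_t RHSHi = uint32_t(RHS >> 32);

  // Speculative values: when RHS fits in 32 bits the high half can be
  // divided directly; otherwise the quotient's high word is zero and all
  // of LHSHi flows into the running remainder.
  uint64_t Rem = RHSHi == 0 ? LHSHi % uint32_t(RHS) : LHSHi;
  uint32_t QuotHi = RHSHi == 0 ? LHSHi / uint32_t(RHS) : 0;
  uint32_t QuotLo = 0;

  for (int i = 31; i >= 0; --i) {
    Rem = (Rem << 1) | ((LHSLo >> i) & 1); // shift in the next dividend bit
    if (Rem >= RHS) {                      // the DAG code does this with
      Rem -= RHS;                          // getSelectCC on REM and RHS
      QuotLo |= uint32_t(1) << i;
    }
  }
  return {(uint64_t(QuotHi) << 32) | QuotLo, Rem};
}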
@@ -1902,9 +2185,8 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SDValue FakeOp; std::vector<SDValue> Ops; - for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end(); - I != E; ++I) - Ops.push_back(*I); + for (const SDUse &I : Node->ops()) + Ops.push_back(I); if (Opcode == AMDGPU::DOT_4) { int OperandIdx[] = { diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index a8a464f..d22c8c9 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -51,15 +51,18 @@ private: void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned dword_offset) const; SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const; + SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; - /// \brief Lower ROTL opcode to BITALIGN - SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; - + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, SelectionDAG &DAG) const; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index b0d9ae3..3972e2f 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -28,10 +28,9 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenDFAPacketizer.inc" -R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm) - : AMDGPUInstrInfo(tm), - RI(tm), - ST(tm.getSubtarget<AMDGPUSubtarget>()) +R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), + RI(st) { } const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { @@ -52,11 +51,15 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { unsigned VectorComponents = 0; - if (AMDGPU::R600_Reg128RegClass.contains(DestReg) && - AMDGPU::R600_Reg128RegClass.contains(SrcReg)) { + if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { VectorComponents = 4; - } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) && - AMDGPU::R600_Reg64RegClass.contains(SrcReg)) { + } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { VectorComponents = 2; } @@ -768,16 +771,6 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return true; } -int R600InstrInfo::getBranchInstr(const MachineOperand &op) const { - const MachineInstr *MI = op.getParent(); - - switch (MI->getDesc().OpInfo->RegClass) { - default: // FIXME: fallthrough?? 
- case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32; - case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32; - }; -} - static MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); @@ -1064,10 +1057,34 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return 2; } +bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + + switch(MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + case AMDGPU::R600_EXTRACT_ELT_V2: + case AMDGPU::R600_EXTRACT_ELT_V4: + buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(2).getReg(), + RI.getHWRegChan(MI->getOperand(1).getReg())); + break; + case AMDGPU::R600_INSERT_ELT_V2: + case AMDGPU::R600_INSERT_ELT_V4: + buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(3).getReg(), // Offset + RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel + break; + } + MI->eraseFromParent(); + return true; +} + void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const { const AMDGPUFrameLowering *TFL = - static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering()); + static_cast<const AMDGPUFrameLowering*>( + MF.getTarget().getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); int End = getIndirectIndexEnd(MF); @@ -1100,7 +1117,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { - unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); setImmOperand(MOVA, AMDGPU::OpName::write, 0); @@ -1117,7 +1149,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { - unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: 
AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); @@ -1220,7 +1267,6 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( const { assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); unsigned Opcode; - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::R700) Opcode = AMDGPU::DOT4_r600; else diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index b5304a0..45a57d3 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -32,12 +32,22 @@ namespace llvm { class R600InstrInfo : public AMDGPUInstrInfo { private: const R600RegisterInfo RI; - const AMDGPUSubtarget &ST; - int getBranchInstr(const MachineOperand &op) const; std::vector<std::pair<int, unsigned> > ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; public: enum BankSwizzle { ALU_VEC_012_SCL_210 = 0, @@ -48,7 +58,7 @@ namespace llvm { ALU_VEC_210 }; - explicit R600InstrInfo(AMDGPUTargetMachine &tm); + explicit R600InstrInfo(const AMDGPUSubtarget &st); const R600RegisterInfo &getRegisterInfo() const override; void copyPhysReg(MachineBasicBlock &MBB, @@ -197,6 +207,8 @@ namespace llvm { int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const override { return 1;} + virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + /// \brief Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const; diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 590fde2..73fa345 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -125,7 +125,7 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern, class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node, InstrItinClass itin = AnyALU> : R600_1OP <inst, opName, - [(set R600_Reg32:$dst, (node R600_Reg32:$src0))] + [(set R600_Reg32:$dst, (node R600_Reg32:$src0))], itin >; // If you add or change the operands for R600_2OP instructions, you must @@ -161,10 +161,10 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern, } class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node, - InstrItinClass itim = AnyALU> : + InstrItinClass itin = AnyALU> : R600_2OP <inst, opName, [(set R600_Reg32:$dst, (node R600_Reg32:$src0, - R600_Reg32:$src1))] + R600_Reg32:$src1))], itin >; // If you add or change the operands for R600_3OP instructions, you must @@ -721,14 +721,11 @@ def SETNE_DX10 : R600_2OP < >; def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; -def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>; +def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; -// Add also ftrunc intrinsic pattern -def : Pat<(ftrunc f32:$src0), (TRUNC $src0)>; - def MOV : R600_1OP <0x19, "MOV", []>; let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { @@ -1082,18 +1079,21 @@ class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper < let Itinerary = TransALU; } +// Clamped to maximum. class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper < - inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped > { let Itinerary = TransALU; } -class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP < - inst, "RECIPSQRT_IEEE", [] +class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy > { let Itinerary = TransALU; } +// TODO: There is also RECIPSQRT_FF which clamps to zero.
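// A rough scalar sketch of the "clamped to maximum" behavior referred to
// above, under the assumption (not stated in this patch) that the clamped
// variant pins the infinite results of 1/sqrt(x) to +/-FLT_MAX; rsqClamped
// is an illustrative name.
#include <cfloat>
#include <cmath>

float rsqClamped(float X) {
  float R = 1.0f / std::sqrt(X); // +inf for X == +0.0f, -inf for -0.0f
  return std::isinf(R) ? std::copysign(FLT_MAX, R) : R;
}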
+ class SIN_Common <bits<11> inst> : R600_1OP < inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{ let Trig = 1; @@ -1266,13 +1266,6 @@ defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>; //===----------------------------------------------------------------------===// -// Branch Instructions -//===----------------------------------------------------------------------===// - -def IF_PREDICATE_SET : ILFormat<(outs), (ins GPRI32:$src), - "IF_PREDICATE_SET $src", []>; - -//===----------------------------------------------------------------------===// // Pseudo instructions //===----------------------------------------------------------------------===// @@ -1345,15 +1338,6 @@ def TXD_SHADOW: InstR600 < } // End isPseudo = 1 } // End usesCustomInserter = 1 -//===---------------------------------------------------------------------===// -// Return instruction -//===---------------------------------------------------------------------===// -let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, - usesCustomInserter = 1 in { - def RETURN : ILFormat<(outs), (ins variable_ops), - "RETURN", [(IL_retflag)]>; -} - //===----------------------------------------------------------------------===// // Constant Buffer Addressing Support @@ -1480,11 +1464,52 @@ let Inst{63-32} = Word1; let VTXInst = 1; } +//===---------------------------------------------------------------------===// +// Flow and Program control Instructions +//===---------------------------------------------------------------------===// +class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> { + def _i32 : ILFormat<(outs), + (ins brtarget:$target, rci:$src0), + "; i32 Pseudo branch instruction", + [(Op bb:$target, (i32 rci:$src0))]>; + def _f32 : ILFormat<(outs), + (ins brtarget:$target, rcf:$src0), + "; f32 Pseudo branch instruction", + [(Op bb:$target, (f32 rcf:$src0))]>; +} + +// Only scalar types should generate flow control +multiclass BranchInstr<string name> { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; +} +// Only scalar types should generate flow control +multiclass BranchInstr2<string name> { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; +} - -//===--------------------------------------------------------------------===// -// Instructions support -//===--------------------------------------------------------------------===// //===---------------------------------------------------------------------===// // Custom Inserter for Branches and returns, this eventually will be a // separate pass @@ -1497,13 +1522,22 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { } //===---------------------------------------------------------------------===// -// Flow and Program control Instructions +// Return instruction 
//===---------------------------------------------------------------------===// +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, + usesCustomInserter = 1 in { + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(IL_retflag)]>; +} + +//===----------------------------------------------------------------------===// +// Branch Instructions +//===----------------------------------------------------------------------===// + +def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src), + "IF_PREDICATE_SET $src", []>; + let isTerminator=1 in { - def SWITCH : ILFormat< (outs), (ins GPRI32:$src), - !strconcat("SWITCH", " $src"), []>; - def CASE : ILFormat< (outs), (ins GPRI32:$src), - !strconcat("CASE", " $src"), []>; def BREAK : ILFormat< (outs), (ins), "BREAK", []>; def CONTINUE : ILFormat< (outs), (ins), @@ -1548,6 +1582,60 @@ let isTerminator=1 in { } //===----------------------------------------------------------------------===// +// Indirect addressing pseudo instructions +//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +class ExtractVertical <RegisterClass vec_rc> : InstR600 < + (outs R600_Reg32:$dst), + (ins vec_rc:$vec, R600_Reg32:$index), "", + [], + AnyALU +>; + +let Constraints = "$dst = $vec" in { + +class InsertVertical <RegisterClass vec_rc> : InstR600 < + (outs vec_rc:$dst), + (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "", + [], + AnyALU +>; + +} // End Constraints = "$dst = $vec" + +} // End isPseudo = 1 + +def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>; +def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>; + +def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>; +def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>; + +class ExtractVerticalPat <Instruction inst, ValueType vec_ty, + ValueType scalar_ty> : Pat < + (scalar_ty (extractelt vec_ty:$vec, i32:$index)), + (inst $vec, $index) +>; + +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>; + +class InsertVerticalPat <Instruction inst, ValueType vec_ty, + ValueType scalar_ty> : Pat < + (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)), + (inst $vec, $value, $index) +>; + +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>; +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>; +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>; +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>; + +//===----------------------------------------------------------------------===// // ISel Patterns //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index d1655d1..7ea654c 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "R600MachineScheduler.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Pass.h" diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index c2f6c03..74cf309 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -16,6 +16,7 @@ #include 
"llvm/Support/Debug.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp index f3bb88b..dc95675 100644 --- a/lib/Target/R600/R600RegisterInfo.cpp +++ b/lib/Target/R600/R600RegisterInfo.cpp @@ -20,15 +20,14 @@ using namespace llvm; -R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm) -: AMDGPURegisterInfo(tm), - TM(tm) +R600RegisterInfo::R600RegisterInfo(const AMDGPUSubtarget &st) +: AMDGPURegisterInfo(st) { RCW.RegWeight = 0; RCW.WeightLimit = 0;} BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo()); + const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(ST.getInstrInfo()); Reserved.set(AMDGPU::ZERO); Reserved.set(AMDGPU::HALF); @@ -55,16 +54,6 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } -const TargetRegisterClass * -R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const { - switch (rc->getID()) { - case AMDGPU::GPRF32RegClassID: - case AMDGPU::GPRI32RegClassID: - return &AMDGPU::R600_Reg32RegClass; - default: return rc; - } -} - unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; } diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h index 52e1a4b..247808b 100644 --- a/lib/Target/R600/R600RegisterInfo.h +++ b/lib/Target/R600/R600RegisterInfo.h @@ -16,26 +16,18 @@ #define R600REGISTERINFO_H_ #include "AMDGPURegisterInfo.h" -#include "AMDGPUTargetMachine.h" namespace llvm { -class R600TargetMachine; +class AMDGPUSubtarget; struct R600RegisterInfo : public AMDGPURegisterInfo { - AMDGPUTargetMachine &TM; RegClassWeight RCW; - R600RegisterInfo(AMDGPUTargetMachine &tm); + R600RegisterInfo(const AMDGPUSubtarget &st); BitVector getReservedRegs(const MachineFunction &MF) const override; - /// \param RC is an AMDIL reg class. - /// - /// \returns the R600 reg class that is equivalent to \p RC. - const TargetRegisterClass *getISARegClass( - const TargetRegisterClass *RC) const override; - /// \brief get the HW encoding for a register's channel. 
unsigned getHWRegChan(unsigned reg) const; diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index 68bcd20..cc667d9 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -18,18 +18,28 @@ class R600RegWithChan <string name, bits<9> sel, string chan> : class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1, sub2, sub3]; - let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; } class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; } +class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 < + "V"#lo#hi#"_"#chan, + [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)], + lo +>; foreach Index = 0-127 in { foreach Chan = [ "X", "Y", "Z", "W" ] in { @@ -54,6 +64,24 @@ foreach Index = 0-127 in { Index>; } +foreach Chan = [ "X", "Y", "Z", "W"] in { + + let chan_encoding = !if(!eq(Chan, "X"), 0, + !if(!eq(Chan, "Y"), 1, + !if(!eq(Chan, "Z"), 2, + !if(!eq(Chan, "W"), 3, 0)))) in { + def V0123_#Chan : R600Reg_128 <"V0123_"#Chan, + [!cast<Register>("T0_"#Chan), + !cast<Register>("T1_"#Chan), + !cast<Register>("T2_"#Chan), + !cast<Register>("T3_"#Chan)], + 0>; + def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>; + def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>; + } +} + + // KCACHE_BANK0 foreach Index = 159-128 in { foreach Chan = [ "X", "Y", "Z", "W" ] in { @@ -130,8 +158,14 @@ def ALU_PARAM : R600Reg<"Param", 0>; let isAllocatable = 0 in { -// XXX: Only use the X channel, until we support wider stack widths -def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>; +def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; + +// We only use Addr_[YZW] for vertical vectors. +// FIXME: if we add more vertical vector registers we will need to add more +// registers to these classes.
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>; +def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>; +def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>; def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32, (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>; @@ -206,5 +240,13 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, let CopyCost = -1; } +def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add V0123_W, V0123_Z, V0123_Y, V0123_X) +>; + def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, (add (sequence "T%u_XY", 0, 63))>; + +def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add V01_X, V01_Y, V01_Z, V01_W, + V23_X, V23_Y, V23_Z, V23_W)>; diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp index d6e4451..91eb60b 100644 --- a/lib/Target/R600/SIAnnotateControlFlow.cpp +++ b/lib/Target/R600/SIAnnotateControlFlow.cpp @@ -65,7 +65,6 @@ class SIAnnotateControlFlow : public FunctionPass { DominatorTree *DT; StackVector Stack; - SSAUpdater PhiInserter; bool isTopOfStack(BasicBlock *BB); @@ -81,7 +80,7 @@ class SIAnnotateControlFlow : public FunctionPass { void insertElse(BranchInst *Term); - void handleLoopCondition(Value *Cond); + Value *handleLoopCondition(Value *Cond, PHINode *Broken); void handleLoop(BranchInst *Term); @@ -177,7 +176,7 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) { } else { if (Phi->getIncomingValue(i) != BoolFalse) return false; - + } } return true; @@ -204,20 +203,26 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) { } /// \brief Recursively handle the condition leading to a loop -void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) { +Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken) { if (PHINode *Phi = dyn_cast<PHINode>(Cond)) { + BasicBlock *Parent = Phi->getParent(); + PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); + Value *Ret = NewPhi; // Handle all non-constant incoming values first for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { Value *Incoming = Phi->getIncomingValue(i); - if (isa<ConstantInt>(Incoming)) + BasicBlock *From = Phi->getIncomingBlock(i); + if (isa<ConstantInt>(Incoming)) { + NewPhi->addIncoming(Broken, From); continue; + } Phi->setIncomingValue(i, BoolFalse); - handleLoopCondition(Incoming); + Value *PhiArg = handleLoopCondition(Incoming, Broken); + NewPhi->addIncoming(PhiArg, From); } - BasicBlock *Parent = Phi->getParent(); BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { @@ -230,33 +235,28 @@ void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) { if (From == IDom) { CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt()); if (OldEnd && OldEnd->getCalledFunction() == EndCf) { - Value *Args[] = { - OldEnd->getArgOperand(0), - PhiInserter.GetValueAtEndOfBlock(Parent) - }; - Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); - PhiInserter.AddAvailableValue(Parent, Ret); + Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; + Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); continue; } } - TerminatorInst *Insert = From->getTerminator(); - Value *Arg = PhiInserter.GetValueAtEndOfBlock(From); - Value *Ret = CallInst::Create(Break, Arg, "", Insert); - PhiInserter.AddAvailableValue(From, Ret); + Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); + 
NewPhi->setIncomingValue(i, PhiArg); } eraseIfUnused(Phi); + return Ret; } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { BasicBlock *Parent = Inst->getParent(); TerminatorInst *Insert = Parent->getTerminator(); - Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) }; - Value *Ret = CallInst::Create(IfBreak, Args, "", Insert); - PhiInserter.AddAvailableValue(Parent, Ret); + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Insert); } else { llvm_unreachable("Unhandled loop condition!"); } + return 0; } /// \brief Handle a back edge (loop) @@ -264,15 +264,11 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { BasicBlock *Target = Term->getSuccessor(1); PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); - PhiInserter.Initialize(Int64, ""); - PhiInserter.AddAvailableValue(Target, Broken); - Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); - handleLoopCondition(Cond); + Value *Arg = handleLoopCondition(Cond, Broken); BasicBlock *BB = Term->getParent(); - Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB); for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); PI != PE; ++PI) { diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h index 2cbce28..4d31a11 100644 --- a/lib/Target/R600/SIDefines.h +++ b/lib/Target/R600/SIDefines.h @@ -35,4 +35,54 @@ enum { #define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC + +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B848_VGPRS 0xFFFFFFC0 +#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B848_SGPRS 0xFFFFFC3F +#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B848_PRIORITY 0xFFFFF3FF +#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B848_FLOAT_MODE 0xFFF00FFF +#define S_00B848_PRIV(x) (((x) & 0x1) << 20) +#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B848_PRIV 0xFFEFFFFF +#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B848_DX10_CLAMP 0xFFDFFFFF +#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B848_DEBUG_MODE 0xFFBFFFFF +#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B848_IEEE_MODE 0xFF7FFFFF + + +// Helpers for setting FLOAT_MODE +#define FP_ROUND_ROUND_TO_NEAREST 0 +#define FP_ROUND_ROUND_TO_INF 1 +#define FP_ROUND_ROUND_TO_NEGINF 2 +#define FP_ROUND_ROUND_TO_ZERO 3 + +// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double +// precision. +#define FP_ROUND_MODE_SP(x) ((x) & 0x3) +#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2) + +#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0 +#define FP_DENORM_FLUSH_OUT 1 +#define FP_DENORM_FLUSH_IN 2 +#define FP_DENORM_FLUSH_NONE 3 + + +// Bits 7:4 control denormal handling. 5:4 control single precision, 7:6 double +// precision.
+#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) +#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) + #endif // SIDEFINES_H_ diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp new file mode 100644 index 0000000..7d116ee --- /dev/null +++ b/lib/Target/R600/SIFixSGPRLiveRanges.cpp @@ -0,0 +1,110 @@ +//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// SALU instructions ignore control flow, so we need to modify the live ranges +/// of the registers they define. +/// +/// The strategy is to view the entire program as if it were a single basic +/// block and calculate the intervals accordingly. We implement this +/// by walking this list of segments for each LiveRange and setting the +/// end of each segment equal to the start of the segment that immediately +/// follows it. + +#include "AMDGPU.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-sgpr-live-ranges" + +namespace { + +class SIFixSGPRLiveRanges : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixSGPRLiveRanges() : MachineFunctionPass(ID) { + initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnMachineFunction(MachineFunction &MF) override; + + virtual const char *getPassName() const override { + return "SI Fix SGPR live ranges"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); + AU.addPreserved<SlotIndexes>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
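// A minimal standalone model of the segment flattening described in the
// file comment above; Segment and flattenSegments are illustrative names,
// while the real pass edits LiveRange::Segment objects owned by
// LiveIntervals in place.
#include <cstddef>
#include <vector>

struct Segment {
  unsigned Start, End; // half-open [Start, End), like LiveRange segments
};

// Stretch each segment to end where its successor begins, so the value
// looks live across the control flow separating its segments.
void flattenSegments(std::vector<Segment> &Segs) {
  for (std::size_t I = 0; I + 1 < Segs.size(); ++I)
    Segs[I].End = Segs[I + 1].Start;
}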
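// Stepping back to the FLOAT_MODE helpers added to SIDefines.h above, a
// small usage sketch: compose a mode byte that rounds to nearest in single
// precision, rounds toward zero in double precision, and flushes no
// denormals, then place it in the COMPUTE_PGM_RSRC1 field. FloatMode and
// Rsrc1 are illustrative names, and the header is assumed to be on the
// include path.
#include "SIDefines.h"

unsigned FloatMode = FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
                     FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO) |
                     FP_DENORM_MODE_SP(FP_DENORM_FLUSH_NONE) |
                     FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE);
unsigned Rsrc1 = S_00B848_FLOAT_MODE(FloatMode);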
+ +INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) + +char SIFixSGPRLiveRanges::ID = 0; + +char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID; + +FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { + return new SIFixSGPRLiveRanges(); +} + +bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( + MF.getTarget().getRegisterInfo()); + LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + MachineOperand *ExecUse = MI.findRegisterUseOperand(AMDGPU::EXEC); + if (ExecUse) + continue; + + for (const MachineOperand &Def : MI.operands()) { + if (!Def.isReg() || !Def.isDef() ||!TargetRegisterInfo::isVirtualRegister(Def.getReg())) + continue; + + const TargetRegisterClass *RC = MRI.getRegClass(Def.getReg()); + + if (!TRI->isSGPRClass(RC)) + continue; + LiveInterval &LI = LIS->getInterval(Def.getReg()); + for (unsigned i = 0, e = LI.size() - 1; i != e; ++i) { + LiveRange::Segment &Seg = LI.segments[i]; + LiveRange::Segment &Next = LI.segments[i + 1]; + Seg.end = Next.start; + } + } + } + } + + return false; +} diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index c9e247c..b13c3b8 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -14,8 +14,8 @@ #include "SIISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILIntrinsicInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/Function.h" +#include "llvm/ADT/SmallString.h" using namespace llvm; @@ -76,6 +77,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::ADD, MVT::i32, Legal); setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); + setOperationAction(ISD::SUBC, MVT::i32, Legal); + setOperationAction(ISD::SUBE, MVT::i32, Legal); // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, MVT::v2i32, Custom); @@ -88,14 +91,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : // We need to custom lower loads/stores from private memory setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::i64, Custom); setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::i64, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); @@ -105,18 +106,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, 
MVT::i32, Custom); - - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::SETCC, MVT::v4i1, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); @@ -139,6 +136,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); @@ -215,9 +213,16 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::FRINT, MVT::f64, Legal); } + // FIXME: These should be removed and handled the same way as f32 fneg. Source + // modifiers also work for the double instructions. + setOperationAction(ISD::FNEG, MVT::f64, Expand); + setOperationAction(ISD::FABS, MVT::f64, Expand); + setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::UINT_TO_FP); + setSchedulingPreference(Sched::RegPressure); } @@ -265,8 +270,12 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, return VT.bitsGT(MVT::i32); } -bool SITargetLowering::shouldSplitVectorType(EVT VT) const { - return VT.getScalarType().bitsLE(MVT::i16); +TargetLoweringBase::LegalizeTypeAction +SITargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) + return TypeSplitVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); } bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, @@ -482,19 +491,20 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI->eraseFromParent(); break; } - case AMDGPU::V_SUB_F64: - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), - MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()) - .addReg(MI->getOperand(2).getReg()) - .addImm(0) /* src2 */ - .addImm(0) /* ABS */ - .addImm(0) /* CLAMP */ - .addImm(0) /* OMOD */ - .addImm(2); /* NEG */ + case AMDGPU::V_SUB_F64: { + unsigned DestReg = MI->getOperand(0).getReg(); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg) + .addImm(0) // SRC0 modifiers + .addReg(MI->getOperand(1).getReg()) + .addImm(1) // SRC1 modifiers + .addReg(MI->getOperand(2).getReg()) + .addImm(0) // SRC2 modifiers + .addImm(0) // src2 + .addImm(0) // CLAMP + .addImm(0); // OMOD MI->eraseFromParent(); break; - + } case AMDGPU::SI_RegisterStorePseudo: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -595,27 +605,31 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: { LoadSDNode *Load = dyn_cast<LoadSDNode>(Op); + EVT VT = Op.getValueType(); + + // These loads are legal.
+ if (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + VT.isVector() && VT.getVectorNumElements() == 2 && + VT.getVectorElementType() == MVT::i32) + return SDValue(); + if (Op.getValueType().isVector() && (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS || (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && Op.getValueType().getVectorNumElements() > 4))) { - SDValue MergedValues[2] = { - SplitVectorLoad(Op, DAG), - Load->getChain() - }; - return DAG.getMergeValues(MergedValues, SDLoc(Op)); + return SplitVectorLoad(Op, DAG); } else { - return LowerLOAD(Op, DAG); + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; } } case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::ANY_EXTEND: // Fall-through - case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::INTRINSIC_WO_CHAIN: { unsigned IntrinsicID = @@ -827,13 +841,9 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *Load = cast<LoadSDNode>(Op); - SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); - SDValue MergedValues[2]; - MergedValues[1] = Load->getChain(); - if (Ret.getNode()) { - MergedValues[0] = Ret; - return DAG.getMergeValues(MergedValues, DL); - } + SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG); + if (Lowered.getNode()) + return Lowered; if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { return SDValue(); @@ -846,25 +856,38 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), DAG.getConstant(2, MVT::i32)); - Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Load->getChain(), Ptr, - DAG.getTargetConstant(0, MVT::i32), - Op.getOperand(2)); + + // FIXME: REGISTER_LOAD should probably have a chain result. + SDValue Chain = Load->getChain(); + SDValue LoLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, Ptr, + DAG.getTargetConstant(0, MVT::i32), + Op.getOperand(2)); + + SDValue Ret = LoLoad.getValue(0); if (MemVT.getSizeInBits() == 64) { + // TODO: This needs a test to make sure the right thing is happening with + // the chain. That is hard without general function support. 
+ SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, DAG.getConstant(1, MVT::i32)); - SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Load->getChain(), IncPtr, - DAG.getTargetConstant(0, MVT::i32), - Op.getOperand(2)); + SDValue HiLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, IncPtr, + DAG.getTargetConstant(0, MVT::i32), + Op.getOperand(2)); - Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper); + Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoLoad, HiLoad); + // Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + // LoLoad.getValue(1), HiLoad.getValue(1)); } - MergedValues[0] = Ret; - return DAG.getMergeValues(MergedValues, DL); + SDValue Ops[] = { + Ret, + Chain + }; + return DAG.getMergeValues(Ops, DL); } SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, @@ -903,39 +926,17 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); } -SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue True = Op.getOperand(2); - SDValue False = Op.getOperand(3); - SDValue CC = Op.getOperand(4); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - - SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); - return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); -} - -SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - - if (VT != MVT::i64) { - return SDValue(); - } - - SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0), - DAG.getConstant(31, MVT::i32)); - - return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi); -} - SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); StoreSDNode *Store = cast<StoreSDNode>(Op); EVT VT = Store->getMemoryVT(); + // These stores are legal. + if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + VT.isVector() && VT.getVectorNumElements() == 2 && + VT.getVectorElementType() == MVT::i32) + return SDValue(); + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); if (Ret.getNode()) return Ret; @@ -1011,27 +1012,99 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return Chain; } +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + EVT ScalarVT = VT.getScalarType(); + if (ScalarVT != MVT::f32) + return SDValue(); -SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); - if (VT != MVT::i64) { + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + // TODO: We could try to match extracting the higher bytes, which would be + // easier if i8 vectors weren't promoted to i32 vectors, particularly after + // types are legalized. v4i8 -> v4f32 is probably the only case to worry + // about in practice. 
+ if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { + if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); + DCI.AddToWorklist(Cvt.getNode()); + return Cvt; + } + } + + // We are primarily trying to catch operations on illegal vector types + // before they are expanded. + // For scalars, we can use the more flexible method of checking masked bits + // after legalization. + if (!DCI.isBeforeLegalize() || + !SrcVT.isVector() || + SrcVT.getVectorElementType() != MVT::i8) { return SDValue(); } - SDValue Src = Op.getOperand(0); - if (Src.getValueType() != MVT::i32) - Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); + assert(DCI.isBeforeLegalize() && "Unexpected legal type"); - SDValue Zero = DAG.getConstant(0, MVT::i32); - return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero); -} + // Weird sized vectors are a pain to handle, but we know 3 is really the same + // size as 4. + unsigned NElts = SrcVT.getVectorNumElements(); + if (!SrcVT.isSimple() && NElts != 3) + return SDValue(); -//===----------------------------------------------------------------------===// -// Custom DAG optimizations -//===----------------------------------------------------------------------===// + // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to + // prevent a mess from expanding to v4i32 and repacking. + if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); + EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); + + LoadSDNode *Load = cast<LoadSDNode>(Src); + SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, + Load->getChain(), + Load->getBasePtr(), + LoadVT, + Load->getMemOperand()); + + // Make sure successors of the original load stay after it by updating + // them to use the new Chain. 
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); + + SmallVector<SDValue, 4> Elts; + if (RegVT.isVector()) + DAG.ExtractVectorElements(NewLoad, Elts); + else + Elts.push_back(NewLoad); + + SmallVector<SDValue, 4> Ops; + + unsigned EltIdx = 0; + for (SDValue Elt : Elts) { + unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); + for (unsigned I = 0; I < ComponentsInElt; ++I) { + unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; + SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); + DCI.AddToWorklist(Cvt.getNode()); + Ops.push_back(Cvt); + } + + ++EltIdx; + } + + assert(Ops.size() == NElts); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); + } + + return SDValue(); +} SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -1074,6 +1147,31 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } break; } + + case AMDGPUISD::CVT_F32_UBYTE0: + case AMDGPUISD::CVT_F32_UBYTE1: + case AMDGPUISD::CVT_F32_UBYTE2: + case AMDGPUISD::CVT_F32_UBYTE3: { + unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; + + SDValue Src = N->getOperand(0); + APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(Src, Demanded) || + TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + break; + } + + case ISD::UINT_TO_FP: { + return performUCharToFloatCombine(N, DCI); + } } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); @@ -1297,7 +1395,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; bool HaveVSrc = false, HaveSSrc = false; - // First figure out what we alread have in this instruction + // First figure out what we already have in this instruction. for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; i != e && Op < NumOps; ++i, ++Op) { @@ -1316,7 +1414,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, } } - // If we neither have VSrc nor SSrc it makes no sense to continue + // If we neither have VSrc nor SSrc, it makes no sense to continue. if (!HaveVSrc && !HaveSSrc) return Node; @@ -1332,17 +1430,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, const SDValue &Operand = Node->getOperand(i); Ops.push_back(Operand); - // Already folded immediate ? + // Already folded immediate? if (isa<ConstantSDNode>(Operand.getNode()) || isa<ConstantFPSDNode>(Operand.getNode())) continue; - // Is this a VSrc or SSrc operand ? + // Is this a VSrc or SSrc operand? unsigned RegClass = Desc->OpInfo[Op].RegClass; if (isVSrc(RegClass) || isSSrc(RegClass)) { // Try to fold the immediates if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { - // Folding didn't worked, make sure we don't hit the SReg limit + // Folding didn't work, make sure we don't hit the SReg limit. 
ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); } continue; @@ -1371,7 +1469,6 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, continue; if (DescE64) { - // Test if it makes sense to switch to e64 encoding unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass)) @@ -1402,7 +1499,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, if (!DescE64) continue; Desc = DescE64; - DescE64 = 0; + DescE64 = nullptr; } else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) { Ops.pop_back(); @@ -1412,7 +1509,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, if (!DescE64) continue; Desc = DescE64; - DescE64 = 0; + DescE64 = nullptr; } } @@ -1535,7 +1632,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, } } -/// \brief Fold the instructions after slecting them +/// \brief Fold the instructions after selecting them. SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { const SIInstrInfo *TII = diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index c6eaa81..e25323a 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -27,10 +27,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; bool foldImm(SDValue &Operand, int32_t &Immediate, @@ -46,11 +43,16 @@ class SITargetLowering : public AMDGPUTargetLowering { void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const; + static SDValue performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI); + public: SITargetLowering(TargetMachine &tm); bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, bool *IsFast) const override; - bool shouldSplitVectorType(EVT VT) const override; + + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index a17fed7..1733326 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -341,6 +341,8 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { return Result; } +// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" +// around other non-memory instructions. 
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { bool Changes = false; diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index 168eff2..7cae9fc 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -51,6 +51,16 @@ class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> : let Size = 8; } +class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : + Enc64 <outs, ins, asm, pattern> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VOP3 = 1; +} + //===----------------------------------------------------------------------===// // Scalar operations //===----------------------------------------------------------------------===// @@ -207,7 +217,7 @@ class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : } class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc64 <outs, ins, asm, pattern> { + VOP3Common <outs, ins, asm, pattern> { bits<8> dst; bits<2> src0_modifiers; @@ -233,16 +243,11 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{61} = src0_modifiers{0}; let Inst{62} = src1_modifiers{0}; let Inst{63} = src2_modifiers{0}; - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOP3 = 1; + } class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc64 <outs, ins, asm, pattern> { + VOP3Common <outs, ins, asm, pattern> { bits<8> dst; bits<2> src0_modifiers; @@ -266,11 +271,6 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{62} = src1_modifiers{0}; let Inst{63} = src2_modifiers{0}; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOP3 = 1; } class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 4a9e346..455c890 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -19,13 +19,14 @@ #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" #include "llvm/MC/MCInstrDesc.h" using namespace llvm; -SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm) - : AMDGPUInstrInfo(tm), - RI(tm) { } +SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), + RI(st) { } //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -187,18 +188,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned KillFlag = isKill ? 
RegState::Kill : 0; - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) { - unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent()); + if (RI.hasVGPRs(RC)) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!"); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0) + .addReg(SrcReg); + } else if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) { + unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MF); + unsigned TgtReg = MFI->SpillTracker.LaneVGPR; - BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), MFI->SpillTracker.LaneVGPR) + BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), TgtReg) .addReg(SrcReg, KillFlag) .addImm(Lane); - MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane); + MFI->SpillTracker.addSpilledReg(FrameIndex, TgtReg, Lane); } else if (RI.isSGPRClass(RC)) { // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for vector @@ -207,8 +215,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // Reserve a spot in the spill tracker for each sub-register of // the vector register. unsigned NumSubRegs = RC->getSize() / 4; - unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent(), - NumSubRegs); + unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MF, NumSubRegs); MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, FirstLane); @@ -234,19 +241,19 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); DebugLoc DL = MBB.findDebugLoc(MI); - if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) { - SIMachineFunctionInfo::SpilledReg Spill = - MFI->SpillTracker.getSpilledReg(FrameIndex); - assert(Spill.VGPR); - BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg) - .addReg(Spill.VGPR) - .addImm(Spill.Lane); - insertNOPs(MI, 3); + + if (RI.hasVGPRs(RC)) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!"); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addImm(0); } else if (RI.isSGPRClass(RC)){ unsigned Opcode; switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; @@ -260,7 +267,6 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addReg(Spill.VGPR) .addImm(FrameIndex); - insertNOPs(MI, 3); } else { llvm_unreachable("VGPR spilling not supported"); } @@ -281,6 +287,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S64_RESTORE: return 2; + case AMDGPU::SI_SPILL_S32_RESTORE: + return 1; default: llvm_unreachable("Invalid spill opcode"); } } @@ -334,7 +342,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: - case 
AMDGPU::SI_SPILL_S64_RESTORE: { + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_S32_RESTORE: { unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { @@ -348,6 +357,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { .addReg(MI->getOperand(1).getReg()) .addImm(Spill.Lane + i); } + insertNOPs(MI, 3); MI->eraseFromParent(); break; } @@ -514,6 +524,23 @@ bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const { return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO); } +static bool compareMachineOp(const MachineOperand &Op0, + const MachineOperand &Op1) { + if (Op0.getType() != Op1.getType()) + return false; + + switch (Op0.getType()) { + case MachineOperand::MO_Register: + return Op0.getReg() == Op1.getReg(); + case MachineOperand::MO_Immediate: + return Op0.getImm() == Op1.getImm(); + case MachineOperand::MO_FPImmediate: + return Op0.getFPImm() == Op1.getFPImm(); + default: + llvm_unreachable("Didn't expect to be comparing these operand types"); + } +} + bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const { uint16_t Opcode = MI->getOpcode(); @@ -532,7 +559,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Make sure the register classes are correct for (unsigned i = 0, e = Desc.getNumOperands(); i != e; ++i) { switch (Desc.OpInfo[i].OperandType) { - case MCOI::OPERAND_REGISTER: + case MCOI::OPERAND_REGISTER: { + int RegClass = Desc.OpInfo[i].RegClass; + if (!RI.regClassCanUseImmediate(RegClass) && + (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm())) { + ErrInfo = "Expected register, but got immediate"; + return false; + } + } break; case MCOI::OPERAND_IMMEDIATE: if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) { @@ -620,6 +654,24 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, return false; } } + + // Verify misc. restrictions on specific instructions. 
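+  // V_DIV_SCALE_{F32|F64} pre-scale one side of a division; the ISA
+  // requires src0 to be a copy of either src1 or src2, which the check
+  // below enforces whenever all three sources are registers.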
+ if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || + Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { + MI->dump(); + + const MachineOperand &Src0 = MI->getOperand(2); + const MachineOperand &Src1 = MI->getOperand(3); + const MachineOperand &Src2 = MI->getOperand(4); + if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { + if (!compareMachineOp(Src0, Src1) && + !compareMachineOp(Src0, Src2)) { + ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; + return false; + } + } + } + return true; } @@ -654,7 +706,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; @@ -667,6 +721,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32; + case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; + case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; } } @@ -731,8 +788,8 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, unsigned SubReg = MRI.createVirtualRegister(SubRC); // Just in case the super register is itself a sub-register, copy it to a new - // value so we don't need to wory about merging its subreg index with the - // SubIdx passed to this function. The register coalescer should be able to + // value so we don't need to worry about merging its subreg index with the + // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY), NewSuperReg) @@ -1157,22 +1214,27 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { continue; } case AMDGPU::S_AND_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_AND_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); Inst->eraseFromParent(); continue; case AMDGPU::S_OR_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_OR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); Inst->eraseFromParent(); continue; case AMDGPU::S_XOR_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_XOR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); Inst->eraseFromParent(); continue; case AMDGPU::S_NOT_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_NOT_B32); + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_BCNT1_I32_B64: + splitScalar64BitBCNT(Worklist, Inst); Inst->eraseFromParent(); continue; @@ -1217,6 +1279,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // 3 to not hit an assertion later in MCInstLower. Inst->addOperand(MachineOperand::CreateImm(0)); Inst->addOperand(MachineOperand::CreateImm(0)); + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { + // The VALU version adds the second operand to the result, so insert an + // extra 0 operand. 
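+    // V_BCNT_U32_B32 computes ctpop(src0) + src1, so a zero src1 makes it
+    // behave exactly like the scalar S_BCNT1_I32_B32 it replaces.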
+ Inst->addOperand(MachineOperand::CreateImm(0)); } addDescImplicitUseDef(NewDesc, Inst); @@ -1297,9 +1363,62 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { return &AMDGPU::VReg_32RegClass; } -void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { +void SIInstrInfo::splitScalar64BitUnaryOp( + SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src0 = Inst->getOperand(1); + DebugLoc DL = Inst->getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const MCInstrDesc &InstDesc = get(Opcode); + const TargetRegisterClass *Src0RC = Src0.isReg() ? + MRI.getRegClass(Src0.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + + unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0); + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + + unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1); + + unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. + Worklist.push_back(LoHalf); + Worklist.push_back(HiHalf); +} + +void SIInstrInfo::splitScalar64BitBinaryOp( + SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { MachineBasicBlock &MBB = *Inst->getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -1360,6 +1479,46 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist, Worklist.push_back(HiHalf); } +void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + + const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32); + const TargetRegisterClass *SrcRC = Src.isReg() ? 
+ MRI.getRegClass(Src.getReg()) : + &AMDGPU::SGPR_32RegClass; + + unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); + + MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub0, SrcSubRC); + MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub1, SrcSubRC); + + MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg) + .addOperand(SrcRegSub0) + .addImm(0); + + MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg) + .addOperand(SrcRegSub1) + .addReg(MidReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + + Worklist.push_back(First); + Worklist.push_back(Second); +} + void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, MachineInstr *Inst) const { // Add the implict and explicit register definitions. diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 7b31a81..4c204d8 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -44,13 +44,19 @@ private: const TargetRegisterClass *RC, const MachineOperand &Op) const; - void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist, - MachineInstr *Inst, unsigned Opcode) const; + void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; public: - explicit SIInstrInfo(AMDGPUTargetMachine &tm); + explicit SIInstrInfo(const AMDGPUSubtarget &st); const SIRegisterInfo &getRegisterInfo() const override { return RI; diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 2242e6d..774c9d1 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -147,6 +147,12 @@ def FRAMEri32 : Operand<iPTR> { } //===----------------------------------------------------------------------===// +// Complex patterns +//===----------------------------------------------------------------------===// + +def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">; + +//===----------------------------------------------------------------------===// // SI assembler operands //===----------------------------------------------------------------------===// @@ -187,6 +193,12 @@ class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 < opName#" $dst, $src0", pattern >; +// 64-bit input, 32-bit output. 
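+// Used by scalar ops such as S_BCNT1_I32_B64 that reduce a 64-bit SGPR
+// pair down to a single 32-bit result.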
+class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 < + op, (outs SReg_32:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 < op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern @@ -260,7 +272,7 @@ class SIMCInstr <string pseudo, int subtarget> { multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern, string opName> { - def "" : InstSI <outs, ins, "", pattern>, VOP <opName>, + def "" : VOP3Common <outs, ins, "", pattern>, VOP <opName>, SIMCInstr<OpName, SISubtarget.NONE> { let isPseudo = 1; } @@ -357,12 +369,13 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern, } multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, - string opName, ValueType vt, PatLeaf cond> { - + string opName, ValueType vt, PatLeaf cond, bit defExec = 0> { def _e32 : VOPC < op, (ins arc:$src0, vrc:$src1), opName#"_e32 $dst, $src0, $src1", [] - >, VOP <opName>; + >, VOP <opName> { + let Defs = !if(defExec, [VCC, EXEC], [VCC]); + } def _e64 : VOP3 < {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, @@ -375,6 +388,7 @@ multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))] ) >, VOP <opName> { + let Defs = !if(defExec, [EXEC], []); let src2 = SIOperand.ZERO; let src2_modifiers = 0; } @@ -388,6 +402,14 @@ multiclass VOPC_64 <bits<8> op, string opName, ValueType vt = untyped, PatLeaf cond = COND_NULL> : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>; +multiclass VOPCX_32 <bits<8> op, string opName, + ValueType vt = untyped, PatLeaf cond = COND_NULL> + : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond, 1>; + +multiclass VOPCX_64 <bits<8> op, string opName, + ValueType vt = untyped, PatLeaf cond = COND_NULL> + : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond, 1>; + multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m < op, (outs VReg_32:$dst), (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers, @@ -396,7 +418,7 @@ multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m < opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName >; -class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 < +class VOP3_64_32 <bits <9> op, string opName, list<dag> pattern> : VOP3 < op, (outs VReg_64:$dst), (ins VSrc_64:$src0, VSrc_32:$src1), opName#" $dst, $src0, $src1", pattern @@ -410,11 +432,29 @@ class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 < class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 < op, (outs VReg_64:$dst), - (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2, + (ins InputMods:$src0_modifiers, VSrc_64:$src0, + InputMods:$src1_modifiers, VSrc_64:$src1, + InputMods:$src2_modifiers, VSrc_64:$src2, + InstFlag:$clamp, InstFlag:$omod), + opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern +>, VOP <opName>; + + +class VOP3b_Helper <bits<9> op, RegisterClass vrc, RegisterClass arc, + string opName, list<dag> pattern> : VOP3 < + op, (outs vrc:$dst0, SReg_64:$dst1), + (ins arc:$src0, arc:$src1, arc:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), - opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern + opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern >, VOP 
<opName>; + +class VOP3b_64 <bits<9> op, string opName, list<dag> pattern> : + VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>; + +class VOP3b_32 <bits<9> op, string opName, list<dag> pattern> : + VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>; + //===----------------------------------------------------------------------===// // Vector I/O classes //===----------------------------------------------------------------------===// @@ -475,10 +515,11 @@ class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A let vdst = 0; } +// 1 address, 1 data. class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A < op, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, u16imm:$offset), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset), asm#" $vdst, $addr, $data0, $offset, [M0]", []> { @@ -487,6 +528,41 @@ class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A < let mayLoad = 1; } +// 1 address, 2 data. +class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A < + op, + (outs rc:$vdst), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset), + asm#" $vdst, $addr, $data0, $data1, $offset, [M0]", + []> { + let mayStore = 1; + let mayLoad = 1; +} + +// 1 address, 2 data. +class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A < + op, + (outs), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset), + asm#" $addr, $data0, $data1, $offset, [M0]", + []> { + let mayStore = 1; + let mayLoad = 1; +} + +// 1 address, 1 data. +class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A < + op, + (outs), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset), + asm#" $addr, $data0, $offset, [M0]", + []> { + + let data1 = 0; + let mayStore = 1; + let mayLoad = 1; +} + class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < op, (outs), @@ -500,7 +576,9 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU let mayLoad = 0; } -multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> { +multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass, + ValueType load_vt = i32, + SDPatternOperator ld = null_frag> { let lds = 0, mayLoad = 1 in { @@ -542,16 +620,19 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> { let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in { def _ADDR64 : MUBUF <op, (outs regClass:$vdata), (ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset), - asm#" $vdata, $srsrc + $vaddr + $offset", []>; + asm#" $vdata, $srsrc + $vaddr + $offset", + [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, + i64:$vaddr, u16imm:$offset)))]>; } } } -class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : +class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass, + ValueType store_vt, SDPatternOperator st> : MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset), name#" $vdata, $srsrc + $vaddr + $offset", - []> { + [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> { let mayLoad = 0; let mayStore = 1; @@ -658,6 +739,53 @@ multiclass MIMG_Sampler <bits<7> op, string asm> { defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>; } +class MIMG_Gather_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, 
i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc, SReg_128:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", + []> { + let mayLoad = 1; + let mayStore = 0; + + // DMASK was repurposed for GATHER4. 4 components are always + // returned and DMASK works like a swizzle - it selects + // the component to fetch. The only useful DMASK values are + // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns + // (red,red,red,red) etc.) The ISA document doesn't mention + // this. + // Therefore, disable all code which updates DMASK by setting these two: + let MIMG = 0; + let hasPostISelHook = 0; +} + +multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, + RegisterClass dst_rc, + int channels> { + def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_32>, + MIMG_Mask<asm#"_V1", channels>; + def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>, + MIMG_Mask<asm#"_V2", channels>; + def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128>, + MIMG_Mask<asm#"_V4", channels>; + def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256>, + MIMG_Mask<asm#"_V8", channels>; + def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512>, + MIMG_Mask<asm#"_V16", channels>; +} + +multiclass MIMG_Gather <bits<7> op, string asm> { + defm _V1 : MIMG_Gather_Src_Helper<op, asm, VReg_32, 1>; + defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>; + defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>; + defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>; +} + //===----------------------------------------------------------------------===// // Vector instruction mappings //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 500fa78..b3b44e2 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -96,22 +96,35 @@ def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", [(set i32:$dst, (not i32:$src0))] >; -def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>; +def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", + [(set i64:$dst, (not i64:$src0))] +>; def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>; def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>; -def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>; +def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", + [(set i32:$dst, (AMDGPUbrev i32:$src0))] +>; def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; } // End neverHasSideEffects = 1 ////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>; ////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>; -////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>; -////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>; -////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>; +def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32", + [(set i32:$dst, (ctpop i32:$src0))] +>; +def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>; + +////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "S_FF0_I32_B32", []>; ////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>; -////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>; +def S_FF1_I32_B32 : SOP1_32 <0x00000013, "S_FF1_I32_B32", + [(set i32:$dst, (cttz_zero_undef i32:$src0))] +>; ////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>; -//def S_FLBIT_I32_B32 : SOP1_32 
<0x00000015, "S_FLBIT_I32_B32", []>; + +def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", + [(set i32:$dst, (ctlz_zero_undef i32:$src0))] +>; + //def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>; def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>; //def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>; @@ -320,7 +333,7 @@ def S_CMPK_EQ_I32 : SOPK < >; */ -let isCompare = 1 in { +let isCompare = 1, Defs = [SCC] in { def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>; def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>; def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>; @@ -332,7 +345,7 @@ def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>; def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>; def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>; def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>; -} // End isCompare = 1 +} // End isCompare = 1, Defs = [SCC] let Defs = [SCC], isCommutable = 1 in { def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>; @@ -467,26 +480,26 @@ defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>; defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">; defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32">; -defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32">; -defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32">; -defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32">; -defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32">; -defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32">; -defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32">; -defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32">; -defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32">; -defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32">; -defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32">; -defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32">; -defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32">; -defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32">; -defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32">; -defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32">; +defm V_CMPX_F_F32 : VOPCX_32 <0x00000010, "V_CMPX_F_F32">; +defm V_CMPX_LT_F32 : VOPCX_32 <0x00000011, "V_CMPX_LT_F32">; +defm V_CMPX_EQ_F32 : VOPCX_32 <0x00000012, "V_CMPX_EQ_F32">; +defm V_CMPX_LE_F32 : VOPCX_32 <0x00000013, "V_CMPX_LE_F32">; +defm V_CMPX_GT_F32 : VOPCX_32 <0x00000014, "V_CMPX_GT_F32">; +defm V_CMPX_LG_F32 : VOPCX_32 <0x00000015, "V_CMPX_LG_F32">; +defm V_CMPX_GE_F32 : VOPCX_32 <0x00000016, "V_CMPX_GE_F32">; +defm V_CMPX_O_F32 : VOPCX_32 <0x00000017, "V_CMPX_O_F32">; +defm V_CMPX_U_F32 : VOPCX_32 <0x00000018, "V_CMPX_U_F32">; +defm V_CMPX_NGE_F32 : VOPCX_32 <0x00000019, "V_CMPX_NGE_F32">; +defm V_CMPX_NLG_F32 : VOPCX_32 <0x0000001a, "V_CMPX_NLG_F32">; +defm V_CMPX_NGT_F32 : VOPCX_32 <0x0000001b, "V_CMPX_NGT_F32">; +defm V_CMPX_NLE_F32 : VOPCX_32 <0x0000001c, "V_CMPX_NLE_F32">; +defm V_CMPX_NEQ_F32 : VOPCX_32 <0x0000001d, "V_CMPX_NEQ_F32">; +defm V_CMPX_NLT_F32 : VOPCX_32 <0x0000001e, "V_CMPX_NLT_F32">; +defm V_CMPX_TRU_F32 : VOPCX_32 <0x0000001f, "V_CMPX_TRU_F32">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">; defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, 
COND_OLT>; @@ -505,26 +518,26 @@ defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>; defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">; defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64">; -defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64">; -defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64">; -defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64">; -defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64">; -defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64">; -defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64">; -defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64">; -defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64">; -defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64">; -defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64">; -defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64">; -defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64">; -defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64">; -defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64">; -defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64">; +defm V_CMPX_F_F64 : VOPCX_64 <0x00000030, "V_CMPX_F_F64">; +defm V_CMPX_LT_F64 : VOPCX_64 <0x00000031, "V_CMPX_LT_F64">; +defm V_CMPX_EQ_F64 : VOPCX_64 <0x00000032, "V_CMPX_EQ_F64">; +defm V_CMPX_LE_F64 : VOPCX_64 <0x00000033, "V_CMPX_LE_F64">; +defm V_CMPX_GT_F64 : VOPCX_64 <0x00000034, "V_CMPX_GT_F64">; +defm V_CMPX_LG_F64 : VOPCX_64 <0x00000035, "V_CMPX_LG_F64">; +defm V_CMPX_GE_F64 : VOPCX_64 <0x00000036, "V_CMPX_GE_F64">; +defm V_CMPX_O_F64 : VOPCX_64 <0x00000037, "V_CMPX_O_F64">; +defm V_CMPX_U_F64 : VOPCX_64 <0x00000038, "V_CMPX_U_F64">; +defm V_CMPX_NGE_F64 : VOPCX_64 <0x00000039, "V_CMPX_NGE_F64">; +defm V_CMPX_NLG_F64 : VOPCX_64 <0x0000003a, "V_CMPX_NLG_F64">; +defm V_CMPX_NGT_F64 : VOPCX_64 <0x0000003b, "V_CMPX_NGT_F64">; +defm V_CMPX_NLE_F64 : VOPCX_64 <0x0000003c, "V_CMPX_NLE_F64">; +defm V_CMPX_NEQ_F64 : VOPCX_64 <0x0000003d, "V_CMPX_NEQ_F64">; +defm V_CMPX_NLT_F64 : VOPCX_64 <0x0000003e, "V_CMPX_NLT_F64">; +defm V_CMPX_TRU_F64 : VOPCX_64 <0x0000003f, "V_CMPX_TRU_F64">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">; defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">; @@ -543,26 +556,26 @@ defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">; defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">; defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32">; -defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32">; -defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32">; -defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32">; -defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32">; -defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32">; -defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32">; -defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32">; -defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32">; -defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32">; -defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32">; -defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32">; -defm V_CMPSX_NLE_F32 : VOPC_32 
<0x0000005c, "V_CMPSX_NLE_F32">; -defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32">; -defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32">; -defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32">; +defm V_CMPSX_F_F32 : VOPCX_32 <0x00000050, "V_CMPSX_F_F32">; +defm V_CMPSX_LT_F32 : VOPCX_32 <0x00000051, "V_CMPSX_LT_F32">; +defm V_CMPSX_EQ_F32 : VOPCX_32 <0x00000052, "V_CMPSX_EQ_F32">; +defm V_CMPSX_LE_F32 : VOPCX_32 <0x00000053, "V_CMPSX_LE_F32">; +defm V_CMPSX_GT_F32 : VOPCX_32 <0x00000054, "V_CMPSX_GT_F32">; +defm V_CMPSX_LG_F32 : VOPCX_32 <0x00000055, "V_CMPSX_LG_F32">; +defm V_CMPSX_GE_F32 : VOPCX_32 <0x00000056, "V_CMPSX_GE_F32">; +defm V_CMPSX_O_F32 : VOPCX_32 <0x00000057, "V_CMPSX_O_F32">; +defm V_CMPSX_U_F32 : VOPCX_32 <0x00000058, "V_CMPSX_U_F32">; +defm V_CMPSX_NGE_F32 : VOPCX_32 <0x00000059, "V_CMPSX_NGE_F32">; +defm V_CMPSX_NLG_F32 : VOPCX_32 <0x0000005a, "V_CMPSX_NLG_F32">; +defm V_CMPSX_NGT_F32 : VOPCX_32 <0x0000005b, "V_CMPSX_NGT_F32">; +defm V_CMPSX_NLE_F32 : VOPCX_32 <0x0000005c, "V_CMPSX_NLE_F32">; +defm V_CMPSX_NEQ_F32 : VOPCX_32 <0x0000005d, "V_CMPSX_NEQ_F32">; +defm V_CMPSX_NLT_F32 : VOPCX_32 <0x0000005e, "V_CMPSX_NLT_F32">; +defm V_CMPSX_TRU_F32 : VOPCX_32 <0x0000005f, "V_CMPSX_TRU_F32">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">; defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">; @@ -611,18 +624,18 @@ defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>; defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>; defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32">; -defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32">; -defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32">; -defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32">; -defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32">; -defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32">; -defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32">; -defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32">; +defm V_CMPX_F_I32 : VOPCX_32 <0x00000090, "V_CMPX_F_I32">; +defm V_CMPX_LT_I32 : VOPCX_32 <0x00000091, "V_CMPX_LT_I32">; +defm V_CMPX_EQ_I32 : VOPCX_32 <0x00000092, "V_CMPX_EQ_I32">; +defm V_CMPX_LE_I32 : VOPCX_32 <0x00000093, "V_CMPX_LE_I32">; +defm V_CMPX_GT_I32 : VOPCX_32 <0x00000094, "V_CMPX_GT_I32">; +defm V_CMPX_NE_I32 : VOPCX_32 <0x00000095, "V_CMPX_NE_I32">; +defm V_CMPX_GE_I32 : VOPCX_32 <0x00000096, "V_CMPX_GE_I32">; +defm V_CMPX_T_I32 : VOPCX_32 <0x00000097, "V_CMPX_T_I32">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">; defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>; @@ -633,18 +646,18 @@ defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>; defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>; defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64">; -defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64">; -defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64">; -defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64">; -defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64">; -defm V_CMPX_NE_I64 : VOPC_64 
<0x000000b5, "V_CMPX_NE_I64">; -defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64">; -defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64">; +defm V_CMPX_F_I64 : VOPCX_64 <0x000000b0, "V_CMPX_F_I64">; +defm V_CMPX_LT_I64 : VOPCX_64 <0x000000b1, "V_CMPX_LT_I64">; +defm V_CMPX_EQ_I64 : VOPCX_64 <0x000000b2, "V_CMPX_EQ_I64">; +defm V_CMPX_LE_I64 : VOPCX_64 <0x000000b3, "V_CMPX_LE_I64">; +defm V_CMPX_GT_I64 : VOPCX_64 <0x000000b4, "V_CMPX_GT_I64">; +defm V_CMPX_NE_I64 : VOPCX_64 <0x000000b5, "V_CMPX_NE_I64">; +defm V_CMPX_GE_I64 : VOPCX_64 <0x000000b6, "V_CMPX_GE_I64">; +defm V_CMPX_T_I64 : VOPCX_64 <0x000000b7, "V_CMPX_T_I64">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">; defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>; @@ -655,18 +668,18 @@ defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>; defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>; defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32">; -defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32">; -defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32">; -defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32">; -defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32">; -defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32">; -defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32">; -defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32">; +defm V_CMPX_F_U32 : VOPCX_32 <0x000000d0, "V_CMPX_F_U32">; +defm V_CMPX_LT_U32 : VOPCX_32 <0x000000d1, "V_CMPX_LT_U32">; +defm V_CMPX_EQ_U32 : VOPCX_32 <0x000000d2, "V_CMPX_EQ_U32">; +defm V_CMPX_LE_U32 : VOPCX_32 <0x000000d3, "V_CMPX_LE_U32">; +defm V_CMPX_GT_U32 : VOPCX_32 <0x000000d4, "V_CMPX_GT_U32">; +defm V_CMPX_NE_U32 : VOPCX_32 <0x000000d5, "V_CMPX_NE_U32">; +defm V_CMPX_GE_U32 : VOPCX_32 <0x000000d6, "V_CMPX_GE_U32">; +defm V_CMPX_T_U32 : VOPCX_32 <0x000000d7, "V_CMPX_T_U32">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">; defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>; @@ -677,30 +690,30 @@ defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>; defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>; defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64">; -defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64">; -defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64">; -defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64">; -defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64">; -defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64">; -defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64">; -defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64">; +defm V_CMPX_F_U64 : VOPCX_64 <0x000000f0, "V_CMPX_F_U64">; +defm V_CMPX_LT_U64 : VOPCX_64 <0x000000f1, "V_CMPX_LT_U64">; +defm V_CMPX_EQ_U64 : VOPCX_64 <0x000000f2, "V_CMPX_EQ_U64">; +defm V_CMPX_LE_U64 : VOPCX_64 <0x000000f3, "V_CMPX_LE_U64">; +defm V_CMPX_GT_U64 : VOPCX_64 <0x000000f4, "V_CMPX_GT_U64">; +defm V_CMPX_NE_U64 : VOPCX_64 <0x000000f5, "V_CMPX_NE_U64">; +defm V_CMPX_GE_U64 : VOPCX_64 <0x000000f6, "V_CMPX_GE_U64">; +defm V_CMPX_T_U64 : 
VOPCX_64 <0x000000f7, "V_CMPX_T_U64">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">; -let hasSideEffects = 1, Defs = [EXEC] in { -defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32">; -} // End hasSideEffects = 1, Defs = [EXEC] +let hasSideEffects = 1 in { +defm V_CMPX_CLASS_F32 : VOPCX_32 <0x00000098, "V_CMPX_CLASS_F32">; +} // End hasSideEffects = 1 defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">; -let hasSideEffects = 1, Defs = [EXEC] in { -defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; -} // End hasSideEffects = 1, Defs = [EXEC] +let hasSideEffects = 1 in { +defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">; +} // End hasSideEffects = 1 } // End isCompare = 1 @@ -708,8 +721,97 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; // DS Instructions //===----------------------------------------------------------------------===// -def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>; -def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>; + +def DS_ADD_U32 : DS_1A1D_NORET <0x0, "DS_ADD_U32", VReg_32>; +def DS_SUB_U32 : DS_1A1D_NORET <0x1, "DS_SUB_U32", VReg_32>; +def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "DS_RSUB_U32", VReg_32>; +def DS_INC_U32 : DS_1A1D_NORET <0x3, "DS_INC_U32", VReg_32>; +def DS_DEC_U32 : DS_1A1D_NORET <0x4, "DS_DEC_U32", VReg_32>; +def DS_MIN_I32 : DS_1A1D_NORET <0x5, "DS_MIN_I32", VReg_32>; +def DS_MAX_I32 : DS_1A1D_NORET <0x6, "DS_MAX_I32", VReg_32>; +def DS_MIN_U32 : DS_1A1D_NORET <0x7, "DS_MIN_U32", VReg_32>; +def DS_MAX_U32 : DS_1A1D_NORET <0x8, "DS_MAX_U32", VReg_32>; +def DS_AND_B32 : DS_1A1D_NORET <0x9, "DS_AND_B32", VReg_32>; +def DS_OR_B32 : DS_1A1D_NORET <0xa, "DS_OR_B32", VReg_32>; +def DS_XOR_B32 : DS_1A1D_NORET <0xb, "DS_XOR_B32", VReg_32>; +def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "DS_MSKOR_B32", VReg_32>; +def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "DS_CMPST_B32", VReg_32>; +def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "DS_CMPST_F32", VReg_32>; +def DS_MIN_F32 : DS_1A1D_NORET <0x12, "DS_MIN_F32", VReg_32>; +def DS_MAX_F32 : DS_1A1D_NORET <0x13, "DS_MAX_F32", VReg_32>; + +def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "DS_ADD_RTN_U32", VReg_32>; +def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "DS_SUB_RTN_U32", VReg_32>; +def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "DS_RSUB_RTN_U32", VReg_32>; +def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "DS_INC_RTN_U32", VReg_32>; +def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "DS_DEC_RTN_U32", VReg_32>; +def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "DS_MIN_RTN_I32", VReg_32>; +def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "DS_MAX_RTN_I32", VReg_32>; +def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "DS_MIN_RTN_U32", VReg_32>; +def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "DS_MAX_RTN_U32", VReg_32>; +def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "DS_AND_RTN_B32", VReg_32>; +def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "DS_OR_RTN_B32", VReg_32>; +def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "DS_XOR_RTN_B32", VReg_32>; +def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "DS_MSKOR_RTN_B32", VReg_32>; +def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "DS_WRXCHG_RTN_B32", VReg_32>; +//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "DS_WRXCHG2_RTN_B32", VReg_32>; +//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "DS_WRXCHG2_RTN_B32", VReg_32>; +def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "DS_CMPST_RTN_B32", VReg_32>; +def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "DS_CMPST_RTN_F32", VReg_32>; +def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, 
"DS_MIN_RTN_F32", VReg_32>; +def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "DS_MAX_RTN_F32", VReg_32>; + +let SubtargetPredicate = isCI in { +def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "DS_WRAP_RTN_F32", VReg_32>; +} // End isCI + + +def DS_ADD_U64 : DS_1A1D_NORET <0x40, "DS_ADD_U64", VReg_32>; +def DS_SUB_U64 : DS_1A1D_NORET <0x41, "DS_SUB_U64", VReg_32>; +def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "DS_RSUB_U64", VReg_32>; +def DS_INC_U64 : DS_1A1D_NORET <0x43, "DS_INC_U64", VReg_32>; +def DS_DEC_U64 : DS_1A1D_NORET <0x44, "DS_DEC_U64", VReg_32>; +def DS_MIN_I64 : DS_1A1D_NORET <0x45, "DS_MIN_I64", VReg_64>; +def DS_MAX_I64 : DS_1A1D_NORET <0x46, "DS_MAX_I64", VReg_64>; +def DS_MIN_U64 : DS_1A1D_NORET <0x47, "DS_MIN_U64", VReg_64>; +def DS_MAX_U64 : DS_1A1D_NORET <0x48, "DS_MAX_U64", VReg_64>; +def DS_AND_B64 : DS_1A1D_NORET <0x49, "DS_AND_B64", VReg_64>; +def DS_OR_B64 : DS_1A1D_NORET <0x4a, "DS_OR_B64", VReg_64>; +def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "DS_XOR_B64", VReg_64>; +def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "DS_MSKOR_B64", VReg_64>; +def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "DS_CMPST_B64", VReg_64>; +def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "DS_CMPST_F64", VReg_64>; +def DS_MIN_F64 : DS_1A1D_NORET <0x52, "DS_MIN_F64", VReg_64>; +def DS_MAX_F64 : DS_1A1D_NORET <0x53, "DS_MAX_F64", VReg_64>; + +def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "DS_ADD_RTN_U64", VReg_64>; +def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "DS_SUB_RTN_U64", VReg_64>; +def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "DS_RSUB_RTN_U64", VReg_64>; +def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "DS_INC_RTN_U64", VReg_64>; +def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "DS_DEC_RTN_U64", VReg_64>; +def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "DS_MIN_RTN_I64", VReg_64>; +def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "DS_MAX_RTN_I64", VReg_64>; +def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "DS_MIN_RTN_U64", VReg_64>; +def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "DS_MAX_RTN_U64", VReg_64>; +def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "DS_AND_RTN_B64", VReg_64>; +def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "DS_OR_RTN_B64", VReg_64>; +def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "DS_XOR_RTN_B64", VReg_64>; +def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "DS_MSKOR_RTN_B64", VReg_64>; +def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "DS_WRXCHG_RTN_B64", VReg_64>; +//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "DS_WRXCHG2_RTN_B64", VReg_64>; +//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "DS_WRXCHG2_RTN_B64", VReg_64>; +def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "DS_CMPST_RTN_B64", VReg_64>; +def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "DS_CMPST_RTN_F64", VReg_64>; +def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "DS_MIN_F64", VReg_64>; +def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_F64", VReg_64>; + +//let SubtargetPredicate = isCI in { +// DS_CONDXCHG32_RTN_B64 +// DS_CONDXCHG32_RTN_B128 +//} // End isCI + +// TODO: _SRC2_* forms + def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>; def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>; def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>; @@ -744,32 +846,46 @@ defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMA //def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>; //def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>; //def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>; -defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <0x00000008, "BUFFER_LOAD_UBYTE", VReg_32>; -defm BUFFER_LOAD_SBYTE : 
MUBUF_Load_Helper <0x00000009, "BUFFER_LOAD_SBYTE", VReg_32>; -defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <0x0000000a, "BUFFER_LOAD_USHORT", VReg_32>; -defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32>; -defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>; +defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < + 0x00000008, "BUFFER_LOAD_UBYTE", VReg_32, i32, az_extloadi8_global +>; +defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < + 0x00000009, "BUFFER_LOAD_SBYTE", VReg_32, i32, sextloadi8_global +>; +defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < + 0x0000000a, "BUFFER_LOAD_USHORT", VReg_32, i32, az_extloadi16_global +>; +defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < + 0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32, i32, sextloadi16_global +>; +defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < + 0x0000000c, "BUFFER_LOAD_DWORD", VReg_32, i32, global_load +>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < + 0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64, v2i32, global_load +>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < + 0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load +>; def BUFFER_STORE_BYTE : MUBUF_Store_Helper < - 0x00000018, "BUFFER_STORE_BYTE", VReg_32 + 0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global >; def BUFFER_STORE_SHORT : MUBUF_Store_Helper < - 0x0000001a, "BUFFER_STORE_SHORT", VReg_32 + 0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global >; def BUFFER_STORE_DWORD : MUBUF_Store_Helper < - 0x0000001c, "BUFFER_STORE_DWORD", VReg_32 + 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store >; def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < - 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64 + 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store >; def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < - 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128 + 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store >; //def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; //def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>; @@ -885,31 +1001,31 @@ defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">; //def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>; //def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>; //def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>; -//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>; -//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>; -//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>; -//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>; -//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>; -//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>; -//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>; -//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>; -//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>; -//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>; -//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 
0x0000004e>; -//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>; -//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>; -//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>; -//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>; -//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>; -//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>; -//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>; -//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>; -//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>; -//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>; -//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>; -//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>; -//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>; -//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>; +defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "IMAGE_GATHER4">; +defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "IMAGE_GATHER4_CL">; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "IMAGE_GATHER4_L">; +defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "IMAGE_GATHER4_B">; +defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "IMAGE_GATHER4_B_CL">; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "IMAGE_GATHER4_LZ">; +defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "IMAGE_GATHER4_C">; +defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "IMAGE_GATHER4_C_CL">; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "IMAGE_GATHER4_C_L">; +defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "IMAGE_GATHER4_C_B">; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "IMAGE_GATHER4_C_B_CL">; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "IMAGE_GATHER4_C_LZ">; +defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "IMAGE_GATHER4_O">; +defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "IMAGE_GATHER4_CL_O">; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "IMAGE_GATHER4_L_O">; +defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "IMAGE_GATHER4_B_O">; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "IMAGE_GATHER4_B_CL_O">; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "IMAGE_GATHER4_LZ_O">; +defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "IMAGE_GATHER4_C_O">; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "IMAGE_GATHER4_C_CL_O">; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "IMAGE_GATHER4_C_L_O">; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "IMAGE_GATHER4_C_B_O">; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "IMAGE_GATHER4_C_B_CL_O">; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "IMAGE_GATHER4_C_LZ_O">; +defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "IMAGE_GET_LOD">; //def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>; //def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>; //def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>; @@ -962,8 +1078,12 @@ defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", [(set i32:$dst, (fp_to_sint f32:$src0))] >; defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; -////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>; 
-//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>;
+defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32",
+  [(set i32:$dst, (f32_to_f16 f32:$src0))]
+>;
+defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16",
+  [(set f32:$dst, (f16_to_f32 i32:$src0))]
+>;
 //defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
 //defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
 //defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
@@ -973,10 +1093,18 @@ defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64",
 defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
   [(set f64:$dst, (fextend f32:$src0))]
 >;
-//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
-//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
-//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
-//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>;
+defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte0 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte1 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte2 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte3 i32:$src0))]
+>;
 defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64",
   [(set i32:$dst, (fp_to_uint f64:$src0))]
 >;
@@ -988,7 +1116,7 @@ defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
   [(set f32:$dst, (AMDGPUfract f32:$src0))]
 >;
 defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32",
-  [(set f32:$dst, (int_AMDGPU_trunc f32:$src0))]
+  [(set f32:$dst, (ftrunc f32:$src0))]
 >;
 defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32",
   [(set f32:$dst, (fceil f32:$src0))]
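The hunks above replace stubbed-out conversions with selectable patterns: V_TRUNC_F32 now matches the generic ftrunc node, the f16 conversions carry the IEEE binary16 bit pattern in an i32 (hence the i32 operand types), and AMDGPUcvt_f32_ubyteN extracts byte N of an i32 and converts it to float. A minimal C++ model of the ubyte semantics (illustrative only, not the backend code):

#include <cstdint>

// Model of AMDGPUcvt_f32_ubyteN (N = 0..3): take byte N of the 32-bit
// source and convert it to float, e.g. when unpacking packed RGBA8 data.
static float cvt_f32_ubyte(uint32_t src, unsigned n) {
  return static_cast<float>((src >> (8u * n)) & 0xffu);
}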
@@ -1006,24 +1134,33 @@ defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
 defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32",
   [(set f32:$dst, (flog2 f32:$src0))]
 >;
+
 defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
 defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
 defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
-  [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
+  [(set f32:$dst, (AMDGPUrcp f32:$src0))]
 >;
 defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
-defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
+defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32",
+  [(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))]
+>;
 defm V_RSQ_LEGACY_F32 : VOP1_32 <
   0x0000002d, "V_RSQ_LEGACY_F32",
-  [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
+  [(set f32:$dst, (AMDGPUrsq_legacy f32:$src0))]
+>;
+defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32",
+  [(set f32:$dst, (AMDGPUrsq f32:$src0))]
 >;
-defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>;
 defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64",
-  [(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
+  [(set f64:$dst, (AMDGPUrcp f64:$src0))]
 >;
 defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
-defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>;
-defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>;
+defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64",
+  [(set f64:$dst, (AMDGPUrsq f64:$src0))]
+>;
+defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64",
+  [(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))]
+>;
 defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32",
   [(set f32:$dst, (fsqrt f32:$src0))]
 >;
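The reciprocal family now matches dedicated target nodes instead of the old fdiv-by-one and intrinsic forms. As scalar stand-ins (a sketch only; the hardware opcodes compute approximations, and the _CLAMP variants are understood to clamp the result to the largest finite value rather than returning infinity):

#include <cmath>

static float rcp_f32(float x) { return 1.0f / x; }             // AMDGPUrcp
static float rsq_f32(float x) { return 1.0f / std::sqrt(x); }  // AMDGPUrsq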
@@ -1211,7 +1348,7 @@ defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32",
 defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
 defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
 defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
-//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
+defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
 defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
 defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
@@ -1303,16 +1440,20 @@ defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
 //def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
 defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
 ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
-defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
-def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
+defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32",
+  [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64",
+  [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))]
+>;
 
-def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64",
+def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64",
   [(set i64:$dst, (shl i64:$src0, i32:$src1))]
 >;
-def V_LSHR_B64 : VOP3_64_Shift <0x00000162, "V_LSHR_B64",
+def V_LSHR_B64 : VOP3_64_32 <0x00000162, "V_LSHR_B64",
   [(set i64:$dst, (srl i64:$src0, i32:$src1))]
 >;
-def V_ASHR_I64 : VOP3_64_Shift <0x00000163, "V_ASHR_I64",
+def V_ASHR_I64 : VOP3_64_32 <0x00000163, "V_ASHR_I64",
   [(set i64:$dst, (sra i64:$src0, i32:$src1))]
 >;
@@ -1336,14 +1477,23 @@ defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
 
 } // isCommutable = 1
 
-defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
-def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
-defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
-def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
+def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
+
+// Double precision division pre-scale.
+def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
+
+defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32",
+  [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64",
+  [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))]
+>;
 //def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
 //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
 //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
-def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
+def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64",
+  [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))]
+>;
 
 //===----------------------------------------------------------------------===//
 // Pseudo Instructions
@@ -1500,7 +1650,7 @@ let usesCustomInserter = 1 in {
 // constant that can be used with the ADDR64 MUBUF instructions.
 def SI_ADDR64_RSRC : InstSI <
   (outs SReg_128:$srsrc),
-  (ins SReg_64:$ptr),
+  (ins SSrc_64:$ptr),
   "",
   []
 >;
@@ -1508,7 +1658,7 @@ def V_SUB_F64 : InstSI <
   (outs VReg_64:$dst),
   (ins VReg_64:$src0, VReg_64:$src1),
   "V_SUB_F64 $dst, $src0, $src1",
-  []
+  [(set f64:$dst, (fsub f64:$src0, f64:$src1))]
 >;
 
 } // end usesCustomInserter
@@ -1529,6 +1679,7 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
 }
 
+defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
 defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
 defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
 defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
@@ -1552,7 +1703,7 @@ def : Pat <
 
 /* int_SI_vs_load_input */
 def : Pat<
-  (SIload_input v4i32:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
+  (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
   (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0)
 >;
@@ -1564,11 +1715,6 @@ def : Pat <
   $src0, $src1, $src2, $src3)
 >;
 
-def : Pat <
-  (f64 (fsub f64:$src0, f64:$src1)),
-  (V_SUB_F64 $src0, $src1)
->;
-
 //===----------------------------------------------------------------------===//
 // SMRD Patterns
 //===----------------------------------------------------------------------===//
@@ -1596,7 +1742,6 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
 
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
 defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
 defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
 defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
@@ -1615,6 +1760,24 @@ def : Pat <
   (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
 >;
 
+} // End Predicates = [isSI]
+
+//===----------------------------------------------------------------------===//
+// SOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isSI, isCFDepth0] in {
+
+def : Pat <
+  (i64 (ctpop i64:$src)),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (S_BCNT1_I32_B64 $src), sub0),
+    (S_MOV_B32 0), sub1)
+>;
+
+} // Predicates = [isSI, isCFDepth0]
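The new SOP1 pattern keeps a 64-bit population count entirely on the scalar unit: S_BCNT1_I32_B64 yields a 32-bit count, and the INSERT_SUBREG pair rebuilds an i64 with a zeroed high half. A C++ model (illustrative; __builtin_popcountll stands in for S_BCNT1_I32_B64):

#include <cstdint>

static uint64_t ctpop_i64_scalar(uint64_t src) {
  // S_BCNT1_I32_B64 $src -> sub0 of the result
  uint32_t count = static_cast<uint32_t>(__builtin_popcountll(src));
  return count; // sub1 = S_MOV_B32 0, i.e. zero-extend back to i64
}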
+
+let Predicates = [isSI] in {
 
 //===----------------------------------------------------------------------===//
 // SOP2 Patterns
 //===----------------------------------------------------------------------===//
@@ -1625,18 +1788,39 @@ def : Pat <
 >;
 
 //===----------------------------------------------------------------------===//
-// VOP2 Patterns
+// SOPP Patterns
 //===----------------------------------------------------------------------===//
 
 def : Pat <
-  (or i64:$src0, i64:$src1),
+  (int_AMDGPU_barrier_global),
+  (S_BARRIER)
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+def : RcpPat<V_RCP_F32_e32, f32>;
+def : RcpPat<V_RCP_F64_e32, f64>;
+defm : RsqPat<V_RSQ_F32_e32, f32>;
+defm : RsqPat<V_RSQ_F64_e32, f64>;
+
+//===----------------------------------------------------------------------===//
+// VOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+class BinOp64Pat <SDNode node, Instruction inst> : Pat <
+  (node i64:$src0, i64:$src1),
   (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-    (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub0),
+    (inst (EXTRACT_SUBREG i64:$src0, sub0),
     (EXTRACT_SUBREG i64:$src1, sub0)), sub0),
-    (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub1),
+    (inst (EXTRACT_SUBREG i64:$src0, sub1),
     (EXTRACT_SUBREG i64:$src1, sub1)), sub1)
 >;
+
+def : BinOp64Pat <or, V_OR_B32_e32>;
+def : BinOp64Pat <xor, V_XOR_B32_e32>;
 
 class SextInReg <ValueType vt, int ShiftAmt> : Pat <
   (sext_inreg i32:$src0, vt),
   (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0))
@@ -1645,10 +1829,82 @@ class SextInReg <ValueType vt, int ShiftAmt> : Pat <
 
 def : SextInReg <i8, 24>;
 def : SextInReg <i16, 16>;
 
+def : Pat <
+  (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
+  (V_BCNT_U32_B32_e32 $popcnt, $val)
+>;
+
+def : Pat <
+  (i32 (ctpop i32:$popcnt)),
+  (V_BCNT_U32_B32_e64 $popcnt, 0, 0, 0)
+>;
+
+def : Pat <
+  (i64 (ctpop i64:$src)),
+  (INSERT_SUBREG
+    (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+      (V_BCNT_U32_B32_e32 (EXTRACT_SUBREG $src, sub1),
+        (V_BCNT_U32_B32_e64 (EXTRACT_SUBREG $src, sub0), 0, 0, 0)),
+      sub0),
+    (V_MOV_B32_e32 0), sub1)
+>;
+
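Two of the patterns above are classic scalar idioms: SextInReg sign-extends an i8 or i16 living in a 32-bit register with a shift-left/arithmetic-shift-right pair, and the i64 ctpop chains V_BCNT_U32_B32, whose second operand acts as an accumulator. Sketched in C++ (illustrative only):

#include <cstdint>

// sext_inreg i8: V_LSHLREV_B32 by 24, then V_ASHRREV_I32 by 24.
static int32_t sext_inreg_i8(int32_t x) {
  return static_cast<int32_t>(static_cast<uint32_t>(x) << 24) >> 24;
}

// i64 ctpop on the VALU: bcnt(hi, bcnt(lo, 0)); the high result half is 0.
static uint64_t ctpop_i64_valu(uint64_t x) {
  uint32_t lo = __builtin_popcount(static_cast<uint32_t>(x));
  uint32_t hi = __builtin_popcount(static_cast<uint32_t>(x >> 32));
  return lo + hi;
}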
 /********** ======================= **********/
 /********** Image sampling patterns **********/
 /********** ======================= **********/
 
+class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+  (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, i32:$dmask, i32:$unorm,
+   i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
+  (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da),
+   (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
+   $addr, $rsrc, $sampler)
+>;
+
+// Only the variants which make sense are defined.
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
+
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
+
 /* SIsample for simple 1D texture lookup */
 def : Pat <
   (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
@@ -1864,7 +2120,10 @@ def : BitConvert <v2f32, v2i32, VReg_64>;
 def : BitConvert <v2i32, v2f32, VReg_64>;
 def : BitConvert <v2i32, i64, VReg_64>;
 def : BitConvert <i64, v2i32, VReg_64>;
-
+def : BitConvert <v2f32, i64, VReg_64>;
+def : BitConvert <i64, v2f32, VReg_64>;
+def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <f64, v2i32, VReg_64>;
 def : BitConvert <v4f32, v4i32, VReg_128>;
 def : BitConvert <v4i32, v4f32, VReg_128>;
@@ -1894,7 +2153,7 @@ def FCLAMP_SI : AMDGPUShaderInst <
 }
 
 def : Pat <
-  (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
+  (AMDGPUclamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
   (FCLAMP_SI f32:$src)
 >;
@@ -2106,7 +2365,7 @@ def : Pat <
   (V_MUL_HI_I32 $src0, $src1, (i32 0))
 >;
 
-defm : BFIPatterns <V_BFI_B32>;
+defm : BFIPatterns <V_BFI_B32, S_MOV_B32>;
 def : ROTRPattern <V_ALIGNBIT_B32>;
 
 /********** ======================= **********/
@@ -2130,7 +2389,7 @@ defm : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>;
 defm : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
 defm : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
 defm : DSReadPat <DS_READ_B32, i32, local_load>;
-defm : DSReadPat <DS_READ_B64, i64, local_load>;
+defm : DSReadPat <DS_READ_B64, v2i32, local_load>;
 
 multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> {
   def : Pat <
@@ -2139,48 +2398,109 @@ multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> {
   >;
 
   def : Pat <
-    (frag vt:$src1, i32:$src0),
-    (inst 0, $src0, $src1, 0)
+    (frag vt:$val, i32:$ptr),
+    (inst 0, $ptr, $val, 0)
   >;
 }
 
 defm : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
 defm : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
 defm : DSWritePat <DS_WRITE_B32, i32, local_store>;
-defm : DSWritePat <DS_WRITE_B64, i64, local_store>;
+defm : DSWritePat <DS_WRITE_B64, v2i32, local_store>;
 
-def : Pat <(atomic_load_add_local i32:$ptr, i32:$val),
-           (DS_ADD_U32_RTN 0, $ptr, $val, 0)>;
-
-def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val),
-           (DS_SUB_U32_RTN 0, $ptr, $val, 0)>;
+multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> {
+  def : Pat <
+    (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$value),
+    (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+  >;
 
-//===----------------------------------------------------------------------===//
-// MUBUF Patterns
-//===----------------------------------------------------------------------===//
+  def : Pat <
+    (frag i32:$ptr, vt:$val),
+    (inst 0, $ptr, $val, 0)
+  >;
+}
 
-multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
-                              PatFrag global_ld, PatFrag constant_ld> {
+// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
+//
+// We need to use something for the data0, so we set a register to
+// -1. For the non-rtn variants, the manual says it does
+// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max
+// will always do the increment so I'm assuming it's the same.
+//
+// We also load this -1 with s_mov_b32 / s_mov_b64 even though this
+// needs to be a VGPR. The SGPR copy pass will fix this, and it's
+// easier since there is no v_mov_b64.
+multiclass DSAtomicIncRetPat<DS inst, ValueType vt,
+                             Instruction LoadImm, PatFrag frag> {
   def : Pat <
-    (vt (global_ld (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset))),
-    (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset))
+    (frag (add i32:$ptr, (i32 IMM16bit:$offset)), (vt 1)),
+    (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset))
   >;
 
   def : Pat <
-    (vt (global_ld (add i64:$ptr, (i64 IMM12bit:$offset)))),
-    (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset))
+    (frag i32:$ptr, (vt 1)),
+    (inst 0, $ptr, (LoadImm (vt -1)), 0)
   >;
+}
 
+multiclass DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> {
   def : Pat <
-    (vt (global_ld i64:$ptr)),
-    (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
+    (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$cmp, vt:$swap),
+    (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset))
   >;
 
   def : Pat <
-    (vt (global_ld (add i64:$ptr, i64:$offset))),
-    (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+    (frag i32:$ptr, vt:$cmp, vt:$swap),
+    (inst 0, $ptr, $cmp, $swap, 0)
   >;
+}
+
+
+// 32-bit atomics.
+defm : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
+                         S_MOV_B32, atomic_load_add_local>;
+defm : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
+                         S_MOV_B32, atomic_load_sub_local>;
+
+defm : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>;
+defm : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>;
+defm : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>;
+defm : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>;
+defm : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>;
+defm : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
+
+defm : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
+
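As the DSAtomicIncRetPat comment explains, DS_INC with data0 = 0xffffffff degenerates into a plain wrapping increment, which is what lets it stand in for atomic_load_add_local with a constant 1. The manual's rule as a single-threaded C++ model (illustrative):

#include <cstdint>

// DS_INC_RTN_U32: return the old value; reset to 0 once it reaches d0,
// otherwise add 1. With d0 == UINT32_MAX the reset only fires on wrap,
// so this is an ordinary unsigned increment.
static uint32_t ds_inc_rtn_u32(uint32_t &mem, uint32_t d0) {
  uint32_t old = mem;
  mem = (old >= d0) ? 0 : old + 1;
  return old;
}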
+// 64-bit atomics.
+defm : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
+                         S_MOV_B64, atomic_load_add_local>;
+defm : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
+                         S_MOV_B64, atomic_load_sub_local>;
+
+defm : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
+defm : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>;
+defm : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>;
+defm : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>;
+defm : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>;
+defm : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>;
+
+defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
+
+//===----------------------------------------------------------------------===//
+// MUBUF Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
+                              PatFrag constant_ld> {
   def : Pat <
     (vt (constant_ld (add i64:$ptr, i64:$offset))),
     (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
   >;
@@ -2188,53 +2508,19 @@ multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
 }
 
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32,
-                          sextloadi8_global, sextloadi8_constant>;
+                          sextloadi8_constant>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32,
-                          az_extloadi8_global, az_extloadi8_constant>;
+                          az_extloadi8_constant>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32,
-                          sextloadi16_global, sextloadi16_constant>;
+                          sextloadi16_constant>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32,
-                          az_extloadi16_global, az_extloadi16_constant>;
+                          az_extloadi16_constant>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
-                          global_load, constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, i64,
-                          global_load, constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, i64,
-                          az_extloadi32_global, az_extloadi32_constant>;
+                          constant_load>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32,
-                          global_load, constant_load>;
+                          constant_load>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
-                          global_load, constant_load>;
-
-multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> {
-
-  def : Pat <
-    (st vt:$value, (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset)),
-    (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset))
-  >;
-
-  def : Pat <
-    (st vt:$value, (add i64:$ptr, IMM12bit:$offset)),
-    (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset))
-  >;
-
-  def : Pat <
-    (st vt:$value, i64:$ptr),
-    (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
-  >;
-
-  def : Pat <
-    (st vt:$value, (add i64:$ptr, i64:$offset)),
-    (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, 0)
-  >;
-}
-
-defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE, i32, truncstorei8_global>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT, i32, truncstorei16_global>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORD, i32, global_store>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64, global_store>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, v2i32, global_store>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32, global_store>;
+                          constant_load>;
 
 // BUFFER_LOAD_DWORD*, addr64=0
 multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
@@ -2301,7 +2587,7 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
 def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
 def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
 
-let Predicates = [isCI] in {
+let SubtargetPredicate = isCI in {
 
 // Sea Islands new arithmetic instructions
 let neverHasSideEffects = 1 in {
@@ -2348,7 +2634,7 @@ def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>;
 // BUFFER_LOAD_DWORDX3
 // BUFFER_STORE_DWORDX3
 
-} // End Predicates = [isCI]
+} // End isCI
 
 /********** ====================== **********/
@@ -2360,13 +2646,13 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST I
 // 1. Extract with offset
 def : Pat<
   (vector_extract vt:$vec, (add i32:$idx, imm:$off)),
-  (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
+  (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
 >;
 
 // 2. Extract without offset
 def : Pat<
   (vector_extract vt:$vec, i32:$idx),
-  (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
+  (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
 >;
 
 // 3. Insert with offset
@@ -2392,20 +2678,6 @@ defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>;
 defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>;
 defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>;
 
-/********** =============== **********/
-/**********   Conditions    **********/
-/********** =============== **********/
-
-def : Pat<
-  (i1 (setcc f32:$src0, f32:$src1, SETO)),
-  (V_CMP_O_F32_e64 $src0, $src1)
->;
-
-def : Pat<
-  (i1 (setcc f32:$src0, f32:$src1, SETUO)),
-  (V_CMP_U_F32_e64 $src0, $src1)
->;
-
 //===----------------------------------------------------------------------===//
 // Conversion Patterns
 //===----------------------------------------------------------------------===//
@@ -2439,6 +2711,62 @@ def : Pat <
   (S_MOV_B32 -1), sub1)
 >;
 
+class ZExt_i64_i32_Pat <SDNode ext> : Pat <
+  (i64 (ext i32:$src)),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
+    (S_MOV_B32 0), sub1)
+>;
+
+class ZExt_i64_i1_Pat <SDNode ext> : Pat <
+  (i64 (ext i1:$src)),
+  (INSERT_SUBREG
+    (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+      (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0),
+    (S_MOV_B32 0), sub1)
+>;
+
+
+def : ZExt_i64_i32_Pat<zext>;
+def : ZExt_i64_i32_Pat<anyext>;
+def : ZExt_i64_i1_Pat<zext>;
+def : ZExt_i64_i1_Pat<anyext>;
+
+def : Pat <
+  (i64 (sext i32:$src)),
+  (INSERT_SUBREG
+    (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
+    (S_ASHR_I32 $src, 31), sub1)
+>;
+
+def : Pat <
+  (i64 (sext i1:$src)),
+  (INSERT_SUBREG
+    (INSERT_SUBREG
+      (i64 (IMPLICIT_DEF)),
+      (V_CNDMASK_B32_e64 0, -1, $src), sub0),
+    (V_CNDMASK_B32_e64 0, -1, $src), sub1)
+>;
+
+def : Pat <
+  (f32 (sint_to_fp i1:$src)),
+  (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
+>;
+
+def : Pat <
+  (f32 (uint_to_fp i1:$src)),
+  (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src)
+>;
+
+def : Pat <
+  (f64 (sint_to_fp i1:$src)),
+  (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+>;
+
+def : Pat <
+  (f64 (uint_to_fp i1:$src)),
+  (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
+>;
+
 //===----------------------------------------------------------------------===//
 // Miscellaneous Patterns
 //===----------------------------------------------------------------------===//
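The conversion patterns at the end of SIInstructions.td assemble an i64 from explicit 32-bit halves: zext/anyext pin the high half to zero, sext replicates the sign bit with an arithmetic shift, and the i1 variants first materialize 0/1 or 0/-1 per lane via V_CNDMASK_B32. In scalar terms (an illustrative C++ model):

#include <cstdint>

static uint64_t zext_i32_to_i64(uint32_t src) {
  uint32_t hi = 0;                                 // sub1 = S_MOV_B32 0
  return (static_cast<uint64_t>(hi) << 32) | src;
}

static int64_t sext_i32_to_i64(int32_t src) {
  uint32_t hi = static_cast<uint32_t>(src >> 31);  // sub1 = S_ASHR_I32 $src, 31
  return static_cast<int64_t>((static_cast<uint64_t>(hi) << 32) |
                              static_cast<uint32_t>(src));
}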
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 00e32c0..df690a4 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -56,11 +56,61 @@ let TargetPrefix = "SI", isTarget = 1 in {
 
   class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
 
+  // Fully-flexible SAMPLE instruction.
+  class SampleRaw : Intrinsic <
+    [llvm_v4f32_ty],   // vdata(VGPR)
+    [llvm_anyint_ty,   // vaddr(VGPR)
+     llvm_v32i8_ty,    // rsrc(SGPR)
+     llvm_v16i8_ty,    // sampler(SGPR)
+     llvm_i32_ty,      // dmask(imm)
+     llvm_i32_ty,      // unorm(imm)
+     llvm_i32_ty,      // r128(imm)
+     llvm_i32_ty,      // da(imm)
+     llvm_i32_ty,      // glc(imm)
+     llvm_i32_ty,      // slc(imm)
+     llvm_i32_ty,      // tfe(imm)
+     llvm_i32_ty],     // lwe(imm)
+    [IntrNoMem]>;
+
   def int_SI_sample : Sample;
   def int_SI_sampleb : Sample;
   def int_SI_sampled : Sample;
   def int_SI_samplel : Sample;
 
+  // Basic gather4
+  def int_SI_gather4 : SampleRaw;
+  def int_SI_gather4_cl : SampleRaw;
+  def int_SI_gather4_l : SampleRaw;
+  def int_SI_gather4_b : SampleRaw;
+  def int_SI_gather4_b_cl : SampleRaw;
+  def int_SI_gather4_lz : SampleRaw;
+
+  // Gather4 with comparison
+  def int_SI_gather4_c : SampleRaw;
+  def int_SI_gather4_c_cl : SampleRaw;
+  def int_SI_gather4_c_l : SampleRaw;
+  def int_SI_gather4_c_b : SampleRaw;
+  def int_SI_gather4_c_b_cl : SampleRaw;
+  def int_SI_gather4_c_lz : SampleRaw;
+
+  // Gather4 with offsets
+  def int_SI_gather4_o : SampleRaw;
+  def int_SI_gather4_cl_o : SampleRaw;
+  def int_SI_gather4_l_o : SampleRaw;
+  def int_SI_gather4_b_o : SampleRaw;
+  def int_SI_gather4_b_cl_o : SampleRaw;
+  def int_SI_gather4_lz_o : SampleRaw;
+
+  // Gather4 with comparison and offsets
+  def int_SI_gather4_c_o : SampleRaw;
+  def int_SI_gather4_c_cl_o : SampleRaw;
+  def int_SI_gather4_c_l_o : SampleRaw;
+  def int_SI_gather4_c_b_o : SampleRaw;
+  def int_SI_gather4_c_b_cl_o : SampleRaw;
+  def int_SI_gather4_c_lz_o : SampleRaw;
+
+  def int_SI_getlod : SampleRaw;
+
   def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
 
   def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 6601f2a..9f5ff29 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -86,6 +86,7 @@ private:
   void Kill(MachineInstr &MI);
   void Branch(MachineInstr &MI);
 
+  void InitM0ForLDS(MachineBasicBlock::iterator MI);
   void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
   void IndirectSrc(MachineInstr &MI);
   void IndirectDst(MachineInstr &MI);
@@ -320,6 +321,14 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+/// The m0 register stores the maximum allowable address for LDS reads and
+/// writes. Its value must be at least the size in bytes of LDS allocated by
+/// the shader. For simplicity, we set it to the maximum possible value.
+void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
+          AMDGPU::M0).addImm(0xffffffff);
+}
+
 void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
 
   MachineBasicBlock &MBB = *MI.getParent();
@@ -333,52 +342,57 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
             .addReg(Idx);
     MBB.insert(I, MovRel);
-    MI.eraseFromParent();
-    return;
-  }
+  } else {
 
-  assert(AMDGPU::SReg_64RegClass.contains(Save));
-  assert(AMDGPU::VReg_32RegClass.contains(Idx));
+    assert(AMDGPU::SReg_64RegClass.contains(Save));
+    assert(AMDGPU::VReg_32RegClass.contains(Idx));
 
-  // Save the EXEC mask
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
-          .addReg(AMDGPU::EXEC);
+    // Save the EXEC mask
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+            .addReg(AMDGPU::EXEC);
 
-  // Read the next variant into VCC (lower 32 bits) <- also loop target
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
-          AMDGPU::VCC_LO)
-          .addReg(Idx);
+    // Read the next variant into VCC (lower 32 bits) <- also loop target
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+            AMDGPU::VCC_LO)
+            .addReg(Idx);
 
-  // Move index from VCC into M0
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-          .addReg(AMDGPU::VCC_LO);
+    // Move index from VCC into M0
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+            .addReg(AMDGPU::VCC_LO);
 
-  // Compare the just read M0 value to all possible Idx values
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
-          .addReg(AMDGPU::M0)
-          .addReg(Idx);
+    // Compare the just read M0 value to all possible Idx values
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
+            .addReg(AMDGPU::M0)
+            .addReg(Idx);
 
-  // Update EXEC, save the original EXEC value to VCC
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
-          .addReg(AMDGPU::VCC);
+    // Update EXEC, save the original EXEC value to VCC
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+            .addReg(AMDGPU::VCC);
 
-  // Do the actual move
-  MBB.insert(I, MovRel);
+    // Do the actual move
+    MBB.insert(I, MovRel);
 
-  // Update EXEC, switch all done bits to 0 and all todo bits to 1
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-          .addReg(AMDGPU::EXEC)
-          .addReg(AMDGPU::VCC);
+    // Update EXEC, switch all done bits to 0 and all todo bits to 1
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+            .addReg(AMDGPU::EXEC)
+            .addReg(AMDGPU::VCC);
 
-  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-          .addImm(-7)
-          .addReg(AMDGPU::EXEC);
+    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+            .addImm(-7)
+            .addReg(AMDGPU::EXEC);
 
-  // Restore EXEC
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-          .addReg(Save);
+    // Restore EXEC
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+            .addReg(Save);
 
+  }
+  // FIXME: Are there any values other than the LDS address clamp that need to
+  // be stored in the m0 register and may be live for more than a few
+  // instructions? If so, we should save the m0 register at the beginning
+  // of this function and restore it here.
+  // FIXME: Add support for LDS direct loads.
+  InitM0ForLDS(&MI);
   MI.eraseFromParent();
 }
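For a VGPR index, LoadM0 still emits the "waterfall" loop described by the comments above: each iteration broadcasts one lane's index through M0, performs the indirect move for every lane holding that same index, and retires those lanes from EXEC until none remain. A host-side model of the control flow (illustrative; assumes a 64-lane wave with per-lane indices in v_idx):

#include <cstdint>

static void waterfall_movrel(uint64_t exec, const uint32_t v_idx[64]) {
  uint64_t save = exec;                          // S_MOV_B64 save, exec
  while (exec) {                                 // S_CBRANCH_EXECNZ loops back
    uint32_t m0 = v_idx[__builtin_ctzll(exec)];  // V_READFIRSTLANE_B32 -> M0
    uint64_t match = 0;                          // V_CMP_EQ_U32 -> VCC
    for (unsigned lane = 0; lane < 64; ++lane)
      if (((exec >> lane) & 1) && v_idx[lane] == m0)
        match |= 1ull << lane;
    // S_AND_SAVEEXEC_B64: the MovRel executes here with EXEC == match.
    exec &= ~match;                              // S_XOR_B64 retires those lanes
  }
  exec = save;                                   // S_MOV_B64: restore EXEC
  (void)exec;
}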
@@ -523,8 +537,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
     MachineBasicBlock &MBB = MF.front();
     // Initialize M0 to a value that won't cause LDS access to be discarded
     // due to offset clamping
-    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_MOV_B32),
-            AMDGPU::M0).addImm(0xffffffff);
+    InitM0ForLDS(MBB.getFirstNonPHI());
   }
 
   if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) {
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index af60995..e2df950 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -62,8 +62,10 @@ static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
       return VGPR;
     }
   }
-  MF->getFunction()->getContext().emitError(
-    "Could not found S_ENGPGM instrtuction.");
+
+  LLVMContext &Ctx = MF->getFunction()->getContext();
+  Ctx.emitError("Could not find S_ENDPGM instruction.");
+
   return VGPR;
 }
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index c72d549..d0b677a 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -14,21 +14,20 @@
 
 #include "SIRegisterInfo.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 
 using namespace llvm;
 
-SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm)
-: AMDGPURegisterInfo(tm),
-  TM(tm)
+SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st)
+: AMDGPURegisterInfo(st)
   { }
 
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   Reserved.set(AMDGPU::EXEC);
   Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(TM.getInstrInfo());
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
   TII->reserveIndirectRegisters(Reserved, MF);
   return Reserved;
 }
@@ -38,15 +37,6 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   return RC->getNumRegs();
 }
 
-const TargetRegisterClass *
-SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
-  switch (rc->getID()) {
-  case AMDGPU::GPRF32RegClassID:
-    return &AMDGPU::VReg_32RegClass;
-  default: return rc;
-  }
-}
-
 const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
                                                                    MVT VT) const {
   switch(VT.SimpleTy) {
@@ -135,3 +125,19 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
   unsigned Index = getHWRegIndex(Reg);
   return SubRC->getRegister(Index + Channel);
 }
+
+bool SIRegisterInfo::regClassCanUseImmediate(int RCID) const {
+  switch (RCID) {
+  default: return false;
+  case AMDGPU::SSrc_32RegClassID:
+  case AMDGPU::SSrc_64RegClassID:
+  case AMDGPU::VSrc_32RegClassID:
+  case AMDGPU::VSrc_64RegClassID:
+    return true;
+  }
+}
+
+bool SIRegisterInfo::regClassCanUseImmediate(
+                             const TargetRegisterClass *RC) const {
+  return regClassCanUseImmediate(RC->getID());
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index 36b4fcd..c9305fb 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -20,24 +20,15 @@
 
 namespace llvm {
 
-class AMDGPUTargetMachine;
-
 struct SIRegisterInfo : public AMDGPURegisterInfo {
-  AMDGPUTargetMachine &TM;
 
-  SIRegisterInfo(AMDGPUTargetMachine &tm);
+  SIRegisterInfo(const AMDGPUSubtarget &st);
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
                                MachineFunction &MF) const override;
 
-  /// \param RC is an AMDIL reg class.
-  ///
-  /// \returns the SI register class that is equivalent to \p RC.
-  const TargetRegisterClass *
-    getISARegClass(const TargetRegisterClass *RC) const override;
-
   /// \brief get the register class of the specified type to use in the
   /// CFGStructurizer
   const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
@@ -69,6 +60,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
   /// \returns The sub-register of Reg that is in Channel.
   unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC,
                             unsigned Channel) const;
+
+  /// \returns True if operands defined with this register class can accept
+  /// inline immediates.
+  bool regClassCanUseImmediate(int RCID) const;
+
+  /// \returns True if operands defined with this register class can accept
+  /// inline immediates.
+  bool regClassCanUseImmediate(const TargetRegisterClass *RC) const;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index f1f01de..8974b63 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -168,7 +168,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
   (add SGPR_64Regs, VCCReg, EXECReg)
 >;
 
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
 
 def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
 
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
index a0b6907..367963a 100644
--- a/lib/Target/R600/SITypeRewriter.cpp
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -119,8 +119,7 @@ void SITypeRewriter::visitCallInst(CallInst &I) {
         Type::getInt32Ty(I.getContext())){
       Type *ElementTy = Arg->getType()->getVectorElementType();
       std::string TypeName = "i32";
-      InsertElementInst *Def = dyn_cast<InsertElementInst>(Arg);
-      assert(Def);
+      InsertElementInst *Def = cast<InsertElementInst>(Arg);
       Args.push_back(Def->getOperand(1));
       Types.push_back(ElementTy);
       std::string VecTypeName = "v1" + TypeName;
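The SITypeRewriter change is an idiom cleanup: in LLVM's casting support, cast<T> already asserts that the operand really is a T, so a dyn_cast<T> followed by assert is redundant. A standalone analogue of the two contracts (illustrative; LLVM's cast<>/dyn_cast<> use custom type tags rather than RTTI):

#include <cassert>

struct Value { virtual ~Value() = default; };
struct InsertElementInst : Value {};

// dyn_cast<>-style: the type is uncertain, failure returns null.
static InsertElementInst *dynCastIE(Value *v) {
  return dynamic_cast<InsertElementInst *>(v);
}

// cast<>-style: the caller guarantees the type, so just assert it.
static InsertElementInst *castIE(Value *v) {
  InsertElementInst *ie = dynCastIE(v);
  assert(ie && "cast to InsertElementInst on incompatible value");
  return ie;
}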