Diffstat (limited to 'lib/Target/R600')
88 files changed, 9539 insertions, 3659 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 713fc4b..261075e 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef AMDGPU_H -#define AMDGPU_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H +#define LLVM_LIB_TARGET_R600_AMDGPU_H #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetMachine.h" @@ -39,6 +39,8 @@ FunctionPass *createAMDGPUCFGStructurizerPass(); FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSILowerI1CopiesPass(); +FunctionPass *createSIShrinkInstructionsPass(); +FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); FunctionPass *createSIFixSGPRLiveRangesPass(); @@ -48,10 +50,14 @@ FunctionPass *createSIInsertWaits(TargetMachine &tm); void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; +void initializeSILoadStoreOptimizerPass(PassRegistry &); +extern char &SILoadStoreOptimizerID; + // Passes common to R600 and SI FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); +ModulePass *createAMDGPUAlwaysInlinePass(); /// \brief Creates an AMDGPU-specific Target Transformation Info pass. ImmutablePass * @@ -63,6 +69,14 @@ extern char &SIFixSGPRLiveRangesID; extern Target TheAMDGPUTarget; +namespace AMDGPU { +enum TargetIndex { + TI_CONSTDATA_START +}; +} + +#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" + } // End namespace llvm namespace ShaderType { @@ -118,4 +132,4 @@ enum AddressSpaces { } // namespace AMDGPUAS -#endif // AMDGPU_H +#endif diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index 6ff9ab7..4cf1243 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -25,6 +25,11 @@ def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer", "false", "Disable IR Structurizer">; +def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", + "EnablePromoteAlloca", + "true", + "Enable promote alloca pass">; + // Target features def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", @@ -37,6 +42,20 @@ def FeatureFP64 : SubtargetFeature<"fp64", "true", "Enable double precision operations">; +def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", + "FP64Denormals", + "true", + "Enable double precision denormal handling", + [FeatureFP64]>; + +// Some instructions do not support denormals despite this flag. Using +// fp32 denormals also causes instructions to run at the double +// precision rate for the device. 
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", + "FP32Denormals", + "true", + "Enable single precision denormal handling">; + def Feature64BitPtr : SubtargetFeature<"64BitPtr", "Is64bit", "true", @@ -62,6 +81,17 @@ def FeatureCFALUBug : SubtargetFeature<"cfalubug", "true", "GPU has CF_ALU bug">; +// XXX - This should probably be removed once enabled by default +def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", + "EnableLoadStoreOpt", + "true", + "Enable SI load/store optimizer pass">; + +def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", + "FlatAddressSpace", + "true", + "Support flat address space">; + class SubtargetFeatureFetchLimit <string Value> : SubtargetFeature <"fetch"#Value, "TexVTXClauseSize", @@ -111,19 +141,28 @@ def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", >; def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768, + FeatureWavefrontSize64]>; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace]>; //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { let guessInstructionProperties = 1; } +def AMDGPUAsmParser : AsmParser { + // Some of the R600 registers have the same name, so this crashes. + // For example T0_XYZW and T0_XY both have the asm name T0. + let ShouldEmitMatchRegisterName = 0; +} + def AMDGPU : Target { // Pull in Instruction Info: let InstructionSet = AMDGPUInstrInfo; + let AssemblyParsers = [AMDGPUAsmParser]; } // Dummy Instruction itineraries for pseudo instructions diff --git a/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp b/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp new file mode 100644 index 0000000..b545b45 --- /dev/null +++ b/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp @@ -0,0 +1,66 @@ +//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass marks all internal functions as always_inline and creates +/// duplicates of all other functions a marks the duplicates as always_inline. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +class AMDGPUAlwaysInline : public ModulePass { + + static char ID; + +public: + AMDGPUAlwaysInline() : ModulePass(ID) { } + bool runOnModule(Module &M) override; + const char *getPassName() const override { return "AMDGPU Always Inline Pass"; } +}; + +} // End anonymous namespace + +char AMDGPUAlwaysInline::ID = 0; + +bool AMDGPUAlwaysInline::runOnModule(Module &M) { + + std::vector<Function*> FuncsToClone; + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty()) + FuncsToClone.push_back(&F); + } + + for (Function *F : FuncsToClone) { + ValueToValueMapTy VMap; + Function *NewFunc = CloneFunction(F, VMap, false); + NewFunc->setLinkage(GlobalValue::InternalLinkage); + F->getParent()->getFunctionList().push_back(NewFunc); + F->replaceAllUsesWith(NewFunc); + } + + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (F.hasLocalLinkage()) { + F.addFnAttr(Attribute::AlwaysInline); + } + } + return false; +} + +ModulePass *llvm::createAMDGPUAlwaysInlinePass() { + return new AMDGPUAlwaysInline(); +} diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index a6e217b..5511d7c 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -16,7 +16,6 @@ //===----------------------------------------------------------------------===// // - #include "AMDGPUAsmPrinter.h" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" @@ -26,6 +25,7 @@ #include "SIDefines.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" @@ -48,11 +48,28 @@ using namespace llvm; // precision, and leaves single precision to flush all and does not report // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports // CL_FP_DENORM for both. -static uint32_t getFPMode(MachineFunction &) { +// +// FIXME: It seems some instructions do not support single precision denormals +// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, +// and sin_f32, cos_f32 on most parts). + +// We want to use these instructions, and using fp32 denormals also causes +// instructions to run at the double precision rate for the device so it's +// probably best to just report no single precision denormals. +static uint32_t getFPMode(const MachineFunction &F) { + const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>(); + // TODO: Is there any real use for the flush in only / flush out only modes? + + uint32_t FP32Denormals = + ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + uint32_t FP64Denormals = + ST.hasFP64Denormals() ? 
FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | - FP_DENORM_MODE_SP(FP_DENORM_FLUSH_NONE) | - FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE); + FP_DENORM_MODE_SP(FP32Denormals) | + FP_DENORM_MODE_DP(FP64Denormals); } static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, @@ -69,10 +86,24 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode(); } +void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { + + // This label is used to mark the end of the .text section. + const TargetLoweringObjectFile &TLOF = getObjFileLowering(); + OutStreamer.SwitchSection(TLOF.getTextSection()); + MCSymbol *EndOfTextLabel = + OutContext.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + OutStreamer.EmitLabel(EndOfTextLabel); +} + bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + + // The starting address of all shader programs must be 256 bytes aligned. + MF.setAlignment(8); + SetupMachineFunction(MF); - OutStreamer.emitRawComment(Twine('@') + MF.getName() + Twine(':')); + EmitFunctionHeader(); MCContext &Context = getObjFileLowering().getContext(); const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config", @@ -115,6 +146,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), false); + OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), + false); } else { R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); OutStreamer.emitRawComment( @@ -145,25 +178,21 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { return false; } -void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { +void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { unsigned MaxGPR = 0; bool killPixel = false; - const R600RegisterInfo * RI = - static_cast<const R600RegisterInfo*>(TM.getRegisterInfo()); - R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); + const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { if (MI.getOpcode() == AMDGPU::KILLGT) killPixel = true; unsigned numOperands = MI.getNumOperands(); for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - MachineOperand & MO = MI.getOperand(op_idx); + const MachineOperand &MO = MI.getOperand(op_idx); if (!MO.isReg()) continue; unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; @@ -179,7 +208,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { unsigned RsrcReg; if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { // Evergreen / Northern Islands - switch (MFI->ShaderType) { + switch (MFI->getShaderType()) { default: // Fall through case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; @@ -188,7 +217,7 @@ void 
AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { } } else { // R600 / R700 - switch (MFI->ShaderType) { + switch (MFI->getShaderType()) { default: // Fall through case ShaderType::GEOMETRY: // Fall through case ShaderType::COMPUTE: // Fall through @@ -203,34 +232,30 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - if (MFI->ShaderType == ShaderType::COMPUTE) { + if (MFI->getShaderType() == ShaderType::COMPUTE) { OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); } } void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, - MachineFunction &MF) const { + const MachineFunction &MF) const { uint64_t CodeSize = 0; unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; - const SIRegisterInfo * RI = - static_cast<const SIRegisterInfo*>(TM.getRegisterInfo()); - - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; + bool FlatUsed = false; + const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. CodeSize += MI.getDesc().Size; unsigned numOperands = MI.getNumOperands(); for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - MachineOperand &MO = MI.getOperand(op_idx); + const MachineOperand &MO = MI.getOperand(op_idx); unsigned width = 0; bool isSGPR = false; @@ -242,6 +267,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, reg == AMDGPU::VCC_HI) { VCCUsed = true; continue; + } else if (reg == AMDGPU::FLAT_SCR || + reg == AMDGPU::FLAT_SCR_LO || + reg == AMDGPU::FLAT_SCR_HI) { + FlatUsed = true; + continue; } switch (reg) { @@ -302,8 +332,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (VCCUsed) MaxSGPR += 2; - ProgInfo.NumVGPR = MaxVGPR; - ProgInfo.NumSGPR = MaxSGPR; + if (FlatUsed) + MaxSGPR += 2; + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + ProgInfo.NumVGPR = MaxVGPR + 1; + ProgInfo.NumSGPR = MaxSGPR + 1; // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode // register. @@ -315,16 +350,21 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Do not clamp NAN to 0. 
ProgInfo.DX10Clamp = 0; + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); + + ProgInfo.FlatUsed = FlatUsed; + ProgInfo.VCCUsed = VCCUsed; ProgInfo.CodeLen = CodeSize; } -void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, +void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo) { const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned RsrcReg; - switch (MFI->ShaderType) { + switch (MFI->getShaderType()) { default: // Fall through case ShaderType::COMPUTE: RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break; case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break; @@ -341,15 +381,31 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, LDSAlignShift = 9; } - unsigned LDSBlocks = - RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + unsigned LDSSpillSize = MFI->LDSWaveSpillSize * + MFI->getMaximumWorkGroupSize(MF); - if (MFI->ShaderType == ShaderType::COMPUTE) { + unsigned LDSBlocks = + RoundUpToAlignment(MFI->LDSSize + LDSSpillSize, + 1 << LDSAlignShift) >> LDSAlignShift; + + // Scratch is allocated in 256 dword blocks. + unsigned ScratchAlignShift = 10; + // We need to program the hardware with the amount of scratch memory that + // is used by the entire wave. KernelInfo.ScratchSize is the amount of + // scratch memory used per thread. + unsigned ScratchBlocks = + RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(), + 1 << ScratchAlignShift) >> ScratchAlignShift; + + unsigned VGPRBlocks = (KernelInfo.NumVGPR - 1) / 4; + unsigned SGPRBlocks = (KernelInfo.NumSGPR - 1) / 8; + + if (MFI->getShaderType() == ShaderType::COMPUTE) { OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); const uint32_t ComputePGMRSrc1 = - S_00B848_VGPRS(KernelInfo.NumVGPR / 4) | - S_00B848_SGPRS(KernelInfo.NumSGPR / 8) | + S_00B848_VGPRS(VGPRBlocks) | + S_00B848_SGPRS(SGPRBlocks) | S_00B848_PRIORITY(KernelInfo.Priority) | S_00B848_FLOAT_MODE(KernelInfo.FloatMode) | S_00B848_PRIV(KernelInfo.Priv) | @@ -360,14 +416,24 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, OutStreamer.EmitIntValue(ComputePGMRSrc1, 4); OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4); + const uint32_t ComputePGMRSrc2 = + S_00B84C_LDS_SIZE(LDSBlocks) | + S_00B02C_SCRATCH_EN(ScratchBlocks > 0); + + OutStreamer.EmitIntValue(ComputePGMRSrc2, 4); + + OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); + OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4); + + // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = + // 0" comment but I don't see a corresponding field in the register spec. 
} else { OutStreamer.EmitIntValue(RsrcReg, 4); - OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | - S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4); + OutStreamer.EmitIntValue(S_00B028_VGPRS(VGPRBlocks) | + S_00B028_SGPRS(SGPRBlocks), 4); } - if (MFI->ShaderType == ShaderType::PIXEL) { + if (MFI->getShaderType() == ShaderType::PIXEL) { OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4); OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index c1acb6e..b9a0767 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -12,11 +12,10 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPU_ASMPRINTER_H -#define AMDGPU_ASMPRINTER_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H +#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H #include "llvm/CodeGen/AsmPrinter.h" -#include <string> #include <vector> namespace llvm { @@ -33,6 +32,9 @@ private: DX10Clamp(0), DebugMode(0), IEEEMode(0), + ScratchSize(0), + FlatUsed(false), + VCCUsed(false), CodeLen(0) {} // Fields set in PGM_RSRC1 pm4 packet. @@ -44,20 +46,24 @@ private: uint32_t DX10Clamp; uint32_t DebugMode; uint32_t IEEEMode; + uint32_t ScratchSize; + + bool FlatUsed; // Bonus information for debugging. + bool VCCUsed; uint64_t CodeLen; }; - void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const; - void findNumUsedRegistersSI(MachineFunction &MF, + void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; + void findNumUsedRegistersSI(const MachineFunction &MF, unsigned &NumSGPR, unsigned &NumVGPR) const; /// \brief Emit register usage information so that the GPU driver /// can correctly setup the GPU state. 
- void EmitProgramInfoR600(MachineFunction &MF); - void EmitProgramInfoSI(MachineFunction &MF, const SIProgramInfo &KernelInfo); + void EmitProgramInfoR600(const MachineFunction &MF); + void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); public: explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer); @@ -71,6 +77,8 @@ public: /// Implemented in AMDGPUMCInstLower.cpp void EmitInstruction(const MachineInstr *MI) override; + void EmitEndOfAsmFile(Module &M) override; + protected: bool DisasmEnabled; std::vector<std::string> DisasmLines, HexLines; @@ -79,4 +87,4 @@ protected: } // End anonymous llvm -#endif //AMDGPU_ASMPRINTER_H +#endif diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td index 5f8ad8c..6ffa7a0 100644 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -59,16 +59,24 @@ def CC_AMDGPU_Kernel : CallingConv<[ ]>; def CC_AMDGPU : CallingConv<[ - CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() >= " - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()->"# - "ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>, - CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()->" - "ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>, - CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"# - ".getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>, - CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"# - ".getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_R600>> + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() >=" + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()" + "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo<CC_AMDGPU_Kernel>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()" + "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo<CC_AMDGPU_Kernel>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo<CC_SI>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo<CC_R600>> ]>; diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h index d18ede5..15a6636 100644 --- a/lib/Target/R600/AMDGPUFrameLowering.h +++ b/lib/Target/R600/AMDGPUFrameLowering.h @@ -12,8 +12,8 @@ /// machine. 
// //===----------------------------------------------------------------------===// -#ifndef AMDILFRAME_LOWERING_H -#define AMDILFRAME_LOWERING_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H +#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H #include "llvm/CodeGen/MachineFunction.h" #include "llvm/Target/TargetFrameLowering.h" @@ -42,4 +42,4 @@ public: bool hasFP(const MachineFunction &MF) const override; }; } // namespace llvm -#endif // AMDILFRAME_LOWERING_H +#endif diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index b4d79e5..90b6672 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -16,9 +16,13 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" +#include "SIDefines.h" #include "SIISelLowering.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Function.h" @@ -61,6 +65,7 @@ private: static bool checkPrivateAddress(const MachineMemOperand *Op); static bool isGlobalStore(const StoreSDNode *N); + static bool isFlatStore(const StoreSDNode *N); static bool isPrivateStore(const StoreSDNode *N); static bool isLocalStore(const StoreSDNode *N); static bool isRegionStore(const StoreSDNode *N); @@ -68,24 +73,46 @@ private: bool isCPLoad(const LoadSDNode *N) const; bool isConstantLoad(const LoadSDNode *N, int cbID) const; bool isGlobalLoad(const LoadSDNode *N) const; + bool isFlatLoad(const LoadSDNode *N) const; bool isParamLoad(const LoadSDNode *N) const; bool isPrivateLoad(const LoadSDNode *N) const; bool isLocalLoad(const LoadSDNode *N) const; bool isRegionLoad(const LoadSDNode *N) const; - /// \returns True if the current basic block being selected is at control - /// flow depth 0. Meaning that the current block dominates the - // exit block. 
- bool isCFDepth0() const; - const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); - bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset, - SDValue &ImmOffset) const; + bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const; + bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; + bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, + SDValue &Offset1) const; + void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &Offset) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &Offset, + SDValue &SLC) const; + bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, + SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, + SDValue &Offset, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset, SDValue &GLC) const; + SDNode *SelectAddrSpaceCast(SDNode *N); + bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, SDValue &Omod) const; + + bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Omod) const; SDNode *SelectADD_SUB_I64(SDNode *N); SDNode *SelectDIV_SCALE(SDNode *N); @@ -125,7 +152,8 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, switch (N->getMachineOpcode()) { default: { - const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode()); + const MCInstrDesc &Desc = + TM.getSubtargetImpl()->getInstrInfo()->get(N->getMachineOpcode()); unsigned OpIdx = Desc.getNumDefs() + OpNo; if (OpIdx >= Desc.getNumOperands()) return nullptr; @@ -133,15 +161,17 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, if (RegClass == -1) return nullptr; - return TM.getRegisterInfo()->getRegClass(RegClass); + return TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RegClass); } case AMDGPU::REG_SEQUENCE: { unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); - const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(RCID); + const TargetRegisterClass *SuperRC = + TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RCID); SDValue SubRegOp = N->getOperand(OpNo + 1); unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); - return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx); + return TM.getSubtargetImpl()->getRegisterInfo()->getSubClassWithSubReg( + SuperRC, SubRegIdx); } } } @@ -229,10 +259,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { unsigned RegClassID; - const AMDGPURegisterInfo *TRI = - static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo()); - const SIRegisterInfo *SIRI = - static_cast<const SIRegisterInfo*>(TM.getRegisterInfo()); 
+ const AMDGPURegisterInfo *TRI = static_cast<const AMDGPURegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); + const SIRegisterInfo *SIRI = static_cast<const SIRegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); @@ -460,7 +490,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { case AMDGPUISD::DIV_SCALE: { return SelectDIV_SCALE(N); } + case ISD::CopyToReg: { + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + Lowering.legalizeTargetIndependentNode(N, *CurDAG); + break; + } + case ISD::ADDRSPACECAST: + return SelectAddrSpaceCast(N); } + return SelectCode(N); } @@ -498,6 +537,10 @@ bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } +bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); } @@ -529,6 +572,10 @@ bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } +bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); } @@ -558,23 +605,16 @@ bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { const Value *MemVal = N->getMemOperand()->getValue(); if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) && !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){ + !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) { return true; } return false; } -bool AMDGPUDAGToDAGISel::isCFDepth0() const { - // FIXME: Figure out a way to use DominatorTree analysis here. - const BasicBlock *CurBlock = FuncInfo->MBB->getBasicBlock(); - const Function *Fn = FuncInfo->Fn; - return &Fn->front() == CurBlock || &Fn->back() == CurBlock; -} - - const char *AMDGPUDAGToDAGISel::getPassName() const { return "AMDGPU DAG->DAG Pattern Instruction Selection"; } @@ -677,14 +717,9 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; - unsigned Opc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32; + unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - if (!isCFDepth0()) { - Opc = IsAdd ? AMDGPU::V_ADD_I32_e32 : AMDGPU::V_SUB_I32_e32; - CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e32 : AMDGPU::V_SUBB_U32_e32; - } - SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); SDValue Carry(AddLo, 1); SDNode *AddHi @@ -711,71 +746,401 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { = (VT == MVT::f64) ? 
AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); - + const SDValue False = CurDAG->getTargetConstant(0, MVT::i1); SDValue Ops[] = { - N->getOperand(0), - N->getOperand(1), - N->getOperand(2), - Zero, - Zero, - Zero, - Zero + Zero, // src0_modifiers + N->getOperand(0), // src0 + Zero, // src1_modifiers + N->getOperand(1), // src1 + Zero, // src2_modifiers + N->getOperand(2), // src2 + False, // clamp + Zero // omod }; return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); } -static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) { - return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32, - Ptr), 0); +bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const { + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + if ((OffsetBits == 16 && !isUInt<16>(Offset)) || + (OffsetBits == 8 && !isUInt<8>(Offset))) + return false; + + if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) + return true; + + // On Southern Islands instruction with a negative base value and an offset + // don't seem to work. + return CurDAG->SignBitIsZero(Base); +} + +bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, + SDValue &Offset) const { + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { + // (add n0, c0) + Base = N0; + Offset = N1; + return true; + } + } + + // If we have a constant address, prefer to put the constant into the + // offset. This can save moves to load the constant address since multiple + // operations can share the zero base address register, and enables merging + // into read2 / write2 instructions. 
+ if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + if (isUInt<16>(CAddr->getZExtValue())) { + SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); + MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + SDLoc(Addr), MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset = Addr; + return true; + } + } + + // default case + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i16); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, + SDValue &Offset0, + SDValue &Offset1) const { + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + unsigned DWordOffset0 = C1->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + // (add n0, c0) + if (isDSOffsetLegal(N0, DWordOffset1, 8)) { + Base = N0; + Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8); + return true; + } + } + + if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + unsigned DWordOffset0 = CAddr->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + assert(4 * DWordOffset0 == CAddr->getZExtValue()); + + if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { + SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); + MachineSDNode *MovZero + = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + SDLoc(Addr), MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8); + return true; + } + } + + // default case + Base = Addr; + Offset0 = CurDAG->getTargetConstant(0, MVT::i8); + Offset1 = CurDAG->getTargetConstant(1, MVT::i8); + return true; +} + +static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { + return isUInt<12>(Imm->getZExtValue()); } -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, - SDValue &Offset, - SDValue &ImmOffset) const { +void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, + SDValue &GLC, SDValue &SLC, + SDValue &TFE) const { SDLoc DL(Addr); + GLC = CurDAG->getTargetConstant(0, MVT::i1); + SLC = CurDAG->getTargetConstant(0, MVT::i1); + TFE = CurDAG->getTargetConstant(0, MVT::i1); + + Idxen = CurDAG->getTargetConstant(0, MVT::i1); + Offen = CurDAG->getTargetConstant(0, MVT::i1); + Addr64 = CurDAG->getTargetConstant(0, MVT::i1); + SOffset = CurDAG->getTargetConstant(0, MVT::i32); + if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - if (isUInt<12>(C1->getZExtValue())) { + if (isLegalMUBUFImmOffset(C1)) { if (N0.getOpcode() == ISD::ADD) { - // (add (add N2, N3), C1) + // (add (add N2, N3), C1) -> addr64 SDValue N2 = N0.getOperand(0); SDValue N3 = N0.getOperand(1); - Ptr = wrapAddr64Rsrc(CurDAG, DL, N2); - Offset = N3; - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); - return true; + Addr64 = CurDAG->getTargetConstant(1, MVT::i1); + Ptr = N2; + VAddr = N3; + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return; } - // (add N0, C1) - Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64));; - Offset = N0; - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); - return true; + 
// (add N0, C1) -> offset + VAddr = CurDAG->getTargetConstant(0, MVT::i32); + Ptr = N0; + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return; } } if (Addr.getOpcode() == ISD::ADD) { - // (add N0, N1) + // (add N0, N1) -> addr64 SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); - Ptr = wrapAddr64Rsrc(CurDAG, DL, N0); - Offset = N1; - ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); + Addr64 = CurDAG->getTargetConstant(1, MVT::i1); + Ptr = N0; + VAddr = N1; + Offset = CurDAG->getTargetConstant(0, MVT::i16); + return; + } + + // default case -> offset + VAddr = CurDAG->getTargetConstant(0, MVT::i32); + Ptr = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i16); + +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, + SDValue &Offset) const { + SDValue Ptr, SOffset, Offen, Idxen, Addr64, GLC, SLC, TFE; + + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE); + + ConstantSDNode *C = cast<ConstantSDNode>(Addr64); + if (C->getSExtValue()) { + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0); return true; } - // default case - Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64)); - Offset = Addr; + return false; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &Offset, + SDValue &SLC) const { + SLC = CurDAG->getTargetConstant(0, MVT::i1); + + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, Offset); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &ImmOffset) const { + + SDLoc DL(Addr); + MachineFunction &MF = CurDAG->getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + unsigned ScratchPtrReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); + unsigned ScratchOffsetReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, + ScratchOffsetReg, MVT::i32); + + SDValue ScratchPtr = + CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, + MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64); + Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); + SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, + MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); + + // (add n0, c1) + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + + if (isLegalMUBUFImmOffset(C1)) { + VAddr = Addr.getOperand(0); + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return true; + } + } + + // (add FI, n0) + if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + isa<FrameIndexSDNode>(Addr.getOperand(0))) { + VAddr = Addr.getOperand(1); + ImmOffset = Addr.getOperand(0); + return true; + } + + // (FI) + if (isa<FrameIndexSDNode>(Addr)) { + VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, + CurDAG->getConstant(0, MVT::i32)), 0); + ImmOffset = Addr; + return true; + } + + // (node) + VAddr = Addr; ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); return true; } +bool 
AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &SOffset, SDValue &Offset, + SDValue &GLC, SDValue &SLC, + SDValue &TFE) const { + SDValue Ptr, VAddr, Offen, Idxen, Addr64; + + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE); + + if (!cast<ConstantSDNode>(Offen)->getSExtValue() && + !cast<ConstantSDNode>(Idxen)->getSExtValue() && + !cast<ConstantSDNode>(Addr64)->getSExtValue()) { + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | + APInt::getAllOnesValue(32).getZExtValue(); // Size + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &Soffset, SDValue &Offset, + SDValue &GLC) const { + SDValue SLC, TFE; + + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); +} + +// FIXME: This is incorrect and only enough to be able to compile. +SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { + AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N); + SDLoc DL(N); + + assert(Subtarget.hasFlatAddressSpace() && + "addrspacecast only supported with flat address space!"); + + assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && + ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && + "Cannot cast address space to / from constant address!"); + + assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && + "Can only cast to / from flat address space!"); + + // The flat instructions read the address as the index of the VGPR holding the + // address, so casting should just be reinterpreting the base VGPR, so just + // insert trunc / bitcast / zext. 
+ + SDValue Src = ASC->getOperand(0); + EVT DestVT = ASC->getValueType(0); + EVT SrcVT = Src.getValueType(); + + unsigned SrcSize = SrcVT.getSizeInBits(); + unsigned DestSize = DestVT.getSizeInBits(); + + if (SrcSize > DestSize) { + assert(SrcSize == 64 && DestSize == 32); + return CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, + DL, + DestVT, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32)); + } + + + if (DestSize > SrcSize) { + assert(SrcSize == 32 && DestSize == 64); + + SDValue RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32); + + const SDValue Ops[] = { + RC, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32), + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32, + CurDAG->getConstant(0, MVT::i32)), 0), + CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + SDLoc(N), N->getValueType(0), Ops); + } + + assert(SrcSize == 64 && DestSize == 64); + return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + + unsigned Mods = 0; + + Src = In; + + if (Src.getOpcode() == ISD::FNEG) { + Mods |= SISrcMods::NEG; + Src = Src.getOperand(0); + } + + if (Src.getOpcode() == ISD::FABS) { + Mods |= SISrcMods::ABS; + Src = Src.getOperand(0); + } + + SrcMods = CurDAG->getTargetConstant(Mods, MVT::i32); + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, + SDValue &SrcMods, SDValue &Clamp, + SDValue &Omod) const { + // FIXME: Handle Clamp and Omod + Clamp = CurDAG->getTargetConstant(0, MVT::i32); + Omod = CurDAG->getTargetConstant(0, MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Omod) const { + // FIXME: Handle Omod + Omod = CurDAG->getTargetConstant(0, MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 0ada7a3..2f95b74 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -21,7 +21,6 @@ #include "AMDGPUSubtarget.h" #include "R600MachineFunctionInfo.h" #include "SIMachineFunctionInfo.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -104,7 +103,7 @@ EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { } AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : - TargetLowering(TM, new TargetLoweringObjectFileELF()) { + TargetLowering(TM) { Subtarget = &TM.getSubtarget<AMDGPUSubtarget>(); @@ -131,6 +130,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FREM, MVT::f32, Custom); + setOperationAction(ISD::FREM, MVT::f64, Custom); + // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. 
setOperationAction(ISD::STORE, MVT::f32, Promote); @@ -242,6 +244,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); } + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { setOperationAction(ISD::SREM, VT, Expand); @@ -271,15 +279,23 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::ROTL, MVT::i64, Expand); setOperationAction(ISD::ROTR, MVT::i64, Expand); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::MULHU, MVT::i64, Expand); setOperationAction(ISD::MULHS, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + if (!Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + + if (!Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; @@ -300,7 +316,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::SUB, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); - // TODO: Implement custom UREM / SREM routines. setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); @@ -332,12 +347,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : for (MVT VT : FloatVectorTypes) { setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FMINNUM, VT, Expand); + setOperationAction(ISD::FMAXNUM, VT, Expand); setOperationAction(ISD::FADD, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); @@ -360,21 +378,25 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::STORE); setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); + // SI at least has hardware support for floating point exceptions, but no way + // of using or handling them is implemented. They are also optional in OpenCL + // (Section 7.3) + setHasFloatingPointExceptions(false); + setSelectIsExpensive(false); PredictableSelectIsExpensive = false; // There are no integer divide instructions, and these expand to a pretty // large sequence of instructions. setIntDivIsCheap(false); - setPow2DivIsCheap(false); - - // TODO: Investigate this when 64-bit divides are implemented. 
- addBypassSlowDiv(64, 32); + setPow2SDivIsCheap(false); // FIXME: Need to really handle these. MaxStoresPerMemcpy = 4096; @@ -426,12 +448,12 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32; + return VT == MVT::f32 || VT == MVT::f64; } bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32; + return VT == MVT::f32 || VT == MVT::f64; } bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { @@ -531,16 +553,18 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::SDIV: return LowerSDIV(Op, DAG); - case ISD::SREM: return LowerSREM(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); + case ISD::FREM: return LowerFREM(Op, DAG); case ISD::FCEIL: return LowerFCEIL(Op, DAG); case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); case ISD::FRINT: return LowerFRINT(Op, DAG); case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); + case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); } return Op; } @@ -595,7 +619,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, const SDValue &InitPtr, SDValue Chain, SelectionDAG &DAG) const { - const DataLayout *TD = getTargetMachine().getDataLayout(); + const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout(); SDLoc DL(InitPtr); Type *InitTy = Init->getType(); @@ -668,22 +692,35 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, llvm_unreachable("Unhandled constant initializer"); } +static bool hasDefinedInitializer(const GlobalValue *GV) { + const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); + if (!GVar || !GVar->hasInitializer()) + return false; + + if (isa<UndefValue>(GVar->getInitializer())) + return false; + + return true; +} + SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, SDValue Op, SelectionDAG &DAG) const { - const DataLayout *TD = getTargetMachine().getDataLayout(); + const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout(); GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = G->getGlobal(); switch (G->getAddressSpace()) { - default: llvm_unreachable("Global Address lowering not implemented for this " - "address space"); case AMDGPUAS::LOCAL_ADDRESS: { // XXX: What does the value of G->getOffset() mean? assert(G->getOffset() == 0 && "Do not know what to do with an non-zero offset"); + // TODO: We could emit code to handle the initialization somewhere. 
+ if (hasDefinedInitializer(GV)) + break; + unsigned Offset; if (MFI->LocalMemoryObjects.count(GV) == 0) { uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); @@ -695,7 +732,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, Offset = MFI->LocalMemoryObjects[GV]; } - return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace())); + return DAG.getConstant(Offset, getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); } case AMDGPUAS::CONSTANT_ADDRESS: { MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); @@ -737,6 +774,12 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); } } + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + DiagnosticInfoUnsupported BadInit(Fn, + "initializer for address space"); + DAG.getContext()->diagnose(BadInit); + return SDValue(); } SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, @@ -767,8 +810,8 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering()); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( + getTargetMachine().getSubtargetImpl()->getFrameLowering()); FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); @@ -810,13 +853,21 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // first parameter must be the same as the first instruction. SDValue Numerator = Op.getOperand(1); SDValue Denominator = Op.getOperand(2); + + // Note this order is opposite of the machine instruction's operations, + // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The + // intrinsic has the numerator as the first operand to match a normal + // division operation. + SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; - return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT, - Src0, Denominator, Numerator); + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, + Denominator, Numerator); } case Intrinsic::AMDGPU_div_fmas: + // FIXME: Dropping bool parameter. Work is needed to support the implicit + // read from VCC. 
return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -840,6 +891,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::AMDGPU_rsq_clamped: return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + case Intrinsic::AMDGPU_ldexp: + return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_imax: return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -945,21 +1000,16 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, } /// \brief Generate Min/Max node -SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, - SelectionDAG &DAG) const { - SDLoc DL(N); - EVT VT = N->getValueType(0); - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - SDValue True = N->getOperand(2); - SDValue False = N->getOperand(3); - SDValue CC = N->getOperand(4); - - if (VT != MVT::f32 || - !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { +SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const { + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) return SDValue(); - } ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); switch (CCOpcode) { @@ -975,15 +1025,19 @@ SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, case ISD::SETTRUE2: case ISD::SETUO: case ISD::SETO: - llvm_unreachable("Operation should already be optimised!"); + break; case ISD::SETULE: case ISD::SETULT: case ISD::SETOLE: case ISD::SETOLT: case ISD::SETLE: case ISD::SETLT: { - unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); + // We need to permute the operands to get the correct NaN behavior. The + // selected operand is the second one based on the failing compare with NaN, + // so permute it based on the compare type the hardware uses. + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); } case ISD::SETGT: case ISD::SETGE: @@ -991,8 +1045,9 @@ SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, case ISD::SETOGE: case ISD::SETUGT: case ISD::SETOGT: { - unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); } case ISD::SETCC_INVALID: llvm_unreachable("Invalid setcc condcode!"); @@ -1000,12 +1055,53 @@ SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, return SDValue(); } -SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, - SelectionDAG &DAG) const { - LoadSDNode *Load = dyn_cast<LoadSDNode>(Op); - EVT MemEltVT = Load->getMemoryVT().getVectorElementType(); +/// \brief Generate Min/Max node +SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const { + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + switch (CCOpcode) { + case ISD::SETULE: + case ISD::SETULT: { + unsigned Opc = (LHS == True) ? 
AMDGPUISD::UMIN : AMDGPUISD::UMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETLE: + case ISD::SETLT: { + unsigned Opc = (LHS == True) ? AMDGPUISD::SMIN : AMDGPUISD::SMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: { + unsigned Opc = (LHS == True) ? AMDGPUISD::SMAX : AMDGPUISD::SMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETUGE: + case ISD::SETUGT: { + unsigned Opc = (LHS == True) ? AMDGPUISD::UMAX : AMDGPUISD::UMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + default: + return SDValue(); + } +} + +SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, + SelectionDAG &DAG) const { + LoadSDNode *Load = cast<LoadSDNode>(Op); + EVT MemVT = Load->getMemoryVT(); + EVT MemEltVT = MemVT.getVectorElementType(); + EVT LoadVT = Op.getValueType(); - EVT EltVT = Op.getValueType().getVectorElementType(); + EVT EltVT = LoadVT.getVectorElementType(); EVT PtrVT = Load->getBasePtr().getValueType(); unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); @@ -1013,17 +1109,19 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, SmallVector<SDValue, 8> Chains; SDLoc SL(Op); + unsigned MemEltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); - for (unsigned i = 0, e = NumElts; i != e; ++i) { + for (unsigned i = 0; i < NumElts; ++i) { SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), - DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT)); + DAG.getConstant(i * MemEltSize, PtrVT)); SDValue NewLoad = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, Load->getChain(), Ptr, - MachinePointerInfo(Load->getMemOperand()->getValue()), + SrcValue.getWithOffset(i * MemEltSize), MemEltVT, Load->isVolatile(), Load->isNonTemporal(), - Load->getAlignment()); + Load->isInvariant(), Load->getAlignment()); Loads.push_back(NewLoad.getValue(0)); Chains.push_back(NewLoad.getValue(1)); } @@ -1036,6 +1134,55 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, return DAG.getMergeValues(Ops, SL); } +SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. 
+ if (VT.getVectorNumElements() == 2) + return ScalarizeVectorLoad(Op, DAG); + + LoadSDNode *Load = cast<LoadSDNode>(Op); + SDValue BasePtr = Load->getBasePtr(); + EVT PtrVT = BasePtr.getValueType(); + EVT MemVT = Load->getMemoryVT(); + SDLoc SL(Op); + MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + SDValue LoLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, + Load->getChain(), BasePtr, + SrcValue, + LoMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(LoMemVT.getStoreSize(), PtrVT)); + + SDValue HiLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, + Load->getChain(), HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + + SDValue Ops[] = { + DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + LoLoad.getValue(1), HiLoad.getValue(1)) + }; + + return DAG.getMergeValues(Ops, SL); +} + SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast<StoreSDNode>(Op); @@ -1094,8 +1241,8 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, Store->getAlignment()); } -SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, - SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, + SelectionDAG &DAG) const { StoreSDNode *Store = cast<StoreSDNode>(Op); EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); EVT EltVT = Store->getValue().getValueType().getVectorElementType(); @@ -1105,21 +1252,77 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SmallVector<SDValue, 8> Chains; + unsigned EltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, - Store->getValue(), DAG.getConstant(i, MVT::i32)); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, - Store->getBasePtr(), - DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), - PtrVT)); - Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, - MachinePointerInfo(Store->getMemOperand()->getValue()), - MemEltVT, Store->isVolatile(), Store->isNonTemporal(), - Store->getAlignment())); + Store->getValue(), + DAG.getConstant(i, MVT::i32)); + + SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset); + SDValue NewStore = + DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, + SrcValue.getWithOffset(i * EltSize), + MemEltVT, Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + Chains.push_back(NewStore); } + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); } +SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + SDValue Val = Store->getValue(); + EVT VT = Val.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. 
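For reference, a small standalone sketch (my own helpers, not part of the patch) of the offset arithmetic these scalarize/split helpers use: ScalarizeVectorLoad/ScalarizeVectorStore place element i at Base + i * EltStoreSize, while SplitVectorLoad/SplitVectorStore keep the low half at the original pointer and place the high half at Base + LoMemVT.getStoreSize().

#include <cstdio>

// Per-element accesses: element i lives at byte offset i * EltSize
// (mirrors ScalarizeVectorLoad / ScalarizeVectorStore).
static void scalarizePieces(unsigned NumElts, unsigned EltSize) {
  for (unsigned i = 0; i != NumElts; ++i)
    std::printf("  scalar piece at byte offset %u\n", i * EltSize);
}

// Half-vector accesses: the low half stays at the base pointer and the
// high half is offset by the low half's store size (mirrors
// SplitVectorLoad / SplitVectorStore); the halves may be split or
// scalarized again by a later legalization round if they are still illegal.
static void splitPieces(unsigned NumElts, unsigned EltSize) {
  unsigned LoElts = NumElts / 2;
  std::printf("  lo half: %u elts at byte offset 0\n", LoElts);
  std::printf("  hi half: %u elts at byte offset %u\n",
              NumElts - LoElts, LoElts * EltSize);
}

int main() {
  std::printf("v2i32, scalarized:\n");
  scalarizePieces(2, 4);   // offsets 0 and 4
  std::printf("v8i32, split:\n");
  splitPieces(8, 4);       // a v4i32 at offset 0 and a v4i32 at offset 16
  return 0;
}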
+ if (VT.getVectorNumElements() == 2) + return ScalarizeVectorStore(Op, DAG); + + EVT MemVT = Store->getMemoryVT(); + SDValue Chain = Store->getChain(); + SDValue BasePtr = Store->getBasePtr(); + SDLoc SL(Op); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); + + EVT PtrVT = BasePtr.getValueType(); + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(LoMemVT.getStoreSize(), PtrVT)); + + MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + SDValue LoStore + = DAG.getTruncStore(Chain, SL, Lo, + BasePtr, + SrcValue, + LoMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + Store->getAlignment()); + SDValue HiStore + = DAG.getTruncStore(Chain, SL, Hi, + HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + Store->getAlignment()); + + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); +} + + SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *Load = cast<LoadSDNode>(Op); @@ -1165,22 +1368,8 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(Ops, DL); } - // Lower loads constant address space global variable loads - if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa<GlobalVariable>( - GetUnderlyingObject(Load->getMemOperand()->getValue()))) { - - - SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL, - getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); - Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(2, MVT::i32)); - return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), - Load->getChain(), Ptr, - DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2)); - } - - if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS || + Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) return SDValue(); @@ -1231,7 +1420,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && Store->getValue().getValueType().isVector()) { - return SplitVectorStore(Op, DAG); + return ScalarizeVectorStore(Op, DAG); } EVT MemVT = Store->getMemoryVT(); @@ -1276,249 +1465,179 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { +// This is a shortcut for integer division because we have fast i32<->f32 +// conversions, and fast f32 reciprocal instructions. The fractional part of a +// float is enough to accurately represent up to a 24-bit integer. 
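To make the comment above concrete, here is a standalone scalar model of the unsigned LowerDIVREM24 path (the helper and test values are mine, not part of the patch). A correctly rounded 1.0f / b stands in for the hardware RCP node, which is only accurate to about 1 ulp, and the compare-and-bump mirrors the jq correction in the DAG code; the signed path additionally computes jq = ((a ^ b) >> 30) | 1 for i32 (arithmetic shift), i.e. plus or minus one with the sign of the quotient.

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Valid when both operands fit in 24 bits, because an f32 mantissa holds
// such integers exactly.
static void udivrem24(uint32_t a, uint32_t b, uint32_t &q, uint32_t &r) {
  float fa = (float)a;                           // UINT_TO_FP
  float fb = (float)b;                           // UINT_TO_FP
  float fq = std::trunc(fa * (1.0f / fb));       // FMUL by RCP, then FTRUNC
  float fr = std::fabs(fa - fq * fb);            // FNEG/FMUL/FADD/FABS error term
  uint32_t jq = (fr >= std::fabs(fb)) ? 1u : 0u; // SETOGE selects the +1 bump
  q = (uint32_t)fq + jq;                         // FP_TO_UINT + ADD
  r = a - q * b;                                 // remainder recomputed from q
}

int main() {
  const uint32_t tests[][2] = {{100, 7}, {999999, 1000}, {1u << 23, 3}};
  for (const auto &t : tests) {
    uint32_t q, r;
    udivrem24(t[0], t[1], q, r);
    assert(q == t[0] / t[1] && r == t[0] % t[1]);
    std::printf("%u / %u = %u rem %u\n", t[0], t[1], q, r);
  }
  return 0;
}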
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { SDLoc DL(Op); - EVT OVT = Op.getValueType(); + EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - MVT INTTY; - MVT FLTTY; - if (!OVT.isVector()) { - INTTY = MVT::i32; - FLTTY = MVT::f32; - } else if (OVT.getVectorNumElements() == 2) { - INTTY = MVT::v2i32; - FLTTY = MVT::v2f32; - } else if (OVT.getVectorNumElements() == 4) { - INTTY = MVT::v4i32; - FLTTY = MVT::v4f32; - } - unsigned bitsize = OVT.getScalarType().getSizeInBits(); - // char|short jq = ia ^ ib; - SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); - - // jq = jq >> (bitsize - 2) - jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); - - // jq = jq | 0x1 - jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); - - // jq = (int)jq - jq = DAG.getSExtOrTrunc(jq, DL, INTTY); + MVT IntVT = MVT::i32; + MVT FltVT = MVT::f32; + + ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; + ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; + + if (VT.isVector()) { + unsigned NElts = VT.getVectorNumElements(); + IntVT = MVT::getVectorVT(MVT::i32, NElts); + FltVT = MVT::getVectorVT(MVT::f32, NElts); + } + + unsigned BitSize = VT.getScalarType().getSizeInBits(); + + SDValue jq = DAG.getConstant(1, IntVT); + + if (sign) { + // char|short jq = ia ^ ib; + jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, IntVT); + } // int ia = (int)LHS; - SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); + SDValue ia = sign ? + DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); // int ib, (int)RHS; - SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); + SDValue ib = sign ? 
+ DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); // float fa = (float)ia; - SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); + SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); // float fb = (float)ib; - SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); + SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); // float fq = native_divide(fa, fb); - SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY, - fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb)); + SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, + fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); // fq = trunc(fq); - fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); // float fqneg = -fq; - SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, - DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, + DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); // int iq = (int)fq; - SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); + SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); // fr = fabs(fr); - fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); + fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); // fb = fabs(fb); - fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); + fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); - // int cv = fr >= fb; - SDValue cv; - if (INTTY == MVT::i32) { - cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); - } else { - cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); - } - // jq = (cv ? jq : 0); - jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, - DAG.getConstant(0, OVT)); - // dst = iq + jq; - iq = DAG.getSExtOrTrunc(iq, DL, OVT); - iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); - return iq; -} - -SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - // The LowerSDIV32 function generates equivalent to the following IL. - // mov r0, LHS - // mov r1, RHS - // ilt r10, r0, 0 - // ilt r11, r1, 0 - // iadd r0, r0, r10 - // iadd r1, r1, r11 - // ixor r0, r0, r10 - // ixor r1, r1, r11 - // udiv r0, r0, r1 - // ixor r10, r10, r11 - // iadd r0, r0, r10 - // ixor DST, r0, r10 - - // mov r0, LHS - SDValue r0 = LHS; - - // mov r1, RHS - SDValue r1 = RHS; - - // ilt r10, r0, 0 - SDValue r10 = DAG.getSelectCC(DL, - r0, DAG.getConstant(0, OVT), - DAG.getConstant(-1, OVT), - DAG.getConstant(0, OVT), - ISD::SETLT); + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT); - // ilt r11, r1, 0 - SDValue r11 = DAG.getSelectCC(DL, - r1, DAG.getConstant(0, OVT), - DAG.getConstant(-1, OVT), - DAG.getConstant(0, OVT), - ISD::SETLT); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // iadd r1, r1, r11 - r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); - - // ixor r0, r0, r10 - r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - - // ixor r1, r1, r11 - r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + // int cv = fr >= fb; + SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); - // udiv r0, r0, r1 - r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT)); - // ixor r10, r10, r11 - r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); + // dst = trunc/extend to legal type + iq = sign ? 
DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + // dst = iq + jq; + SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); - // ixor DST, r0, r10 - SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - return DST; -} + // Rem needs compensation, it's easier to recompute it + SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); + Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); -SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(Op.getNode(), 0); + SDValue Res[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Res, DL); } -SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { - EVT OVT = Op.getValueType().getScalarType(); - - if (OVT == MVT::i64) - return LowerSDIV64(Op, DAG); - - if (OVT.getScalarType() == MVT::i32) - return LowerSDIV32(Op, DAG); - - if (OVT == MVT::i16 || OVT == MVT::i8) { - // FIXME: We should be checking for the masked bits. This isn't reached - // because i8 and i16 are not legal types. - return LowerSDIV24(Op, DAG); - } +void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results) const { + assert(Op.getValueType() == MVT::i64); - return SDValue(Op.getNode(), 0); -} - -SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - // The LowerSREM32 function generates equivalent to the following IL. - // mov r0, LHS - // mov r1, RHS - // ilt r10, r0, 0 - // ilt r11, r1, 0 - // iadd r0, r0, r10 - // iadd r1, r1, r11 - // ixor r0, r0, r10 - // ixor r1, r1, r11 - // udiv r20, r0, r1 - // umul r20, r20, r1 - // sub r0, r0, r20 - // iadd r0, r0, r10 - // ixor DST, r0, r10 + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - // mov r0, LHS - SDValue r0 = LHS; + SDValue one = DAG.getConstant(1, HalfVT); + SDValue zero = DAG.getConstant(0, HalfVT); - // mov r1, RHS - SDValue r1 = RHS; + //HiLo split + SDValue LHS = Op.getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); - // ilt r10, r0, 0 - SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT); + SDValue RHS = Op.getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); - // ilt r11, r1, 0 - SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT); + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + SDValue REM_Hi = zero; + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - // iadd r1, r1, r11 - r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; - // ixor r0, r0, r10 - r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + const unsigned halfBitWidth = HalfVT.getSizeInBits(); - // ixor r1, r1, r11 - r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + for (unsigned i = 0; i < halfBitWidth; ++i) { + SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); + // Get 
Value of high bit + SDValue HBit; + if (halfBitWidth == 32 && Subtarget->hasBFE()) { + HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); + } else { + HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + } - // udiv r20, r0, r1 - SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1); + SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, + DAG.getConstant(halfBitWidth - 1, HalfVT)); + REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); + REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); - // umul r20, r20, r1 - r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1); + REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); + REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); - // sub r0, r0, r20 - r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - // ixor DST, r0, r10 - SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - return DST; -} + SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); -SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(Op.getNode(), 0); -} + DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); -SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const { - EVT OVT = Op.getValueType(); + // Update REM - if (OVT.getScalarType() == MVT::i64) - return LowerSREM64(Op, DAG); + SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); - if (OVT.getScalarType() == MVT::i32) - return LowerSREM32(Op, DAG); + REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); + REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); + REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); + } - return SDValue(Op.getNode(), 0); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + Results.push_back(DIV); + Results.push_back(REM); } SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, @@ -1526,15 +1645,31 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SDLoc DL(Op); EVT VT = Op.getValueType(); + if (VT == MVT::i64) { + SmallVector<SDValue, 2> Results; + LowerUDIVREM64(Op, DAG, Results); + return DAG.getMergeValues(Results, DL); + } + SDValue Num = Op.getOperand(0); SDValue Den = Op.getOperand(1); + if (VT == MVT::i32) { + if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) && + DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) { + // TODO: We technically could do this for i64, but shouldn't that just be + // handled by something generally reducing 64-bit division on 32-bit + // values to 32-bit? + return LowerDIVREM24(Op, DAG, false); + } + } + // RCP = URECIP(Den) = 2^32 / Den + e // e is rounding error. 
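The unrolled loop in LowerUDIVREM64 above is ordinary restoring (shift-subtract) division: only the low 32 numerator bits are iterated because the high half is folded in up front, either through the speculative 32-bit UDIV/UREM when the divisor's high half is zero or as the initial partial remainder otherwise. Below is a compact scalar model of the same idea (helper name is mine, not part of the patch); the i32 path that follows instead starts from the URECIP estimate of 2^32 / Den and corrects the rounding error explicitly.

#include <cassert>
#include <cstdint>

// Restoring division: shift one numerator bit at a time into the partial
// remainder and subtract the divisor whenever the remainder reaches it
// (the SETUGE select in the DAG loop above).
static void udivrem64(uint64_t n, uint64_t d, uint64_t &q, uint64_t &r) {
  assert(d != 0 && "division by zero");
  q = 0;
  r = 0;
  for (int i = 63; i >= 0; --i) {
    r = (r << 1) | ((n >> i) & 1);  // bring in the next numerator bit
    if (r >= d) {                   // REM >= RHS: subtract and set DIV bit
      r -= d;
      q |= UINT64_C(1) << i;
    }
  }
}

int main() {
  uint64_t q, r;
  udivrem64(0x123456789abcdef0ULL, 0x12345ULL, q, r);
  assert(q == 0x123456789abcdef0ULL / 0x12345ULL);
  assert(r == 0x123456789abcdef0ULL % 0x12345ULL);
  return 0;
}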
SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); - // RCP_LO = umulo(RCP, Den) */ - SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den); + // RCP_LO = mul(RCP, Den) */ + SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); // RCP_HI = mulhu (RCP, Den) */ SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); @@ -1565,7 +1700,7 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); // Num_S_Remainder = Quotient * Den - SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den); + SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); // Remainder = Num - Num_S_Remainder SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); @@ -1630,12 +1765,22 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, SDLoc DL(Op); EVT VT = Op.getValueType(); - SDValue Zero = DAG.getConstant(0, VT); - SDValue NegOne = DAG.getConstant(-1, VT); - SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); + if (VT == MVT::i32) { + if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 && + DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) { + // TODO: We technically could do this for i64, but shouldn't that just be + // handled by something generally reducing 64-bit division on 32-bit + // values to 32-bit? + return LowerDIVREM24(Op, DAG, true); + } + } + + SDValue Zero = DAG.getConstant(0, VT); + SDValue NegOne = DAG.getConstant(-1, VT); + SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); @@ -1663,6 +1808,20 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, return DAG.getMergeValues(Res, DL); } +// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) +SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT VT = Op.getValueType(); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); + SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); + + return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); +} + SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -1705,7 +1864,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { const unsigned ExpBits = 11; // Extract the exponent. - SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32, + SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, Hi, DAG.getConstant(FractBits - 32, MVT::i32), DAG.getConstant(ExpBits, MVT::i32)); @@ -1796,13 +1955,43 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } +SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(0, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(1, MVT::i32)); + + SDValue CvtHi = DAG.getNode(Signed ? 
ISD::SINT_TO_FP : ISD::UINT_TO_FP, + SL, MVT::f64, Hi); + + SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); + + SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, + DAG.getConstant(32, MVT::i32)); + + return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); +} + SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue S0 = Op.getOperand(0); - SDLoc DL(Op); - if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64) + if (S0.getValueType() != MVT::i64) return SDValue(); + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, false); + + assert(DestVT == MVT::f32); + + SDLoc DL(Op); + // f32 uint_to_fp i64 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, DAG.getConstant(0, MVT::i32)); @@ -1815,16 +2004,62 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); } -SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op, - unsigned BitsDiff, - SelectionDAG &DAG) const { - MVT VT = Op.getSimpleValueType(); - SDLoc DL(Op); - SDValue Shift = DAG.getConstant(BitsDiff, VT); - // Shift left by 'Shift' bits. - SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift); - // Signed shift Right by 'Shift' bits. - return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift); +SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + + SDValue Src = Op.getOperand(0); + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + SDValue K0 + = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), MVT::f64); + SDValue K1 + = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), MVT::f64); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); + + SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); + + + SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); + + SDValue Hi = DAG.getNode(Signed ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, + MVT::i32, FloorMul); + SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); + + SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); + + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, false); + + return SDValue(); } SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, @@ -1890,13 +2125,64 @@ template <typename IntTy> static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width) { if (Width + Offset < 32) { - IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width); + uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); + IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); return DAG.getConstant(Result, MVT::i32); } return DAG.getConstant(Src0 >> Offset, MVT::i32); } +static bool usesAllNormalStores(SDNode *LoadVal) { + for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { + if (!ISD::isNormalStore(*I)) + return false; + } + + return true; +} + +// If we have a copy of an illegal type, replace it with a load / store of an +// equivalently sized legal type. This avoids intermediate bit pack / unpack +// instructions emitted when handling extloads and truncstores. Ideally we could +// recognize the pack / unpack pattern to eliminate it. 
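The two magic constants in LowerFP64_TO_INT above are simply the bit patterns of 2^-32 (0x3df0000000000000) and -2^32 (0xc1f0000000000000): the value is split into 32-bit halves using only f64 arithmetic, and LowerINT_TO_FP64 is the reverse trip via LDEXP. A standalone scalar sketch of both directions follows (function names are mine, unsigned variants shown; the signed forms differ only in how the high half is converted, and hexadecimal float literals stand in for the K0/K1 constants).

#include <cassert>
#include <cmath>
#include <cstdint>

// f64 -> u64, mirroring LowerFP64_TO_INT: trunc, scale by 2^-32 and floor
// to get the high word, then recover the low word with an FMA against -2^32.
static uint64_t fp64ToU64(double x) {
  double t  = std::trunc(x);               // FTRUNC
  double hi = std::floor(t * 0x1p-32);     // FMUL by K0 (2^-32), then FFLOOR
  double lo = std::fma(hi, -0x1p32, t);    // FMA with K1 (-2^32): t - hi*2^32
  return ((uint64_t)(uint32_t)hi << 32) |  // FP_TO_UINT (FP_TO_SINT if signed)
         (uint32_t)lo;                     // FP_TO_UINT, then BUILD_PAIR
}

// u64 -> f64, mirroring LowerINT_TO_FP64: convert each half and recombine
// with ldexp(hi, 32) + lo.
static double u64ToFP64(uint64_t x) {
  double hi = (double)(uint32_t)(x >> 32); // UINT_TO_FP (SINT_TO_FP if signed)
  double lo = (double)(uint32_t)x;         // UINT_TO_FP
  return std::ldexp(hi, 32) + lo;          // LDEXP + FADD
}

int main() {
  const uint64_t v = 0x0000123456789000ULL; // exactly representable in f64
  assert(fp64ToU64((double)v) == v);
  assert(u64ToFP64(v) == (double)v);
  return 0;
}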
+SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + StoreSDNode *SN = cast<StoreSDNode>(N); + SDValue Value = SN->getValue(); + EVT VT = Value.getValueType(); + + if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode())) + return SDValue(); + + LoadSDNode *LoadVal = cast<LoadSDNode>(Value); + if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) + return SDValue(); + + EVT MemVT = LoadVal->getMemoryVT(); + + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); + + SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, + LoadVT, SL, + LoadVal->getChain(), + LoadVal->getBasePtr(), + LoadVal->getOffset(), + LoadVT, + LoadVal->getMemOperand()); + + SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); + DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); + + return DAG.getStore(SN->getChain(), SL, NewLoad, + SN->getBasePtr(), SN->getMemOperand()); +} + SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); @@ -1929,7 +2215,7 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, } SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); @@ -1945,9 +2231,51 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, simplifyI24(N1, DCI); return SDValue(); } - case ISD::SELECT_CC: { - return CombineMinMax(N, DAG); + case ISD::SELECT_CC: { + SDLoc DL(N); + EVT VT = N->getValueType(0); + + if (VT == MVT::f32 || + (VT == MVT::f64 && + Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + SDValue CC = N->getOperand(4); + + return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); + } + + break; + } + case ISD::SELECT: { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() == ISD::SETCC) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + if (VT == MVT::f32 || + (VT == MVT::f64 && + Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) { + return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); + } + + // TODO: Implement min / max Evergreen instructions. 
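For the integer case the combine recognizes the usual select-of-its-own-compare idiom; a minimal scalar restatement of the unsigned mapping (helper names are mine, not part of the patch):

#include <cassert>
#include <cstdint>

// select (setult a, b), a, b  ->  umin a, b   (LHS == True in CombineIMinMax)
static uint32_t uminViaSelect(uint32_t a, uint32_t b) { return a < b ? a : b; }

// select (setult a, b), b, a  ->  umax a, b   (LHS == False)
static uint32_t umaxViaSelect(uint32_t a, uint32_t b) { return a < b ? b : a; }

// The signed cases map the same way: SETLT/SETLE pick SMIN or SMAX
// depending on which compare operand is the selected true value.

int main() {
  assert(uminViaSelect(3, 5) == 3 && umaxViaSelect(3, 5) == 5);
  return 0;
}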
+ if (VT == MVT::i32 && + Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); + } } + + break; + } case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -1992,41 +2320,47 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); } - if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) { if (Signed) { return constantFoldBFE<int32_t>(DAG, - Val->getSExtValue(), + CVal->getSExtValue(), OffsetVal, WidthVal); } return constantFoldBFE<uint32_t>(DAG, - Val->getZExtValue(), + CVal->getZExtValue(), OffsetVal, WidthVal); } - APInt Demanded = APInt::getBitsSet(32, - OffsetVal, - OffsetVal + WidthVal); - if ((OffsetVal + WidthVal) >= 32) { SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32); return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, BitsFrom, ShiftVal); } - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || - TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) { - DCI.CommitTargetLoweringOpt(TLO); + if (BitsFrom.hasOneUse()) { + APInt Demanded = APInt::getBitsSet(32, + OffsetVal, + OffsetVal + WidthVal); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, + KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } } break; } + + case ISD::STORE: + return performStoreCombine(N, DCI); } return SDValue(); } @@ -2117,12 +2451,19 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(FMAX) + NODE_NAME_CASE(MAD) + NODE_NAME_CASE(FMAX_LEGACY) NODE_NAME_CASE(SMAX) NODE_NAME_CASE(UMAX) - NODE_NAME_CASE(FMIN) + NODE_NAME_CASE(FMIN_LEGACY) NODE_NAME_CASE(SMIN) NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(FMAX3) + NODE_NAME_CASE(SMAX3) + NODE_NAME_CASE(UMAX3) + NODE_NAME_CASE(FMIN3) + NODE_NAME_CASE(SMIN3) + NODE_NAME_CASE(UMIN3) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) @@ -2132,6 +2473,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RSQ) NODE_NAME_CASE(RSQ_LEGACY) NODE_NAME_CASE(RSQ_CLAMPED) + NODE_NAME_CASE(LDEXP) NODE_NAME_CASE(DOT4) NODE_NAME_CASE(BFE_U32) NODE_NAME_CASE(BFE_I32) @@ -2157,6 +2499,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE2) NODE_NAME_CASE(CVT_F32_UBYTE3) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) + NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) } @@ -2225,17 +2568,8 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( unsigned BitWidth = 32; uint32_t Width = CWidth->getZExtValue() & 0x1f; - if (Width == 0) { - KnownZero = APInt::getAllOnesValue(BitWidth); - KnownOne = APInt::getNullValue(BitWidth); - return; - } - // FIXME: This could do a lot more. 
If offset is 0, should be the same as - // sign_extend_inreg implementation, but that involves duplicating it. - if (Opc == AMDGPUISD::BFE_I32) - KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width); - else + if (Opc == AMDGPUISD::BFE_U32) KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); break; diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 98a92ad..36b4ee6 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUISELLOWERING_H -#define AMDGPUISELLOWERING_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H +#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H #include "llvm/Target/TargetLowering.h" @@ -43,48 +43,52 @@ private: /// \brief Split a vector store into multiple scalar stores. /// \returns The resulting chain. - SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; - SDValue ExpandSIGN_EXTEND_INREG(SDValue Op, - unsigned BitsDiff, - SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; protected: static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); - /// \brief Helper function that adds Reg to the LiveIn list of the DAG's - /// MachineFunction. - /// - /// \returns a RegisterSDNode representing Reg. - virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; - SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, - SelectionDAG &DAG) const; - /// \brief Split a vector load into multiple scalar loads. - SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const; + virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const; + + /// \brief Split a vector load into a scalar load of each component. + SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector load into 2 loads of half the vector. 
+ SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into a scalar store of each component. + SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into 2 stores of half the vector. SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; + void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results) const; bool isHWTrueValue(SDValue Op) const; bool isHWFalseValue(SDValue Op) const; @@ -138,7 +142,23 @@ public: SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; - SDValue CombineMinMax(SDNode *N, SelectionDAG &DAG) const; + SDValue CombineFMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const; + SDValue CombineIMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const; + const char* getTargetNodeName(unsigned Opcode) const override; virtual SDNode *PostISelFolding(MachineSDNode *N, @@ -155,10 +175,16 @@ public: const SelectionDAG &DAG, unsigned Depth = 0) const override; - virtual unsigned ComputeNumSignBitsForTargetNode( - SDValue Op, - const SelectionDAG &DAG, - unsigned Depth = 0) const override; + unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + /// \brief Helper function that adds Reg to the LiveIn list of the DAG's + /// MachineFunction. + /// + /// \returns a RegisterSDNode representing Reg. + virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const; }; namespace AMDGPUISD { @@ -174,17 +200,24 @@ enum { DWORDADDR, FRACT, CLAMP, + MAD, // Multiply + add with same result as the separate operations. // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. // Denormals handled on some parts. COS_HW, SIN_HW, - FMAX, + FMAX_LEGACY, SMAX, UMAX, - FMIN, + FMIN_LEGACY, SMIN, UMIN, + FMAX3, + SMAX3, + UMAX3, + FMIN3, + SMIN3, + UMIN3, URECIP, DIV_SCALE, DIV_FMAS, @@ -197,6 +230,7 @@ enum { RSQ, RSQ_LEGACY, RSQ_CLAMPED, + LDEXP, DOT4, BFE_U32, // Extract range of bits with zero extension to 32-bits. BFE_I32, // Extract range of bits with sign extension to 32-bits. @@ -232,6 +266,8 @@ enum { /// T2|v.z| | | | /// T3|v.w| | | | BUILD_VERTICAL_VECTOR, + /// Pointer to the start of the shader's constant data. 
+ CONST_DATA_PTR, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, @@ -244,4 +280,4 @@ enum { } // End namespace llvm -#endif // AMDGPUISELLOWERING_H +#endif diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp index fef5b8c..a8fc614 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -86,21 +86,6 @@ AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // TODO: Implement this function return nullptr; } -bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter, - MachineBasicBlock &MBB) const { - while (iter != MBB.end()) { - switch (iter->getOpcode()) { - default: - break; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: - case AMDGPU::BRANCH: - return true; - }; - ++iter; - } - return false; -} void AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, @@ -147,7 +132,6 @@ bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const } else if (isRegisterStore(*MI)) { int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::val); - AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); @@ -215,15 +199,30 @@ AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, return 0; } -bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, - int64_t Offset1, int64_t Offset2, - unsigned NumLoads) const { - assert(Offset2 > Offset1 - && "Second offset should be larger than first offset!"); - // If we have less than 16 loads in a row, and the offsets are within 16, - // then schedule together. - // TODO: Make the loads schedule near if it fits in a cacheline - return (NumLoads < 16 && (Offset2 - Offset1) < 16); +bool AMDGPUInstrInfo::enableClusterLoads() const { + return true; +} + +// FIXME: This behaves strangely. If, for example, you have 32 load + stores, +// the first 16 loads will be interleaved with the stores, and the next 16 will +// be clustered as expected. It should really split into 2 16 store batches. +// +// Loads are clustered until this returns false, rather than trying to schedule +// groups of stores. This also means we have to deal with saying different +// address space loads should be clustered, and ones which might cause bank +// conflicts. +// +// This might be deprecated so it might not be worth that much effort to fix. +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, + int64_t Offset0, int64_t Offset1, + unsigned NumLoads) const { + assert(Offset1 > Offset0 && + "Second offset should be larger than first offset!"); + // If we have less than 16 loads in a row, and the offsets are within 64 + // bytes, then schedule together. + + // A cacheline is 64 bytes (for global memory). + return (NumLoads <= 16 && (Offset1 - Offset0) < 64); } bool @@ -320,7 +319,10 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = MF.getTarget().getFrameLowering()->getFrameIndexOffset(MF, -1); + Offset = MF.getTarget() + .getSubtargetImpl() + ->getFrameLowering() + ->getFrameIndexOffset(MF, -1); return getIndirectIndexBegin(MF) + Offset; } @@ -335,7 +337,7 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { } // Wrapper for Tablegen'd function. 
enum Subtarget is not defined in any -// header files, so we need to wrap it in a function that takes unsigned +// header files, so we need to wrap it in a function that takes unsigned // instead. namespace llvm { namespace AMDGPU { diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index 95dc8c1..da9833d 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -13,10 +13,9 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUINSTRUCTIONINFO_H -#define AMDGPUINSTRUCTIONINFO_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H -#include "AMDGPUInstrInfo.h" #include "AMDGPURegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include <map> @@ -41,8 +40,6 @@ class MachineInstrBuilder; class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { private: const AMDGPURegisterInfo RI; - bool getNextBranchInstr(MachineBasicBlock::iterator &iter, - MachineBasicBlock &MBB) const; virtual void anchor(); protected: const AMDGPUSubtarget &ST; @@ -74,11 +71,6 @@ public: LiveVariables *LV) const override; - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const = 0; - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, @@ -101,6 +93,7 @@ protected: MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, MachineInstr *LoadMI) const override; +public: /// \returns the smallest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. int getIndirectIndexBegin(const MachineFunction &MF) const; @@ -109,7 +102,6 @@ protected: /// read or write or -1 if indirect addressing is not used by this program. int getIndirectIndexEnd(const MachineFunction &MF) const; -public: bool canFoldMemoryOperand(const MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, @@ -120,6 +112,9 @@ public: unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex = nullptr) const override; + + bool enableClusterLoads() const override; + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; @@ -144,7 +139,6 @@ public: // Pure virtual funtions to be implemented by sub-classes. 
//===---------------------------------------------------------------------===// - virtual unsigned getIEQOpcode() const = 0; virtual bool isMov(unsigned opcode) const = 0; /// \brief Calculate the "Indirect Address" for the given \p RegIndex and @@ -197,4 +191,4 @@ namespace AMDGPU { #define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) #define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) -#endif // AMDGPUINSTRINFO_H +#endif diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 934d59d..4ee0f2b 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -23,6 +23,10 @@ def AMDGPUTrigPreOp : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] >; +def AMDGPULdExpOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + def AMDGPUDivScaleOp : SDTypeProfile<2, 3, [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] >; @@ -34,6 +38,9 @@ def AMDGPUDivScaleOp : SDTypeProfile<2, 3, // This argument to this node is a dword address. def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; +def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; +def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; + // out = a - floor(a) def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; @@ -49,12 +56,18 @@ def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; -// out = max(a, b) a and b are floats -def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, - [SDNPCommutative, SDNPAssociative] +def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; + +// out = max(a, b) a and b are floats, where a nan comparison fails. +// This is not commutative because this gives the second operand: +// x < nan ? x : nan -> nan +// nan < x ? nan : x -> x +def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, + [SDNPAssociative] >; def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; +def AMDGPUmad : SDNode<"AMDGPUISD::MAD", SDTFPTernaryOp, []>; // out = max(a, b) a and b are signed ints def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, @@ -66,12 +79,12 @@ def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, [SDNPCommutative, SDNPAssociative] >; -// out = min(a, b) a and b are floats -def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp, - [SDNPCommutative, SDNPAssociative] +// out = min(a, b) a and b are floats, where a nan comparison fails. +def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, + [SDNPAssociative] >; -// out = min(a, b) a snd b are signed ints +// out = min(a, b) a and b are signed ints def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp, [SDNPCommutative, SDNPAssociative] >; @@ -81,6 +94,37 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp, [SDNPCommutative, SDNPAssociative] >; +// FIXME: TableGen doesn't like commutative instructions with more +// than 2 operands. 
+// out = max(a, b, c) a, b and c are floats +def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b, and c are signed ints +def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b and c are unsigned ints +def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are floats +def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are signed ints +def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b) a and b are unsigned ints +def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", SDTIntToFPOp, []>; @@ -127,7 +171,7 @@ def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", // MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src) // // src0: vec4(src, 0, 0, mask) -// src1: dst - rat offset (aka pointer) in dwords +// src1: dst - rat offset (aka pointer) in dwords def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", SDTypeProfile<0, 2, []>, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index b86b781..c215865 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -23,6 +23,8 @@ class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instructio let Pattern = pattern; let Itinerary = NullALU; + let isCodeGenOnly = 1; + let TSFlags{63} = isRegisterLoad; let TSFlags{62} = isRegisterStore; } @@ -34,9 +36,15 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern> } +def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; +def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; +def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; + def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; +let OperandType = "OPERAND_IMMEDIATE" in { + def u32imm : Operand<i32> { let PrintMethod = "printU32ImmOperand"; } @@ -49,6 +57,8 @@ def u8imm : Operand<i8> { let PrintMethod = "printU8ImmOperand"; } +} // End OperandType = "OPERAND_IMMEDIATE" + //===--------------------------------------------------------------------===// // Custom Operands //===--------------------------------------------------------------------===// @@ -125,13 +135,35 @@ def COND_NE : PatLeaf < def COND_NULL : PatLeaf < (cond), - [{return false;}] + [{(void)N; return false;}] >; //===----------------------------------------------------------------------===// // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// +class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; +}]>; + +class PrivateLoad <SDPatternOperator op> : PrivateMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +class PrivateStore <SDPatternOperator op> : PrivateMemOp < + (ops node:$value, node:$ptr), (op node:$value, node:$ptr) +>; + +def extloadi8_private : PrivateLoad <extloadi8>; +def sextloadi8_private : PrivateLoad <sextloadi8>; 
+def extloadi16_private : PrivateLoad <extloadi16>; +def sextloadi16_private : PrivateLoad <sextloadi16>; +def load_private : PrivateLoad <load>; + +def truncstorei8_private : PrivateStore <truncstorei8>; +def truncstorei16_private : PrivateStore <truncstorei16>; +def store_private : PrivateStore <store>; + def global_store : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ return isGlobalStore(dyn_cast<StoreSDNode>(N)); @@ -165,6 +197,14 @@ def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; +def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); }]>; @@ -193,6 +233,14 @@ def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; +def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); }]>; @@ -218,6 +266,11 @@ def az_extloadi32_global : PatFrag<(ops node:$ptr), return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; +def az_extloadi32_flat : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + def az_extloadi32_constant : PatFrag<(ops node:$ptr), (az_extloadi32 node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); @@ -233,6 +286,16 @@ def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), return isGlobalStore(dyn_cast<StoreSDNode>(N)); }]>; +def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)); +}]>; + def local_store : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ return isLocalStore(dyn_cast<StoreSDNode>(N)); @@ -252,6 +315,17 @@ def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isLocalLoad(dyn_cast<LoadSDNode>(N)); }]>; +class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAlignment() % 8 == 0; +}]>; + +def local_load_aligned8bytes : Aligned8Bytes < + (ops node:$ptr), (local_load node:$ptr) +>; + +def local_store_aligned8bytes : Aligned8Bytes < + (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr) +>; class local_binary_atomic_op<SDNode atomic_op> : PatFrag<(ops node:$ptr, node:$value), @@ -277,6 +351,7 @@ def mskor_global : PatFrag<(ops node:$val, node:$ptr), return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; + def atomic_cmp_swap_32_local : PatFrag<(ops node:$ptr, node:$cmp, node:$swap), (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{ @@ -293,6 +368,45 @@ def atomic_cmp_swap_64_local : AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; +def flat_load : PatFrag<(ops node:$ptr), (load 
node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def flat_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def mskor_flat : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; +}]>; + +class global_binary_atomic_op<SDNode atomic_op> : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] +>; + +def atomic_swap_global : global_binary_atomic_op<atomic_swap>; +def atomic_add_global : global_binary_atomic_op<atomic_load_add>; +def atomic_and_global : global_binary_atomic_op<atomic_load_and>; +def atomic_max_global : global_binary_atomic_op<atomic_load_max>; +def atomic_min_global : global_binary_atomic_op<atomic_load_min>; +def atomic_or_global : global_binary_atomic_op<atomic_load_or>; +def atomic_sub_global : global_binary_atomic_op<atomic_load_sub>; +def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>; +def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>; +def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; + +//===----------------------------------------------------------------------===// +// Misc Pattern Fragments +//===----------------------------------------------------------------------===// + +def fmad : PatFrag < + (ops node:$src0, node:$src1, node:$src2), + (fadd (fmul node:$src0, node:$src1), node:$src2) +>; class Constants { int TWO_PI = 0x40c90fdb; @@ -412,8 +526,9 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat < // BFI_INT patterns -multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> { - +multiclass BFIPatterns <Instruction BFI_INT, + Instruction LoadImm32, + RegisterClass RC64> { // Definition from ISA doc: // (y & x) | (z & ~x) def : Pat < @@ -435,8 +550,8 @@ multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> { def : Pat < (f64 (fcopysign f64:$src0, f64:$src1)), - (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (i32 (EXTRACT_SUBREG $src0, sub0)), sub0), + (REG_SEQUENCE RC64, + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, (BFI_INT (LoadImm32 0x7fffffff), (i32 (EXTRACT_SUBREG $src0, sub1)), (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp index 58916a9..e94bb60 100644 --- a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp +++ b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; #include "AMDGPUGenIntrinsics.inc" #undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN -AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() : TargetIntrinsicInfo() {} std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.h b/lib/Target/R600/AMDGPUIntrinsicInfo.h index 5be68a2..4c95b5e 100644 --- a/lib/Target/R600/AMDGPUIntrinsicInfo.h +++ b/lib/Target/R600/AMDGPUIntrinsicInfo.h @@ -11,8 +11,8 @@ /// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. 
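// Editor's note (not part of this patch): the hunks below drop the
// TargetMachine argument from the AMDGPUIntrinsicInfo constructor; nothing
// from the target machine is needed at construction time, and the target
// machine now simply holds the object by value (see the IntrinsicInfo()
// member initializer later in this diff).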
// //===-----------------------------------------------------------------------===// -#ifndef AMDGPU_INTRINSICINFO_H -#define AMDGPU_INTRINSICINFO_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H #include "llvm/IR/Intrinsics.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -33,7 +33,7 @@ enum ID { class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { public: - AMDGPUIntrinsicInfo(TargetMachine *tm); + AMDGPUIntrinsicInfo(); std::string getName(unsigned IntrId, Type **Tys = nullptr, unsigned numTys = 0) const override; unsigned lookupName(const char *Name, unsigned Len) const override; @@ -45,4 +45,4 @@ public: } // end namespace llvm -#endif // AMDGPU_INTRINSICINFO_H +#endif diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td index d934676..eee9c29 100644 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -13,9 +13,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { - def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp index ac82e88..bca027f 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ b/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -22,7 +22,9 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectStreamer.h" @@ -77,6 +79,20 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { case MachineOperand::MO_MachineBasicBlock: MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(GV->getName())); + MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(Sym, Ctx)); + break; + } + case MachineOperand::MO_TargetIndex: { + assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx); + MCOp = MCOperand::CreateExpr(Expr); + break; + } } OutMI.addOperand(MCOp); } @@ -88,7 +104,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { #ifdef _DEBUG StringRef Err; - if (!TM.getInstrInfo()->verifyInstruction(MI, Err)) { + if (!TM.getSubtargetImpl()->getInstrInfo()->verifyInstruction(MI, Err)) { errs() << "Warning: Illegal instruction detected: " << Err << "\n"; MI->dump(); } @@ -112,8 +128,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { std::string &DisasmLine = DisasmLines.back(); raw_string_ostream DisasmStream(DisasmLine); - AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *TM.getInstrInfo(), - *TM.getRegisterInfo()); + AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), + *TM.getSubtargetImpl()->getInstrInfo(), + 
*TM.getSubtargetImpl()->getRegisterInfo()); InstPrinter.printInst(&TmpInst, DisasmStream, StringRef()); // Disassemble instruction/operands to hex representation. diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h index 58fe34d..00d1f1b 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.h +++ b/lib/Target/R600/AMDGPUMCInstLower.h @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef AMDGPU_MCINSTLOWER_H -#define AMDGPU_MCINSTLOWER_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H +#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H namespace llvm { @@ -45,4 +45,4 @@ public: } // End namespace llvm -#endif //AMDGPU_MCINSTLOWER_H +#endif diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp index 14171f4..0f3f9e2 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.cpp +++ b/lib/Target/R600/AMDGPUMachineFunction.cpp @@ -10,9 +10,11 @@ static const char *const ShaderTypeAttribute = "ShaderType"; void AMDGPUMachineFunction::anchor() {} AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : - MachineFunctionInfo() { - ShaderType = ShaderType::COMPUTE; - LDSSize = 0; + MachineFunctionInfo(), + ShaderType(ShaderType::COMPUTE), + LDSSize(0), + ScratchSize(0), + IsKernel(true) { AttributeSet Set = MF.getFunction()->getAttributes(); Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, ShaderTypeAttribute); diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h index fea0b39..f5e4694 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.h +++ b/lib/Target/R600/AMDGPUMachineFunction.h @@ -10,8 +10,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef AMDGPUMACHINEFUNCTION_H -#define AMDGPUMACHINEFUNCTION_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H +#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H #include "llvm/CodeGen/MachineFunction.h" #include <map> @@ -20,15 +20,26 @@ namespace llvm { class AMDGPUMachineFunction : public MachineFunctionInfo { virtual void anchor(); + unsigned ShaderType; + public: AMDGPUMachineFunction(const MachineFunction &MF); - unsigned ShaderType; /// A map to keep track of local memory objects and their offsets within /// the local memory space. std::map<const GlobalValue *, unsigned> LocalMemoryObjects; /// Number of bytes in the LDS that are being used. 
unsigned LDSSize; + + /// Start of implicit kernel args + unsigned ABIArgOffset; + + unsigned getShaderType() const { + return ShaderType; + } + + unsigned ScratchSize; + bool IsKernel; }; } -#endif // AMDGPUMACHINEFUNCTION_H +#endif diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp index 218750d..b81fef4 100644 --- a/lib/Target/R600/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp @@ -36,11 +36,9 @@ class AMDGPUPromoteAlloca : public FunctionPass, public: AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), LocalMemAvailable(0) { } - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - virtual const char *getPassName() const { - return "AMDGPU Promote Alloca"; - } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "AMDGPU Promote Alloca"; } void visitAlloca(AllocaInst &I); }; @@ -107,14 +105,16 @@ static VectorType *arrayTypeToVecType(const Type *ArrayTy) { ArrayTy->getArrayNumElements()); } -static Value* calculateVectorIndex(Value *Ptr, - std::map<GetElementPtrInst*, Value*> GEPIdx) { +static Value * +calculateVectorIndex(Value *Ptr, + const std::map<GetElementPtrInst *, Value *> &GEPIdx) { if (isa<AllocaInst>(Ptr)) return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); - return GEPIdx[GEP]; + auto I = GEPIdx.find(GEP); + return I == GEPIdx.end() ? nullptr : I->second; } static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { @@ -234,7 +234,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { return true; } -static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { +static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { + bool Success = true; for (User *User : Val->users()) { if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) continue; @@ -242,11 +243,20 @@ static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { WorkList.push_back(User); continue; } + + // FIXME: Correctly handle ptrtoint instructions. + Instruction *UseInst = dyn_cast<Instruction>(User); + if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt) + return false; + if (!User->getType()->isPointerTy()) continue; + WorkList.push_back(User); - collectUsesWithPtrTypes(User, WorkList); + + Success &= collectUsesWithPtrTypes(User, WorkList); } + return Success; } void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { @@ -274,6 +284,13 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { return; } + std::vector<Value*> WorkList; + + if (!collectUsesWithPtrTypes(&I, WorkList)) { + DEBUG(dbgs() << " Do not know how to convert all uses\n"); + return; + } + DEBUG(dbgs() << "Promoting alloca to local memory\n"); LocalMemAvailable -= AllocaSize; @@ -320,10 +337,6 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { I.replaceAllUsesWith(Offset); I.eraseFromParent(); - std::vector<Value*> WorkList; - - collectUsesWithPtrTypes(Offset, WorkList); - for (std::vector<Value*>::iterator i = WorkList.begin(), e = WorkList.end(); i != e; ++i) { Value *V = *i; @@ -331,6 +344,13 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { if (!Call) { Type *EltTy = V->getType()->getPointerElementType(); PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + + // The operand's value should be corrected on its own. 
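// Editor's note (not part of this patch): skipping the addrspacecast here
// leaves the cast's result type untouched; only the pointer values feeding it
// (already gathered in WorkList) get retyped to LOCAL_ADDRESS, which is what
// the comment above refers to.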
+ if (isa<AddrSpaceCastInst>(V)) + continue; + + // FIXME: It doesn't really make sense to try to do this for all + // instructions. V->mutateType(NewTy); continue; } diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index 4731595..f27576a 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUREGISTERINFO_H -#define AMDGPUREGISTERINFO_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H #include "llvm/ADT/BitVector.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -62,4 +62,4 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { } // End namespace llvm -#endif // AMDIDSAREGISTERINFO_H +#endif diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index b83c290..9d09a19 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -13,8 +13,14 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" #include "R600InstrInfo.h" +#include "R600MachineScheduler.h" #include "SIInstrInfo.h" +#include "SIISelLowering.h" +#include "llvm/ADT/SmallString.h" + +#include "llvm/ADT/SmallString.h" using namespace llvm; @@ -25,29 +31,66 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "AMDGPUGenSubtargetInfo.inc" -AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) : - AMDGPUGenSubtargetInfo(TT, GPU, FS), - DevName(GPU), - Is64bit(false), - DumpCode(false), - R600ALUInst(false), - HasVertexCache(false), - TexVTXClauseSize(0), - Gen(AMDGPUSubtarget::R600), - FP64(false), - CaymanISA(false), - EnableIRStructurizer(true), - EnableIfCvt(true), - WavefrontSize(0), - CFALUBug(false), - LocalMemorySize(0), - InstrItins(getInstrItineraryForCPU(GPU)) { - ParseSubtargetFeatures(GPU, FS); +static std::string computeDataLayout(const AMDGPUSubtarget &ST) { + std::string Ret = "e-p:32:32"; + + if (ST.is64bit()) { + // 32-bit private, local, and region pointers. 64-bit global and constant. + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; + } + + Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" + "-v512:512-v1024:1024-v2048:2048-n32:64"; + + return Ret; +} + +AMDGPUSubtarget & +AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) { + // Determine default and user-specified characteristics + // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be + // enabled, but some instructions do not respect them and they run at the + // double precision rate, so don't enable by default. + // + // We want to be able to turn these off, but making this a subtarget feature + // for SI has the unhelpful behavior that it unsets everything else if you + // disable it. + + SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + FullFS += FS; + + ParseSubtargetFeatures(GPU, FullFS); + + // FIXME: I don't think think Evergreen has any useful support for + // denormals, but should be checked. Should we issue a warning somewhere + // if someone tries to enable these? 
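// Editor's note (not part of this patch): subtarget features are applied left
// to right, so because the defaults are prepended a user string can still
// override them (e.g. FS == "-fp64-denormals" gives
// FullFS == "+promote-alloca,+fp64-denormals,-fp64-denormals" and
// FP64Denormals ends up false). For pre-SI generations the flags are simply
// forced off below, regardless of the feature string.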
+ if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + FP32Denormals = false; + FP64Denormals = false; + } + return *this; +} +AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS, + TargetMachine &TM) + : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), + DumpCode(false), R600ALUInst(false), HasVertexCache(false), + TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), + FP64Denormals(false), FP32Denormals(false), CaymanISA(false), + FlatAddressSpace(false), EnableIRStructurizer(true), + EnablePromoteAlloca(false), EnableIfCvt(true), + EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), + DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))), + FrameLowering(TargetFrameLowering::StackGrowsUp, + 64 * 16, // Maximum stack alignment (long16) + 0), + InstrItins(getInstrItineraryForCPU(GPU)) { if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { InstrInfo.reset(new R600InstrInfo(*this)); + TLInfo.reset(new R600TargetLowering(TM)); } else { InstrInfo.reset(new SIInstrInfo(*this)); + TLInfo.reset(new SITargetLowering(TM)); } } diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 0c388b3..f71d80a 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -12,10 +12,15 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUSUBTARGET_H -#define AMDGPUSUBTARGET_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H #include "AMDGPU.h" +#include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "llvm/IR/DataLayout.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -23,14 +28,10 @@ #define GET_SUBTARGETINFO_HEADER #include "AMDGPUGenSubtargetInfo.inc" -#define MAX_CB_SIZE (1 << 16) - namespace llvm { class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { - std::unique_ptr<AMDGPUInstrInfo> InstrInfo; - public: enum Generation { R600 = 0, @@ -50,24 +51,43 @@ private: short TexVTXClauseSize; Generation Gen; bool FP64; + bool FP64Denormals; + bool FP32Denormals; bool CaymanISA; + bool FlatAddressSpace; bool EnableIRStructurizer; + bool EnablePromoteAlloca; bool EnableIfCvt; + bool EnableLoadStoreOpt; unsigned WavefrontSize; bool CFALUBug; int LocalMemorySize; + const DataLayout DL; + AMDGPUFrameLowering FrameLowering; + std::unique_ptr<AMDGPUTargetLowering> TLInfo; + std::unique_ptr<AMDGPUInstrInfo> InstrInfo; InstrItineraryData InstrItins; public: - AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); + AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS, TargetMachine &TM); + AMDGPUSubtarget &initializeSubtargetDependencies(StringRef GPU, StringRef FS); - const AMDGPUInstrInfo *getInstrInfo() const { + const AMDGPUFrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const AMDGPUInstrInfo *getInstrInfo() const override { return InstrInfo.get(); } - - const InstrItineraryData &getInstrItineraryData() const { - return InstrItins; + const AMDGPURegisterInfo *getRegisterInfo() const override { + return &InstrInfo->getRegisterInfo(); + } + AMDGPUTargetLowering *getTargetLowering() const override { + return TLInfo.get(); + } + const DataLayout *getDataLayout() const override { return &DL; } + const InstrItineraryData 
*getInstrItineraryData() const override { + return &InstrItins; } void ParseSubtargetFeatures(StringRef CPU, StringRef FS); @@ -81,7 +101,7 @@ public: } short getTexVTXClauseSize() const { - return TexVTXClauseSize; + return TexVTXClauseSize; } Generation getGeneration() const { @@ -96,6 +116,18 @@ public: return CaymanISA; } + bool hasFP32Denormals() const { + return FP32Denormals; + } + + bool hasFP64Denormals() const { + return FP64Denormals; + } + + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } + bool hasBFE() const { return (getGeneration() >= EVERGREEN); } @@ -112,8 +144,10 @@ public: if (Size == 32) return (getGeneration() >= EVERGREEN); - assert(Size == 64); - return (getGeneration() >= SOUTHERN_ISLANDS); + if (Size == 64) + return (getGeneration() >= SOUTHERN_ISLANDS); + + return false; } bool hasMulU24() const { @@ -125,14 +159,30 @@ public: hasCaymanISA()); } + bool hasFFBL() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasFFBH() const { + return (getGeneration() >= EVERGREEN); + } + bool IsIRStructurizerEnabled() const { return EnableIRStructurizer; } + bool isPromoteAllocaEnabled() const { + return EnablePromoteAlloca; + } + bool isIfCvtEnabled() const { return EnableIfCvt; } + bool loadStoreOptEnabled() const { + return EnableLoadStoreOpt; + } + unsigned getWavefrontSize() const { return WavefrontSize; } @@ -171,4 +221,4 @@ public: } // End namespace llvm -#endif // AMDGPUSUBTARGET_H +#endif diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 8aab944..b2cd988 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -22,6 +22,7 @@ #include "SIInstrInfo.h" #include "llvm/Analysis/Passes.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Verifier.h" @@ -33,7 +34,6 @@ #include "llvm/Transforms/Scalar.h" #include <llvm/CodeGen/Passes.h> - using namespace llvm; extern "C" void LLVMInitializeR600Target() { @@ -49,46 +49,20 @@ static MachineSchedRegistry SchedCustomRegistry("r600", "Run R600's custom scheduler", createR600MachineScheduler); -static std::string computeDataLayout(const AMDGPUSubtarget &ST) { - std::string Ret = "e-p:32:32"; - - if (ST.is64bit()) { - // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; - } - - Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" - "-v512:512-v1024:1024-v2048:2048-n32:64"; - - return Ret; -} - AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - TargetOptions Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OptLevel -) -: - LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), - Subtarget(TT, CPU, FS), - Layout(computeDataLayout(Subtarget)), - FrameLowering(TargetFrameLowering::StackGrowsUp, - 64 * 16 // Maximum stack alignment (long16) - , 0), - IntrinsicInfo(this), - InstrItins(&Subtarget.getInstrItineraryData()) { - // TLInfo uses InstrInfo so it must be initialized after. 
- if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - TLInfo.reset(new R600TargetLowering(*this)); - } else { - TLInfo.reset(new SITargetLowering(*this)); - } + StringRef CPU, StringRef FS, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OptLevel) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), + TLOF(new TargetLoweringObjectFileELF()), + Subtarget(TT, CPU, FS, *this), IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } AMDGPUTargetMachine::~AMDGPUTargetMachine() { + delete TLOF; } namespace { @@ -109,7 +83,8 @@ public: return nullptr; } - virtual void addCodeGenPrepare(); + void addIRPasses() override; + void addCodeGenPrepare() override; bool addPreISel() override; bool addInstSelector() override; bool addPreRegAlloc() override; @@ -135,10 +110,26 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) { PM.add(createAMDGPUTargetTransformInfoPass(this)); } +void AMDGPUPassConfig::addIRPasses() { + // Function calls are not supported, so make sure we inline everything. + addPass(createAMDGPUAlwaysInlinePass()); + addPass(createAlwaysInlinerPass()); + // We need to add the barrier noop pass, otherwise adding the function + // inlining pass will cause all of the PassConfigs passes to be run + // one function at a time, which means if we have a nodule with two + // functions, then we will generate code for the first function + // without ever running any passes on the second. + addPass(createBarrierNoopPass()); + TargetPassConfig::addIRPasses(); +} + void AMDGPUPassConfig::addCodeGenPrepare() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); - addPass(createAMDGPUPromoteAlloca(ST)); - addPass(createSROAPass()); + if (ST.isPromoteAllocaEnabled()) { + addPass(createAMDGPUPromoteAlloca(ST)); + addPass(createSROAPass()); + } + TargetPassConfig::addCodeGenPrepare(); } @@ -159,8 +150,15 @@ AMDGPUPassConfig::addPreISel() { } bool AMDGPUPassConfig::addInstSelector() { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); - addPass(createSILowerI1CopiesPass()); + + if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + addPass(createSILowerI1CopiesPass()); + addPass(createSIFixSGPRCopiesPass(*TM)); + } + return false; } @@ -170,12 +168,18 @@ bool AMDGPUPassConfig::addPreRegAlloc() { if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { addPass(createR600VectorRegMerger(*TM)); } else { - addPass(createSIFixSGPRCopiesPass(*TM)); - // SIFixSGPRCopies can generate a lot of duplicate instructions, - // so we need to run MachineCSE afterwards. - addPass(&MachineCSEID); - initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); - insertPass(&RegisterCoalescerID, &SIFixSGPRLiveRangesID); + if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + // Don't do this with no optimizations since it throws away debug info by + // merging nonadjacent loads. + + // This should be run after scheduling, but before register allocation. It + // also need extra copies to the address operand to be eliminated. 
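// Editor's note (not part of this patch): initializeSILoadStoreOptimizerPass
// registers the pass with the PassRegistry so it can be constructed by ID, and
// insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID) below places it
// immediately after the machine scheduler -- i.e. after scheduling but before
// register allocation, the window the comment above asks for.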
+ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); + } + + addPass(createSIShrinkInstructionsPass()); + addPass(createSIFixSGPRLiveRangesPass()); } return false; } @@ -183,6 +187,7 @@ bool AMDGPUPassConfig::addPreRegAlloc() { bool AMDGPUPassConfig::addPostRegAlloc() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + addPass(createSIShrinkInstructionsPass()); if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { addPass(createSIInsertWaits(*TM)); } diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index 3bb15be..1b3dbce 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPU_TARGET_MACHINE_H -#define AMDGPU_TARGET_MACHINE_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H #include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" @@ -25,47 +25,30 @@ namespace llvm { class AMDGPUTargetMachine : public LLVMTargetMachine { - + TargetLoweringObjectFile *TLOF; AMDGPUSubtarget Subtarget; - const DataLayout Layout; - AMDGPUFrameLowering FrameLowering; AMDGPUIntrinsicInfo IntrinsicInfo; - std::unique_ptr<AMDGPUTargetLowering> TLInfo; - const InstrItineraryData *InstrItins; public: AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS, StringRef CPU, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~AMDGPUTargetMachine(); - const AMDGPUFrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { - return &IntrinsicInfo; - } - const AMDGPUInstrInfo *getInstrInfo() const override { - return getSubtargetImpl()->getInstrInfo(); - } const AMDGPUSubtarget *getSubtargetImpl() const override { return &Subtarget; } - const AMDGPURegisterInfo *getRegisterInfo() const override { - return &getInstrInfo()->getRegisterInfo(); - } - AMDGPUTargetLowering *getTargetLowering() const override { - return TLInfo.get(); - } - const InstrItineraryData *getInstrItineraryData() const override { - return InstrItins; + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { + return &IntrinsicInfo; } - const DataLayout *getDataLayout() const override { return &Layout; } TargetPassConfig *createPassConfig(PassManagerBase &PM) override; /// \brief Register R600 analysis passes with a pass manager. 
void addAnalysisPasses(PassManagerBase &PM) override; + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF; + } }; } // End namespace llvm -#endif // AMDGPU_TARGET_MACHINE_H +#endif diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp index ea78f43..e7bc006 100644 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp @@ -52,7 +52,7 @@ public: AMDGPUTTI(const AMDGPUTargetMachine *TM) : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), - TLI(TM->getTargetLowering()) { + TLI(TM->getSubtargetImpl()->getTargetLowering()) { initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry()); } @@ -74,10 +74,14 @@ public: bool hasBranchDivergence() const override; - void getUnrollingPreferences(Loop *L, + void getUnrollingPreferences(const Function *F, Loop *L, UnrollingPreferences &UP) const override; - /// @} + PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override; + + unsigned getNumberOfRegisters(bool Vector) const override; + unsigned getRegisterBitWidth(bool Vector) const override; + unsigned getMaxInterleaveFactor() const override; }; } // end anonymous namespace @@ -93,16 +97,20 @@ llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) { bool AMDGPUTTI::hasBranchDivergence() const { return true; } -void AMDGPUTTI::getUnrollingPreferences(Loop *L, +void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L, UnrollingPreferences &UP) const { - for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end(); - BI != BE; ++BI) { - BasicBlock *BB = *BI; - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); - I != E; ++I) { - const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I); - if (!GEP) + UP.Threshold = 300; // Twice the default. + UP.Count = UINT_MAX; + UP.Partial = true; + + // TODO: Do we want runtime unrolling? + + for (const BasicBlock *BB : L->getBlocks()) { + for (const Instruction &I : *BB) { + const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); + if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) continue; + const Value *Ptr = GEP->getPointerOperand(); const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr)); if (Alloca) { @@ -116,8 +124,34 @@ void AMDGPUTTI::getUnrollingPreferences(Loop *L, // // Don't use the maximum allowed value here as it will make some // programs way too big. - UP.Threshold = 500; + UP.Threshold = 800; } } } } + +AMDGPUTTI::PopcntSupportKind +AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + return ST->hasBCNT(TyWidth) ? PSK_FastHardware : PSK_Software; +} + +unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const { + if (Vec) + return 0; + + // Number of VGPRs on SI. + if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 256; + + return 4 * 128; // XXX - 4 channels. Should these count as vector instead? +} + +unsigned AMDGPUTTI::getRegisterBitWidth(bool) const { + return 32; +} + +unsigned AMDGPUTTI::getMaxInterleaveFactor() const { + // Semi-arbitrary large amount. 
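// Editor's note (not part of this patch): this is only the upper bound the
// loop vectorizer consults when choosing an interleave count; the per-loop
// cost model still decides the actual count, so a large cap just avoids
// artificially limiting it.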
+ return 64; +} diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp index f3a0391..ee6e8ec 100644 --- a/lib/Target/R600/AMDILCFGStructurizer.cpp +++ b/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -11,6 +11,7 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "R600InstrInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallVector.h" @@ -160,7 +161,7 @@ public: bool prepare(); bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); TRI = &TII->getRegisterInfo(); DEBUG(MF.dump();); OrderedBlks.clear(); @@ -337,7 +338,7 @@ protected: void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&); - /// This is work around solution for findNearestCommonDominator not avaiable + /// This is work around solution for findNearestCommonDominator not available /// to post dom a proper fix should go to Dominators.h. MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2); diff --git a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp new file mode 100644 index 0000000..7ad815d --- /dev/null +++ b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp @@ -0,0 +1,320 @@ +//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +class AMDGPUAsmParser : public MCTargetAsmParser { + MCSubtargetInfo &STI; + MCAsmParser &Parser; + + + /// @name Auto-generated Match Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "AMDGPUGenAsmMatcher.inc" + + /// } + +public: + AMDGPUAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, + const MCInstrInfo &_MII, + const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) override; + bool ParseDirective(AsmToken DirectiveID) override; + OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); + bool 
ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + + bool parseCnt(int64_t &IntVal); + OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); +}; + +class AMDGPUOperand : public MCParsedAsmOperand { + enum KindTy { + Token, + Immediate + } Kind; + +public: + AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} + + struct TokOp { + const char *Data; + unsigned Length; + }; + + struct ImmOp { + int64_t Val; + }; + + union { + TokOp Tok; + ImmOp Imm; + }; + + void addImmOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::CreateImm(getImm())); + } + void addRegOperands(MCInst &Inst, unsigned N) const { + llvm_unreachable("addRegOperands"); + } + StringRef getToken() const { + return StringRef(Tok.Data, Tok.Length); + } + bool isToken() const override { + return Kind == Token; + } + + bool isImm() const override { + return Kind == Immediate; + } + + int64_t getImm() const { + return Imm.Val; + } + + bool isReg() const override { + return false; + } + + unsigned getReg() const override { + return 0; + } + + bool isMem() const override { + return false; + } + + SMLoc getStartLoc() const override { + return SMLoc(); + } + + SMLoc getEndLoc() const override { + return SMLoc(); + } + + void print(raw_ostream &OS) const override { } + + static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val) { + auto Op = llvm::make_unique<AMDGPUOperand>(Immediate); + Op->Imm.Val = Val; + return Op; + } + + static std::unique_ptr<AMDGPUOperand> CreateToken(StringRef Str, SMLoc Loc) { + auto Res = llvm::make_unique<AMDGPUOperand>(Token); + Res->Tok.Data = Str.data(); + Res->Tok.Length = Str.size(); + return Res; + } + + bool isSWaitCnt() const; +}; + +} + +bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + return true; +} + + +bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + MCInst Inst; + + switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { + default: break; + case Match_Success: + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, STI); + return false; + case Match_MissingFeature: + return Error(IDLoc, "instruction use requires an option to be enabled"); + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction mnemonic"); + case Match_InvalidOperand: { + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + } + return Error(IDLoc, "invalid operand for instruction"); + } + } + llvm_unreachable("Implement any new match types added!"); +} + +bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { + return true; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { + + // Try to parse with a custom parser + OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + + // If we successfully parsed the operand or if there as an error parsing, + // we are done. 
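// Editor's note (not part of this patch): MatchOperandParserImpl is generated
// by -gen-asm-matcher and dispatches to the custom operand parsers declared
// for each operand class (such as parseSWaitCntOps below); only if no custom
// parser claims the token do we fall through to the generic integer handling
// that follows.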
+ if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail) + return ResTy; + + switch(getLexer().getKind()) { + case AsmToken::Integer: { + int64_t IntVal; + if (getParser().parseAbsoluteExpression(IntVal)) + return MatchOperand_ParseFail; + Operands.push_back(AMDGPUOperand::CreateImm(IntVal)); + return MatchOperand_Success; + } + default: + return MatchOperand_NoMatch; + } +} + +bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { + // Add the instruction mnemonic + Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc)); + + if (getLexer().is(AsmToken::EndOfStatement)) + return false; + + AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name); + switch (Res) { + case MatchOperand_Success: return false; + case MatchOperand_ParseFail: return Error(NameLoc, + "Failed parsing operand"); + case MatchOperand_NoMatch: return Error(NameLoc, "Not a valid operand"); + } + return true; +} + +//===----------------------------------------------------------------------===// +// s_waitcnt +//===----------------------------------------------------------------------===// + +bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { + StringRef CntName = Parser.getTok().getString(); + int64_t CntVal; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::LParen)) + return true; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return true; + + if (getParser().parseAbsoluteExpression(CntVal)) + return true; + + if (getLexer().isNot(AsmToken::RParen)) + return true; + + Parser.Lex(); + if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) + Parser.Lex(); + + int CntShift; + int CntMask; + + if (CntName == "vmcnt") { + CntMask = 0xf; + CntShift = 0; + } else if (CntName == "expcnt") { + CntMask = 0x7; + CntShift = 4; + } else if (CntName == "lgkmcnt") { + CntMask = 0x7; + CntShift = 8; + } else { + return true; + } + + IntVal &= ~(CntMask << CntShift); + IntVal |= (CntVal << CntShift); + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { + // Disable all counters by default. + // vmcnt [3:0] + // expcnt [6:4] + // lgkmcnt [10:8] + int64_t CntVal = 0x77f; + + switch(getLexer().getKind()) { + default: return MatchOperand_ParseFail; + case AsmToken::Integer: + // The operand can be an integer value. + if (getParser().parseAbsoluteExpression(CntVal)) + return MatchOperand_ParseFail; + break; + + case AsmToken::Identifier: + do { + if (parseCnt(CntVal)) + return MatchOperand_ParseFail; + } while(getLexer().isNot(AsmToken::EndOfStatement)); + break; + } + Operands.push_back(AMDGPUOperand::CreateImm(CntVal)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isSWaitCnt() const { + return isImm(); +} + +/// Force static initialization. 
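// Editor's note (worked example, not part of this patch): parseSWaitCntOps
// above starts from 0x77f -- every counter field at its "ignore" value -- and
// each parseCnt call clears one field, then ORs in the requested count.
// For "s_waitcnt vmcnt(0) lgkmcnt(0)":
//   0x77f & ~(0xf << 0) | (0 << 0)  ->  0x770   (wait for all VM ops)
//   0x770 & ~(0x7 << 8) | (0 << 8)  ->  0x070   (wait for all LGKM ops)
// expcnt keeps 0x7, so export counts are not waited on.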
+extern "C" void LLVMInitializeR600AsmParser() { + RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget); +} + +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION +#include "AMDGPUGenAsmMatcher.inc" + diff --git a/lib/Target/R600/AsmParser/CMakeLists.txt b/lib/Target/R600/AsmParser/CMakeLists.txt new file mode 100644 index 0000000..1b42af7 --- /dev/null +++ b/lib/Target/R600/AsmParser/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMR600AsmParser + AMDGPUAsmParser.cpp + ) diff --git a/lib/Target/R600/AsmParser/LLVMBuild.txt b/lib/Target/R600/AsmParser/LLVMBuild.txt new file mode 100644 index 0000000..940e4ce --- /dev/null +++ b/lib/Target/R600/AsmParser/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/R600/AsmParser/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = R600AsmParser +parent = R600 +required_libraries = MC MCParser R600Desc R600Info Support +add_to_library_groups = R600 diff --git a/lib/Target/R600/AsmParser/Makefile b/lib/Target/R600/AsmParser/Makefile new file mode 100644 index 0000000..e6689b5 --- /dev/null +++ b/lib/Target/R600/AsmParser/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/R600/AsmParser/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600AsmParser + +# Hack: we need to include 'main' R600 target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 4d16082..ed0a216 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -6,13 +6,15 @@ tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) -tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter -mc-emitter) +tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) add_public_tablegen_target(AMDGPUCommonTableGen) add_llvm_target(R600CodeGen AMDILCFGStructurizer.cpp + AMDGPUAlwaysInlinePass.cpp AMDGPUAsmPrinter.cpp AMDGPUFrameLowering.cpp AMDGPUIntrinsicInfo.cpp @@ -44,13 +46,16 @@ add_llvm_target(R600CodeGen SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp + SILoadStoreOptimizer.cpp SILowerControlFlow.cpp SILowerI1Copies.cpp SIMachineFunctionInfo.cpp SIRegisterInfo.cpp + SIShrinkInstructions.cpp SITypeRewriter.cpp ) +add_subdirectory(AsmParser) add_subdirectory(InstPrinter) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td index 2630345..58b5ce2 100644 --- a/lib/Target/R600/CaymanInstructions.td +++ b/lib/Target/R600/CaymanInstructions.td @@ -46,6 +46,8 @@ def SIN_cm : SIN_Common<0x8D>; def COS_cm : COS_Common<0x8E>; } // End isVector = 1 +defm : RsqPat<RECIPSQRT_IEEE_cm, f32>; + def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td index dcb7e98..f24f76b 100644 --- a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -69,6 +69,7 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; +defm : RsqPat<RECIPSQRT_IEEE_eg, f32>; def SIN_eg : SIN_Common<0x8D>; def COS_eg : COS_Common<0x8E>; @@ -256,6 +257,12 @@ def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, let Predicates = [isEGorCayman] in { +// Should be predicated on FeatureFP64 +// def FMA_64 : R600_3OP < +// 0xA, "FMA_64", +// [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] +// >; + // BFE_UINT - bit_extract, an optimization for mask and shift // Src0 = Input // Src1 = Offset @@ -295,7 +302,7 @@ def : Pat<(i32 (sext_inreg i32:$src, i8)), def : Pat<(i32 (sext_inreg i32:$src, i16)), (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; -defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32>; +defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32, R600_Reg64>; def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], @@ -312,6 +319,7 @@ def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; def : ROTRPattern <BIT_ALIGN_INT_eg>; def MULADD_eg : MULADD_Common<0x14>; def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; +def FMA_eg : FMA_Common<0x7>; def ASHR_eg : ASHR_Common<0x15>; def LSHR_eg : LSHR_Common<0x16>; def LSHL_eg : LSHL_Common<0x17>; @@ -328,6 +336,9 @@ defm CUBE_eg : CUBE_Common<0xC0>; def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; +def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", 
ctlz_zero_undef, VecALU>; +def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; + let hasSideEffects = 1 in { def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; } @@ -463,21 +474,47 @@ class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> : let DisableEncoding = "$dst"; } -class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> : +class R600_LDS_1A2D <bits<6> lds_op, dag outs, string name, list<dag> pattern, + string dst =""> : R600_LDS < - lds_op, - (outs), + lds_op, outs, (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel, LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle), - " "#name# "$last $src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", + " "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", pattern> { + + field string BaseOp; + + let LDS_1A1D = 0; let LDS_1A2D = 1; } +class R600_LDS_1A2D_NORET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A2D <lds_op, (outs), name, pattern> { + let BaseOp = name; +} + +class R600_LDS_1A2D_RET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A2D <lds_op, (outs R600_Reg32:$dst), name, pattern> { + + let BaseOp = name; + let usesCustomInserter = 1; + let DisableEncoding = "$dst"; +} + def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >; def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >; +def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >; +def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >; +def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >; +def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >; +def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >; +def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >; +def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >; +def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >; +def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >; def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE", [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] >; @@ -493,6 +530,33 @@ def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))] >; +def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", + [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))] +>; +def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", + [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))] +>; +def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", + [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))] +>; +def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", + [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", + [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))] +>; +def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", + [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", + [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))] +>; +def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", + [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] +>; +def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", + [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))] +>; def 
LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] >; @@ -526,7 +590,7 @@ def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; // SHA-256 Patterns def : SHA256MaPattern <BFI_INT_eg, XOR_INT>; -def : FROUNDPat <CNDGE_eg>; +def : FROUNDPat <CNDGE_eg, CNDGT_eg>; def EG_ExportSwz : ExportSwzInst { let Word1{19-16} = 0; // BURST_COUNT diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 0927040..64fe726 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -9,6 +9,8 @@ //===----------------------------------------------------------------------===// #include "AMDGPUInstPrinter.h" +#include "SIDefines.h" + #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -40,6 +42,81 @@ void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); } +void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " offen"; +} + +void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " idxen"; +} + +void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " addr64"; +} + +void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset:"; + printU16ImmOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << " offset:"; + printU16ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << " offset0:"; + printU8ImmDecOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << " offset1:"; + printU8ImmDecOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " glc"; +} + +void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " slc"; +} + +void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " tfe"; +} + void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { switch (reg) { case AMDGPU::VCC: @@ -54,6 +131,27 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { case AMDGPU::M0: O << "m0"; return; + case AMDGPU::FLAT_SCR: + O << "flat_scratch"; + return; + case AMDGPU::VCC_LO: + O << "vcc_lo"; + return; + case AMDGPU::VCC_HI: + O << "vcc_hi"; + return; + case AMDGPU::EXEC_LO: + O << "exec_lo"; + return; + case AMDGPU::EXEC_HI: + O << "exec_hi"; + return; + case AMDGPU::FLAT_SCR_LO: + O << "flat_scratch_lo"; 
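// Editor's note (not part of this patch): FLAT_SCR_HI is handled by the next
// case; registers without a special assembly name hit the default case and
// are printed by the generic code after this switch.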
+ return; + case AMDGPU::FLAT_SCR_HI: + O << "flat_scratch_hi"; + return; default: break; } @@ -117,19 +215,27 @@ void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) { return; } - if (Imm == FloatToBits(1.0f) || - Imm == FloatToBits(-1.0f) || - Imm == FloatToBits(0.5f) || - Imm == FloatToBits(-0.5f) || - Imm == FloatToBits(2.0f) || - Imm == FloatToBits(-2.0f) || - Imm == FloatToBits(4.0f) || - Imm == FloatToBits(-4.0f)) { - O << BitsToFloat(Imm); - return; + if (Imm == FloatToBits(0.0f)) + O << "0.0"; + else if (Imm == FloatToBits(1.0f)) + O << "1.0"; + else if (Imm == FloatToBits(-1.0f)) + O << "-1.0"; + else if (Imm == FloatToBits(0.5f)) + O << "0.5"; + else if (Imm == FloatToBits(-0.5f)) + O << "-0.5"; + else if (Imm == FloatToBits(2.0f)) + O << "2.0"; + else if (Imm == FloatToBits(-2.0f)) + O << "-2.0"; + else if (Imm == FloatToBits(4.0f)) + O << "4.0"; + else if (Imm == FloatToBits(-4.0f)) + O << "-4.0"; + else { + O << formatHex(static_cast<uint64_t>(Imm)); } - - O << formatHex(static_cast<uint64_t>(Imm)); } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, @@ -149,25 +255,30 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } else if (Op.isImm()) { printImmediate(Op.getImm(), O); } else if (Op.isFPImm()) { - O << Op.getFPImm(); + + // We special case 0.0 because otherwise it will be printed as an integer. + if (Op.getFPImm() == 0.0) + O << "0.0"; + else + printImmediate(FloatToBits(Op.getFPImm()), O); } else if (Op.isExpr()) { const MCExpr *Exp = Op.getExpr(); Exp->print(O); } else { - assert(!"unknown operand type in printOperand"); + llvm_unreachable("unknown operand type in printOperand"); } } void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned InputModifiers = MI->getOperand(OpNo).getImm(); - if (InputModifiers & 0x1) - O << "-"; - if (InputModifiers & 0x2) - O << "|"; + if (InputModifiers & SISrcMods::NEG) + O << '-'; + if (InputModifiers & SISrcMods::ABS) + O << '|'; printOperand(MI, OpNo + 1, O); - if (InputModifiers & 0x2) - O << "|"; + if (InputModifiers & SISrcMods::ABS) + O << '|'; } void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, @@ -181,7 +292,7 @@ void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, } else if (Imm == 0) { O << "P10"; } else { - assert(!"Invalid interpolation parameter slot"); + llvm_unreachable("Invalid interpolation parameter slot"); } } @@ -214,6 +325,23 @@ void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, printIfSet(MI, OpNo, O, "_SAT"); } +void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " clamp"; +} + +void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int Imm = MI->getOperand(OpNo).getImm(); + if (Imm == SIOutMods::MUL2) + O << " mul:2"; + else if (Imm == SIOutMods::MUL4) + O << " mul:4"; + else if (Imm == SIOutMods::DIV2) + O << " div:2"; +} + void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O) { int32_t Imm = MI->getOperand(OpNo).getImm(); @@ -281,7 +409,7 @@ void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, sel -= 512; int cb = sel >> 12; sel &= 4095; - O << cb << "[" << sel << "]"; + O << cb << '[' << sel << ']'; } else if (sel >= 448) { sel -= 448; O << sel; @@ -290,7 +418,7 @@ void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, } if (sel >= 0) - O << "." << chans[chan]; + O << '.' 
<< chans[chan]; } void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, @@ -323,25 +451,25 @@ void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, unsigned Sel = MI->getOperand(OpNo).getImm(); switch (Sel) { case 0: - O << "X"; + O << 'X'; break; case 1: - O << "Y"; + O << 'Y'; break; case 2: - O << "Z"; + O << 'Z'; break; case 3: - O << "W"; + O << 'W'; break; case 4: - O << "0"; + O << '0'; break; case 5: - O << "1"; + O << '1'; break; case 7: - O << "_"; + O << '_'; break; default: break; @@ -353,10 +481,10 @@ void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, unsigned CT = MI->getOperand(OpNo).getImm(); switch (CT) { case 0: - O << "U"; + O << 'U'; break; case 1: - O << "N"; + O << 'N'; break; default: break; @@ -368,10 +496,10 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, int KCacheMode = MI->getOperand(OpNo).getImm(); if (KCacheMode > 0) { int KCacheBank = MI->getOperand(OpNo - 2).getImm(); - O << "CB" << KCacheBank <<":"; + O << "CB" << KCacheBank << ':'; int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); - int LineSize = (KCacheMode == 1)?16:32; - O << KCacheAddr * 16 << "-" << KCacheAddr * 16 + LineSize; + int LineSize = (KCacheMode == 1) ? 16 : 32; + O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize; } } @@ -415,12 +543,26 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, unsigned Vmcnt = SImm16 & 0xF; unsigned Expcnt = (SImm16 >> 4) & 0xF; unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; - if (Vmcnt != 0xF) - O << "vmcnt(" << Vmcnt << ") "; - if (Expcnt != 0x7) - O << "expcnt(" << Expcnt << ") "; - if (Lgkmcnt != 0x7) - O << "lgkmcnt(" << Lgkmcnt << ")"; + + bool NeedSpace = false; + + if (Vmcnt != 0xF) { + O << "vmcnt(" << Vmcnt << ')'; + NeedSpace = true; + } + + if (Expcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "expcnt(" << Expcnt << ')'; + NeedSpace = true; + } + + if (Lgkmcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "lgkmcnt(" << Lgkmcnt << ')'; + } } #include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 6ca7170..4c06ac0 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -10,8 +10,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef AMDGPUINSTPRINTER_H -#define AMDGPUINSTPRINTER_H +#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H +#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" @@ -34,7 +34,19 @@ public: private: void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream 
&O); + void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); void printImmediate(uint32_t Imm, raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -45,6 +57,8 @@ private: StringRef Asm, StringRef Default = ""); static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -65,4 +79,4 @@ private: } // End namespace llvm -#endif // AMDGPUINSTRPRINTER_H +#endif diff --git a/lib/Target/R600/LLVMBuild.txt b/lib/Target/R600/LLVMBuild.txt index 408ed75..f3f254f 100644 --- a/lib/Target/R600/LLVMBuild.txt +++ b/lib/Target/R600/LLVMBuild.txt @@ -16,17 +16,18 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = InstPrinter MCTargetDesc TargetInfo +subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo [component_0] type = TargetGroup name = R600 parent = Target +has_asmparser = 1 has_asmprinter = 1 [component_1] type = Library name = R600CodeGen parent = R600 -required_libraries = Analysis AsmPrinter CodeGen Core MC R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target TransformUtils +required_libraries = Analysis AsmPrinter CodeGen Core IPO MC R600AsmParser R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target TransformUtils add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp index 489cec7..5fb311b 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -9,9 +9,11 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/TargetRegistry.h" @@ -43,7 +45,7 @@ public: AMDGPUAsmBackend(const Target &T) : MCAsmBackend() {} - unsigned getNumFixupKinds() const override { return 0; }; + unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, @@ -55,9 +57,9 @@ public: assert(!"Not implemented"); } bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override { - return true; - } + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; }; } //End anonymous namespace @@ -73,9 +75,50 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, 
uint64_t Value, bool IsPCRel) const { - uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); - assert(Fixup.getKind() == FK_PCRel_4); - *Dst = (Value - 4) / 4; + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("Unknown fixup kind"); + case AMDGPU::fixup_si_sopp_br: { + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); + *Dst = (Value - 4) / 4; + break; + } + + case AMDGPU::fixup_si_rodata: { + uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); + *Dst = Value; + break; + } + + case AMDGPU::fixup_si_end_of_text: { + uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); + // The value points to the last instruction in the text section, so we + // need to add 4 bytes to get to the start of the constants. + *Dst = Value + 4; + break; + } + } +} + +const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( + MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { + // name offset bits flags + { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_si_rodata", 0, 32, 0 }, + { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + return Infos[Kind - FirstTargetFixupKind]; +} + +bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + for (unsigned i = 0; i < Count; ++i) + OW->Write8(0); + + return true; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 53b0e85..5fb94d5 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -10,6 +10,7 @@ #include "AMDGPUMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" using namespace llvm; @@ -21,7 +22,7 @@ public: protected: unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override { - llvm_unreachable("Not implemented"); + return Fixup.getKind(); } }; diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h new file mode 100644 index 0000000..01021d6 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h @@ -0,0 +1,34 @@ +//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace AMDGPU { +enum Fixups { + /// 16-bit PC relative fixup for SOPP branch instructions. 
+ fixup_si_sopp_br = FirstTargetFixupKind, + + /// fixup for global addresses with constant initializers + fixup_si_rodata, + + /// fixup for offset from instruction to end of text section + fixup_si_end_of_text, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 78bbe0a..3c2b889 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -11,21 +11,14 @@ #include "AMDGPUMCAsmInfo.h" using namespace llvm; -AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() { +AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfoELF() { HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// - HasSubsectionsViaSymbols = true; - HasMachoZeroFillDirective = false; - HasMachoTBSSDirective = false; - HasStaticCtorDtorReferenceInStaticMode = false; - LinkerRequiresNonEmptyDwarfLines = true; MaxInstLength = 16; SeparatorString = "\n"; CommentString = ";"; - LabelSuffix = ":"; InlineAsmStart = ";#ASMSTART"; InlineAsmEnd = ";#ASMEND"; - AssemblerDialect = 0; //===--- Data Emission Directives -------------------------------------===// ZeroDirective = ".zero"; @@ -35,28 +28,15 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() { Data16bitsDirective = ".short\t"; Data32bitsDirective = ".long\t"; Data64bitsDirective = ".quad\t"; - GPRel32Directive = nullptr; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; - //===--- Alignment Information ----------------------------------------===// - AlignmentIsInBytes = true; - TextAlignFillValue = 0; - //===--- Global Variable Emission Directives --------------------------===// - GlobalDirective = ".global"; - HasSetDirective = false; HasAggressiveSymbolFolding = true; COMMDirectiveAlignmentIsInBytes = false; HasDotTypeDotSizeDirective = false; HasNoDeadStrip = true; WeakRefDirective = ".weakref\t"; //===--- Dwarf Emission Directives -----------------------------------===// - HasLEB128 = true; SupportsDebugInformation = true; } - -const MCSection* -AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const { - return nullptr; -} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h index 59aebec..8f75c76 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -1,4 +1,4 @@ -//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface ----------===// +//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,18 +11,22 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUMCASMINFO_H -#define AMDGPUMCASMINFO_H +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" namespace llvm { class StringRef; -class AMDGPUMCAsmInfo : public MCAsmInfo { +// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, +// you will need to make sure your new class sets PrivateGlobalPrefix to +// a prefix that won't appear in a function name. 
The default value +// for PrivateGlobalPrefix is 'L', so it will consider any function starting +// with 'L' as a local symbol. +class AMDGPUMCAsmInfo : public MCAsmInfoELF { public: explicit AMDGPUMCAsmInfo(StringRef &TT); - const MCSection* getNonexecutableStackSection(MCContext &CTX) const override; }; } // namespace llvm -#endif // AMDGPUMCASMINFO_H +#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h index 6a5cd67..c957427 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUCODEEMITTER_H -#define AMDGPUCODEEMITTER_H +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H #include "llvm/MC/MCCodeEmitter.h" #include "llvm/Support/raw_ostream.h" @@ -37,8 +37,14 @@ public: const MCSubtargetInfo &STI) const { return 0; } + + virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } }; } // End namespace llvm -#endif // AMDGPUCODEEMITTER_H +#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 38a2956..8731055 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -84,12 +84,9 @@ static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, static MCStreamer *createMCStreamer(const Target &T, StringRef TT, MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &_OS, - MCCodeEmitter *_Emitter, - const MCSubtargetInfo &STI, - bool RelaxAll, - bool NoExecStack) { - return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false); + raw_ostream &_OS, MCCodeEmitter *_Emitter, + const MCSubtargetInfo &STI, bool RelaxAll) { + return createELFStreamer(Ctx, MAB, _OS, _Emitter, false); } extern "C" void LLVMInitializeR600TargetMC() { diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h index f6b3376..c019766 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// // -#ifndef AMDGPUMCTARGETDESC_H -#define AMDGPUMCTARGETDESC_H +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H #include "llvm/ADT/StringRef.h" @@ -55,4 +55,4 @@ MCObjectWriter *createAMDGPUELFObjectWriter(raw_ostream &OS); #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" -#endif // AMDGPUMCTARGETDESC_H +#endif diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index ee02111..999fd0d 100644 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -13,8 +13,11 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPU.h" +#include "SIDefines.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixup.h" @@ -39,6 +42,7 @@ 
class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; + MCContext &Ctx; /// \brief Can this operand also contain immediate values? bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; @@ -49,7 +53,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, MCContext &ctx) - : MCII(mcii), MRI(mri) { } + : MCII(mcii), MRI(mri), Ctx(ctx) { } ~SIMCCodeEmitter() { } @@ -62,6 +66,12 @@ public: uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; + + /// \brief Use a fixup to encode the simm16 field for SOPP branch + /// instructions. + unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; }; } // End anonymous namespace @@ -75,12 +85,13 @@ MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const { - unsigned RegClass = Desc.OpInfo[OpNo].RegClass; return (AMDGPU::SSrc_32RegClassID == RegClass) || (AMDGPU::SSrc_64RegClassID == RegClass) || (AMDGPU::VSrc_32RegClassID == RegClass) || - (AMDGPU::VSrc_64RegClassID == RegClass); + (AMDGPU::VSrc_64RegClassID == RegClass) || + (AMDGPU::VCSrc_32RegClassID == RegClass) || + (AMDGPU::VCSrc_64RegClassID == RegClass); } uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const { @@ -90,6 +101,8 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const { Imm.I = MO.getImm(); else if (MO.isFPImm()) Imm.F = MO.getFPImm(); + else if (MO.isExpr()) + return 255; else return ~0; @@ -157,8 +170,13 @@ void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, IntFloatUnion Imm; if (Op.isImm()) Imm.I = Op.getImm(); - else + else if (Op.isFPImm()) Imm.F = Op.getFPImm(); + else { + assert(Op.isExpr()); + // This will be replaced with a fixup value. + Imm.I = 0; + } for (unsigned j = 0; j < 4; j++) { OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff)); @@ -169,6 +187,21 @@ void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, } } +unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + + if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; + Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); + return 0; + } + + return getMachineOpValue(MI, MO, Fixups, STI); +} + uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, @@ -177,10 +210,19 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, return MRI.getEncodingValue(MO.getReg()); if (MO.isExpr()) { - const MCExpr *Expr = MO.getExpr(); - MCFixupKind Kind = MCFixupKind(FK_PCRel_4); - Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - return 0; + const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr()); + MCFixupKind Kind; + const MCSymbol *Sym = + Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + + if (&Expr->getSymbol() == Sym) { + // Add the offset to the beginning of the constant values. 
+ Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; + } else { + // This is used for constant data stored in .rodata. + Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; + } + Fixups.push_back(MCFixup::Create(4, Expr, Kind, MI.getLoc())); } // Figure out the operand number, needed for isSrcOperand check diff --git a/lib/Target/R600/Makefile b/lib/Target/R600/Makefile index 1b3ebbe..64a7c8c 100644 --- a/lib/Target/R600/Makefile +++ b/lib/Target/R600/Makefile @@ -16,8 +16,8 @@ BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \ AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \ AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \ AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \ - AMDGPUGenAsmWriter.inc + AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc -DIRS = InstPrinter TargetInfo MCTargetDesc +DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp index 92bf0df..f07be00 100644 --- a/lib/Target/R600/R600ClauseMergePass.cpp +++ b/lib/Target/R600/R600ClauseMergePass.cpp @@ -18,6 +18,7 @@ #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -167,7 +168,7 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, } bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { MachineBasicBlock &MBB = *BB; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index d98a6db..edaf278 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -336,7 +336,7 @@ private: getHWInstrDesc(IsTex?CF_TC:CF_VC)) .addImm(0) // ADDR .addImm(AluInstCount - 1); // COUNT - return ClauseFile(MIb, ClauseContent); + return ClauseFile(MIb, std::move(ClauseContent)); } void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const { @@ -426,7 +426,7 @@ private: } assert(ClauseContent.size() < 128 && "ALU clause is too big"); ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); - return ClauseFile(ClauseHead, ClauseContent); + return ClauseFile(ClauseHead, std::move(ClauseContent)); } void @@ -459,11 +459,9 @@ private: void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); } - void CounterPropagateAddr(std::set<MachineInstr *> MIs, unsigned Addr) - const { - for (std::set<MachineInstr *>::iterator It = MIs.begin(), E = MIs.end(); - It != E; ++It) { - MachineInstr *MI = *It; + void CounterPropagateAddr(const std::set<MachineInstr *> &MIs, + unsigned Addr) const { + for (MachineInstr *MI : MIs) { CounterPropagateAddr(MI, Addr); } } @@ -477,18 +475,19 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); - TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo()); + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = static_cast<const R600RegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); 
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - CFStack CFStack(ST, MFI->ShaderType); + CFStack CFStack(ST, MFI->getShaderType()); for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; ++MB) { MachineBasicBlock &MBB = *MB; unsigned CfCount = 0; std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; std::vector<MachineInstr * > IfThenElseStack; - if (MFI->ShaderType == 1) { + if (MFI->getShaderType() == ShaderType::VERTEX) { BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), getHWInstrDesc(CF_CALL_FS)); CfCount++; @@ -542,7 +541,7 @@ public: std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, std::set<MachineInstr *>()); Pair.second.insert(MIb); - LoopStack.push_back(Pair); + LoopStack.push_back(std::move(Pair)); MI->eraseFromParent(); CfCount++; break; @@ -550,7 +549,7 @@ public: case AMDGPU::ENDLOOP: { CFStack.popLoop(); std::pair<unsigned, std::set<MachineInstr *> > Pair = - LoopStack.back(); + std::move(LoopStack.back()); LoopStack.pop_back(); CounterPropagateAddr(Pair.second, CfCount); BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index f2f28fe..51d87ed 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef R600DEFINES_H_ -#define R600DEFINES_H_ +#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H +#define LLVM_LIB_TARGET_R600_R600DEFINES_H #include "llvm/MC/MCRegisterInfo.h" @@ -168,4 +168,4 @@ namespace OpName { #define R_0288E8_SQ_LDS_ALLOC 0x0288E8 -#endif // R600DEFINES_H_ +#endif diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index 38afebe..fdc2030 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -19,6 +19,7 @@ #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -297,7 +298,7 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index 732b06d..211d392 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -19,6 +19,7 @@ #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -65,7 +66,7 @@ void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, } bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); const R600RegisterInfo &TRI = TII->getRegisterInfo(); diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 7f3560a..a214e53 100644 
--- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -19,6 +19,7 @@ #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -82,6 +83,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Expand); setOperationAction(ISD::SELECT, MVT::f32, Expand); @@ -189,7 +192,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock::iterator I = *MI; const R600InstrInfo *TII = - static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo()); + static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo()); switch (MI->getOpcode()) { default: @@ -199,7 +202,10 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); assert(DstIdx != -1); MachineInstrBuilder NewMI; - if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) + // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add + // LDS_1A2D support and remove this special case. + if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || + MI->getOpcode() == AMDGPU::LDS_CMPST_RET) return BB; NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), @@ -642,8 +648,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const MachineSDNode *interp; if (ijb < 0) { const MachineFunction &MF = DAG.getMachineFunction(); - const R600InstrInfo *TII = - static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo()); + const R600InstrInfo *TII = static_cast<const R600InstrInfo *>( + MF.getSubtarget().getInstrInfo()); interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32)); return DAG.getTargetExtractSubreg( @@ -803,6 +809,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_local_size_z: return LowerImplicitParameter(DAG, VT, DL, 8); + case Intrinsic::AMDGPU_read_workdim: + return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4); + case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T1_X, VT); @@ -839,8 +848,20 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, default: AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); return; - case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + case ISD::FP_TO_UINT: + if (N->getValueType(0) == MVT::i1) { + Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + return; + } + // Fall-through. Since we don't care about out of bounds values + // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint + // considers some extra cases which are not necessary here. 
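// Standalone illustration, not part of the patch, of the comment above: for
// inputs that are also in range for the signed type (here, below 2^63), a
// float-to-signed conversion reinterpreted as unsigned matches a direct
// float-to-unsigned conversion, so the unsigned case can reuse the signed
// expansion; genuinely out-of-range inputs are the "don't care" case.
#include <cassert>
#include <cstdint>

int main() {
  float In = 12345.0f; // in range for both int64_t and uint64_t
  uint64_t ViaSigned = static_cast<uint64_t>(static_cast<int64_t>(In));
  uint64_t Direct = static_cast<uint64_t>(In);
  assert(ViaSigned == Direct);
  return 0;
}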
+ case ISD::FP_TO_SINT: { + SDValue Result; + if (expandFP_TO_SINT(N, Result, DAG)) + Results.push_back(Result); return; + } case ISD::UDIV: { SDValue Op = SDValue(N, 0); SDLoc DL(Op); @@ -886,74 +907,7 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::UDIVREM: { SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - - SDValue one = DAG.getConstant(1, HalfVT); - SDValue zero = DAG.getConstant(0, HalfVT); - - //HiLo split - SDValue LHS = N->getOperand(0); - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); - SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); - - SDValue RHS = N->getOperand(1); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); - SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); - - // Get Speculative values - SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); - SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - - SDValue REM_Hi = zero; - SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - - SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); - SDValue DIV_Lo = zero; - - const unsigned halfBitWidth = HalfVT.getSizeInBits(); - - for (unsigned i = 0; i < halfBitWidth; ++i) { - SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); - // Get Value of high bit - SDValue HBit; - if (halfBitWidth == 32 && Subtarget->hasBFE()) { - HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); - } else { - HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); - HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); - } - - SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, - DAG.getConstant(halfBitWidth - 1, HalfVT)); - REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); - REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); - - REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); - REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); - - - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - - SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); - SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); - - DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); - - // Update REM - - SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); - - REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); - REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); - REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); - } - - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); - Results.push_back(DIV); - Results.push_back(REM); + LowerUDIVREM64(Op, DAG, Results); break; } } @@ -1415,8 +1369,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( - getTargetMachine().getFrameLowering()); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( + getTargetMachine().getSubtargetImpl()->getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1512,10 +1466,23 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG 
&DAG) const return DAG.getMergeValues(Ops, DL); } + // Lower loads constant address space global variable loads + if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + isa<GlobalVariable>( + GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) { + + SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL, + getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); + Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, + DAG.getConstant(2, MVT::i32)); + return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), + LoadNode->getChain(), Ptr, + DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2)); + } if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { SDValue MergedValues[2] = { - SplitVectorLoad(Op, DAG), + ScalarizeVectorLoad(Op, DAG), Chain }; return DAG.getMergeValues(MergedValues, DL); @@ -1585,6 +1552,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const LoadNode->getPointerInfo(), MemVT, LoadNode->isVolatile(), LoadNode->isNonTemporal(), + LoadNode->isInvariant(), LoadNode->getAlignment()); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount); SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount); @@ -1599,8 +1567,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( - getTargetMachine().getFrameLowering()); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( + getTargetMachine().getSubtargetImpl()->getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1663,10 +1631,10 @@ SDValue R600TargetLowering::LowerFormalArguments( SDLoc DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); MachineFunction &MF = DAG.getMachineFunction(); - unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType; + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); SmallVector<ISD::InputArg, 8> LocalIns; @@ -1676,10 +1644,15 @@ SDValue R600TargetLowering::LowerFormalArguments( for (unsigned i = 0, e = Ins.size(); i < e; ++i) { CCValAssign &VA = ArgLocs[i]; - EVT VT = Ins[i].VT; - EVT MemVT = LocalIns[i].VT; + const ISD::InputArg &In = Ins[i]; + EVT VT = In.VT; + EVT MemVT = VA.getLocVT(); + if (!VT.isVector() && MemVT.isVector()) { + // Get load source type if scalarized. + MemVT = MemVT.getVectorElementType(); + } - if (ShaderType != ShaderType::COMPUTE) { + if (MFI->getShaderType() != ShaderType::COMPUTE) { unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); InVals.push_back(Register); @@ -1687,7 +1660,7 @@ SDValue R600TargetLowering::LowerFormalArguments( } PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_BUFFER_0); + AMDGPUAS::CONSTANT_BUFFER_0); // i64 isn't a legal type, so the register type used ends up as i32, which // isn't expected here. 
It attempts to create this sextload, but it ends up @@ -1696,18 +1669,33 @@ SDValue R600TargetLowering::LowerFormalArguments( // The first 36 bytes of the input buffer contains information about // thread group and global sizes. + ISD::LoadExtType Ext = ISD::NON_EXTLOAD; + if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { + // FIXME: This should really check the extload type, but the handling of + // extload vector parameters seems to be broken. + + // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + Ext = ISD::SEXTLOAD; + } + + // Compute the offset from the value. + // XXX - I think PartOffset should give you this, but it seems to give the + // size of the register which isn't useful. + + unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset(); + unsigned PartOffset = VA.getLocMemOffset(); + unsigned Offset = 36 + VA.getLocMemOffset(); - // FIXME: This should really check the extload type, but the handling of - // extload vecto parameters seems to be broken. - //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; - ISD::LoadExtType Ext = ISD::SEXTLOAD; - SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain, - DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32), - MachinePointerInfo(UndefValue::get(PtrTy)), - MemVT, false, false, 4); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); + SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, + DAG.getConstant(Offset, MVT::i32), + DAG.getUNDEF(MVT::i32), + PtrInfo, + MemVT, false, true, true, 4); // 4 is the preferred alignment for the CONSTANT memory space. InVals.push_back(Arg); + MFI->ABIArgOffset = Offset + MemVT.getStoreSize(); } return Chain; } @@ -2053,7 +2041,7 @@ static bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo()); + static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); if (!Src.isMachineOpcode()) return false; switch (Src.getMachineOpcode()) { @@ -2178,7 +2166,7 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo()); + static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); if (!Node->isMachineOpcode()) return Node; unsigned Opcode = Node->getMachineOpcode(); diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index d22c8c9..10ebc10 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef R600ISELLOWERING_H -#define R600ISELLOWERING_H +#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H +#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H #include "AMDGPUISelLowering.h" @@ -74,4 +74,4 @@ private: } // End namespace llvm; -#endif // R600ISELLOWERING_H +#endif diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td index 9428bab..0ffd485 100644 --- a/lib/Target/R600/R600InstrFormats.td +++ b/lib/Target/R600/R600InstrFormats.td @@ -38,6 +38,9 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern, let Pattern = pattern; let Itinerary = itin; + // No AsmMatcher support. 
+ let isCodeGenOnly = 1; + let TSFlags{4} = Trig; let TSFlags{5} = Op3; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 3972e2f..653fd0d 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -92,10 +92,6 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, return true; } -unsigned R600InstrInfo::getIEQOpcode() const { - return AMDGPU::SETE_INT; -} - bool R600InstrInfo::isMov(unsigned Opcode) const { @@ -209,8 +205,10 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { } bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { - const R600MachineFunctionInfo *MFI = MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>(); - return MFI->ShaderType != ShaderType::COMPUTE && usesVertexCache(MI->getOpcode()); + const MachineFunction *MF = MI->getParent()->getParent(); + const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + return MFI->getShaderType() != ShaderType::COMPUTE && + usesVertexCache(MI->getOpcode()); } bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { @@ -218,9 +216,11 @@ bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { } bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { - const R600MachineFunctionInfo *MFI = MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>(); - return (MFI->ShaderType == ShaderType::COMPUTE && usesVertexCache(MI->getOpcode())) || - usesTextureCache(MI->getOpcode()); + const MachineFunction *MF = MI->getParent()->getParent(); + const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + return (MFI->getShaderType() == ShaderType::COMPUTE && + usesVertexCache(MI->getOpcode())) || + usesTextureCache(MI->getOpcode()); } bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { @@ -319,7 +319,7 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const { Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel)); continue; } - + } return Result; } @@ -571,7 +571,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, if (!isLastAluTrans) return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS); - TransOps = IGSrcs.back(); + TransOps = std::move(IGSrcs.back()); IGSrcs.pop_back(); ValidSwizzle.pop_back(); @@ -654,10 +654,10 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) return fitsConstReadLimitations(Consts); } -DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM, - const ScheduleDAG *DAG) const { - const InstrItineraryData *II = TM->getInstrItineraryData(); - return TM->getSubtarget<AMDGPUSubtarget>().createDFAPacketizer(II); +DFAPacketizer * +R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const { + const InstrItineraryData *II = STI.getInstrItineraryData(); + return static_cast<const AMDGPUSubtarget &>(STI).createDFAPacketizer(II); } static bool @@ -1082,9 +1082,8 @@ bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const { - const AMDGPUFrameLowering *TFL = - static_cast<const AMDGPUFrameLowering*>( - MF.getTarget().getFrameLowering()); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( + MF.getSubtarget().getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); int End = getIndirectIndexEnd(MF); diff --git a/lib/Target/R600/R600InstrInfo.h 
b/lib/Target/R600/R600InstrInfo.h index 45a57d3..d3dc0e5 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef R600INSTRUCTIONINFO_H_ -#define R600INSTRUCTIONINFO_H_ +#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H +#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H #include "AMDGPUInstrInfo.h" #include "R600Defines.h" @@ -152,11 +152,10 @@ namespace llvm { /// instruction slots within an instruction group. bool isVector(const MachineInstr &MI) const; - unsigned getIEQOpcode() const override; bool isMov(unsigned Opcode) const override; - DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM, - const ScheduleDAG *DAG) const override; + DFAPacketizer * + CreateTargetScheduleState(const TargetSubtargetInfo &) const override; bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; @@ -207,7 +206,7 @@ namespace llvm { int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const override { return 1;} - virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; /// \brief Reserve the registers that may be accesed using indirect addressing. void reserveIndirectRegisters(BitVector &Reserved, @@ -299,4 +298,4 @@ int getLDSNoRetOp(uint16_t Opcode); } // End llvm namespace -#endif // R600INSTRINFO_H_ +#endif diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 73fa345..b6c00f8 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -216,7 +216,7 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern, def TEX_SHADOW : PatLeaf< (imm), [{uint32_t TType = (uint32_t)N->getZExtValue(); - return (TType >= 6 && TType <= 8) || (TType >= 11 && TType <= 13); + return (TType >= 6 && TType <= 8) || TType == 13; }] >; @@ -475,13 +475,13 @@ class ExportBufWord1 { multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), 0, 61, 0, 7, 7, 7, cf_inst, 0) >; def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), 0, 61, 7, 0, 7, 7, cf_inst, 0) >; @@ -513,17 +513,17 @@ multiclass SteamOutputExportPattern<Instruction ExportInst, // Stream1 def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf1inst, 0)>; // Stream2 def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf2inst, 0)>; // Stream3 def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf3inst, 0)>; } @@ -674,8 +674,9 @@ def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; // Non-IEEE MUL: 0 * anything = 0 def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; def MUL_IEEE 
: R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; -def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>; -def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>; +// TODO: Do these actually match the regular fmin/fmax behavior? +def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; +def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>; // For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, // so some of the instruction names don't match the asm string. @@ -915,6 +916,11 @@ class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] >; +class FMA_Common <bits<5> inst> : R600_3OP < + inst, "FMA", + [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU +>; + class CNDE_Common <bits<5> inst> : R600_3OP < inst, "CNDE", [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))] @@ -1068,7 +1074,7 @@ class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP < } class RECIP_IEEE_Common <bits<11> inst> : R600_1OP < - inst, "RECIP_IEEE", [(set f32:$dst, (fdiv FP_ONE, f32:$src0))] + inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))] > { let Itinerary = TransALU; } @@ -1114,6 +1120,7 @@ def FNEG_R600 : FNEG<R600_Reg32>; // Helper patterns for complex intrinsics //===----------------------------------------------------------------------===// +// FIXME: Should be predicated on unsafe fp math. multiclass DIV_Common <InstR600 recip_ieee> { def : Pat< (int_AMDGPU_div f32:$src0, f32:$src1), @@ -1124,6 +1131,8 @@ def : Pat< (fdiv f32:$src0, f32:$src1), (MUL_IEEE $src0, (recip_ieee $src1)) >; + +def : RcpPat<recip_ieee, f32>; } class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> @@ -1133,9 +1142,12 @@ class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ie >; // FROUND pattern -class FROUNDPat<Instruction CNDGE> : Pat < +class FROUNDPat<Instruction CNDGE, Instruction CNDGT> : Pat < (AMDGPUround f32:$x), - (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)) + (CNDGE $x, + (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)), + (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)) + ) >; @@ -1180,7 +1192,9 @@ let Predicates = [isR600] in { def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; - def : FROUNDPat <CNDGE_r600>; + defm : RsqPat<RECIPSQRT_IEEE_r600, f32>; + + def : FROUNDPat <CNDGE_r600, CNDGT_r600>; def R600_ExportSwz : ExportSwzInst { let Word1{20-17} = 0; // BURST_COUNT @@ -1482,6 +1496,7 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; + let isCodeGenOnly = 1; } multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> { diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h index b0ae22e..263561e 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ b/lib/Target/R600/R600MachineFunctionInfo.h @@ -10,8 +10,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef R600MACHINEFUNCTIONINFO_H -#define R600MACHINEFUNCTIONINFO_H +#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H #include "AMDGPUMachineFunction.h" #include "llvm/ADT/BitVector.h" @@ -31,4 +31,4 @@ public: } // End llvm namespace -#endif 
//R600MACHINEFUNCTIONINFO_H +#endif diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index 7ea654c..d782713 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -14,7 +14,6 @@ #include "R600MachineScheduler.h" #include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/PassManager.h" @@ -76,21 +75,25 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { float ALUFetchRationEstimate = (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / (FetchInstCount + Available[IDFetch].size()); - unsigned NeededWF = 62.5f / ALUFetchRationEstimate; - DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); - // We assume the local GPR requirements to be "dominated" by the requirement - // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and - // after TEX are indeed likely to consume or generate values from/for the - // TEX clause. - // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause - // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need - // one GPR) or TmXYZW = TnXYZW (need 2 GPR). - // (TODO : use RegisterPressure) - // If we are going too use too many GPR, we flush Fetch instruction to lower - // register pressure on 128 bits regs. - unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); - if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) + if (ALUFetchRationEstimate == 0) { AllowSwitchFromAlu = true; + } else { + unsigned NeededWF = 62.5f / ALUFetchRationEstimate; + DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); + // We assume the local GPR requirements to be "dominated" by the requirement + // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and + // after TEX are indeed likely to consume or generate values from/for the + // TEX clause. + // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause + // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need + // one GPR) or TmXYZW = TnXYZW (need 2 GPR). + // (TODO : use RegisterPressure) + // If we are going too use too many GPR, we flush Fetch instruction to lower + // register pressure on 128 bits regs. 
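// Simplified standalone sketch, not part of the patch, of the heuristic the
// comment above describes, including the new guard for a zero ALU/fetch
// ratio that would otherwise divide by zero. It collapses the pending and
// available bookkeeping and the GPR-based wavefront limit into plain
// parameters; the helper name and the counts in main() are made up here.
#include <cassert>

static bool shouldSwitchFromAlu(unsigned AluCount, unsigned FetchCount,
                                unsigned WFCountLimitedByGPR) {
  float Ratio = FetchCount ? static_cast<float>(AluCount) / FetchCount : 0.0f;
  if (Ratio == 0.0f)
    return true; // nothing to estimate; allow switching away from ALU
  unsigned NeededWF = static_cast<unsigned>(62.5f / Ratio);
  return NeededWF > WFCountLimitedByGPR;
}

int main() {
  // 100 ALU instructions per 10 fetches -> ratio 10 -> about 6 wavefronts
  // needed, which fits under a GPR-limited budget of 8, so keep issuing ALU.
  assert(!shouldSwitchFromAlu(100, 10, 8));
  // No ALU instructions at all takes the new guard path.
  assert(shouldSwitchFromAlu(0, 10, 8));
  return 0;
}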
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); + if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) + AllowSwitchFromAlu = true; + } } if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) || diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h index fd475af..fc5b95c 100644 --- a/lib/Target/R600/R600MachineScheduler.h +++ b/lib/Target/R600/R600MachineScheduler.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef R600MACHINESCHEDULER_H_ -#define R600MACHINESCHEDULER_H_ +#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H +#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H #include "R600InstrInfo.h" #include "llvm/ADT/PriorityQueue.h" diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp index 2314136..742c0e0 100644 --- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp @@ -30,6 +30,7 @@ #include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "R600InstrInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -279,9 +280,8 @@ bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI, continue; if (PreviousRegSeqByReg[MOp->getReg()].empty()) continue; - std::vector<MachineInstr *> MIs = PreviousRegSeqByReg[MOp->getReg()]; - for (unsigned i = 0, e = MIs.size(); i < e; i++) { - CompatibleRSI = PreviousRegSeq[MIs[i]]; + for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) { + CompatibleRSI = PreviousRegSeq[MI]; if (RSI == CompatibleRSI) continue; if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan)) @@ -314,7 +314,7 @@ void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { } bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast<const R600InstrInfo *>(Fn.getTarget().getInstrInfo()); + TII = static_cast<const R600InstrInfo *>(Fn.getSubtarget().getInstrInfo()); MRI = &(Fn.getRegInfo()); for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); MBB != MBBe; ++MBB) { diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index 74cf309..ddf68c9 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -148,11 +148,11 @@ private: } public: // Ctor. - R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, - MachineDominatorTree &MDT) - : VLIWPacketizerList(MF, MLI, MDT, true), - TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())), - TRI(TII->getRegisterInfo()) { + R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) + : VLIWPacketizerList(MF, MLI, true), + TII(static_cast<const R600InstrInfo *>( + MF.getSubtarget().getInstrInfo())), + TRI(TII->getRegisterInfo()) { VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); } @@ -328,12 +328,11 @@ public: }; bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo(); + const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); - MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); // Instantiate the packetizer. - R600PacketizerList Packetizer(Fn, MLI, MDT); + R600PacketizerList Packetizer(Fn, MLI); // DFA state table should not be empty. 
assert(Packetizer.getResourceTracker() && "Empty DFA table!"); diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h index 247808b..f1a8a41 100644 --- a/lib/Target/R600/R600RegisterInfo.h +++ b/lib/Target/R600/R600RegisterInfo.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef R600REGISTERINFO_H_ -#define R600REGISTERINFO_H_ +#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H +#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H #include "AMDGPURegisterInfo.h" @@ -46,4 +46,4 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { } // End namespace llvm -#endif // AMDIDSAREGISTERINFO_H_ +#endif diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h index 4d31a11..2e7dab6 100644 --- a/lib/Target/R600/SIDefines.h +++ b/lib/Target/R600/SIDefines.h @@ -8,10 +8,11 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef SIDEFINES_H_ -#define SIDEFINES_H_ +#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H +#define LLVM_LIB_TARGET_R600_SIDEFINES_H namespace SIInstrFlags { +// This needs to be kept in sync with the field bits in InstSI. enum { MIMG = 1 << 3, SMRD = 1 << 4, @@ -19,10 +20,38 @@ enum { VOP2 = 1 << 6, VOP3 = 1 << 7, VOPC = 1 << 8, - SALU = 1 << 9 + SALU = 1 << 9, + MUBUF = 1 << 10, + MTBUF = 1 << 11, + FLAT = 1 << 12 }; } +namespace SIInstrFlags { + enum Flags { + // First 4 bits are the instruction encoding + VM_CNT = 1 << 0, + EXP_CNT = 1 << 1, + LGKM_CNT = 1 << 2 + }; +} + +namespace SISrcMods { + enum { + NEG = 1 << 0, + ABS = 1 << 1 + }; +} + +namespace SIOutMods { + enum { + NONE = 0, + MUL2 = 1, + MUL4 = 2, + DIV2 = 3 + }; +} + #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 #define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C #define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) @@ -32,6 +61,7 @@ enum { #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) #define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C +#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0) #define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC @@ -85,4 +115,7 @@ enum { #define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) #define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) -#endif // SIDEFINES_H_ +#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 +#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) + +#endif diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp index 5f71453..d6f4b4c 100644 --- a/lib/Target/R600/SIFixSGPRCopies.cpp +++ b/lib/Target/R600/SIFixSGPRCopies.cpp @@ -66,6 +66,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -195,10 +196,10 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( - MF.getTarget().getRegisterInfo()); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - MF.getTarget().getInstrInfo()); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI 
!= BE; ++BI) { @@ -237,14 +238,66 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { // If a PHI node defines an SGPR and any of its operands are VGPRs, // then we need to move it to the VALU. + // + // Also, if a PHI node defines an SGPR and has all SGPR operands + // we must move it to the VALU, because the SGPR operands will + // all end up being assigned the same register, which means + // there is a potential for a conflict if different threads take + // different control flow paths. + // + // For Example: + // + // sgpr0 = def; + // ... + // sgpr1 = def; + // ... + // sgpr2 = PHI sgpr0, sgpr1 + // use sgpr2; + // + // Will Become: + // + // sgpr2 = def; + // ... + // sgpr2 = def; + // ... + // use sgpr2 + // + // FIXME: This is OK if the branching decision is made based on an + // SGPR value. + bool SGPRBranch = false; + + // The one exception to this rule is when one of the operands + // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK + // instruction. In this case, there we know the program will + // never enter the second block (the loop) without entering + // the first block (where the condition is computed), so there + // is no chance for values to be over-written. + + bool HasBreakDef = false; for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { unsigned Reg = MI.getOperand(i).getReg(); if (TRI->hasVGPRs(MRI.getRegClass(Reg))) { TII->moveToVALU(MI); break; } + MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); + assert(DefInstr); + switch(DefInstr->getOpcode()) { + + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + // If we see a PHI instruction that defines an SGPR, then that PHI + // instruction has already been considered and should have + // a *_BREAK as an operand. + case AMDGPU::PHI: + HasBreakDef = true; + break; + } } + if (!SGPRBranch && !HasBreakDef) + TII->moveToVALU(MI); break; } case AMDGPU::REG_SEQUENCE: { @@ -252,8 +305,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { !hasVGPROperands(MI, TRI)) continue; - DEBUG(dbgs() << "Fixing REG_SEQUENCE:\n"); - DEBUG(MI.print(dbgs())); + DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); TII->moveToVALU(MI); break; @@ -265,8 +317,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); if (TRI->isSGPRClass(DstRC) && (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { - DEBUG(dbgs() << " Fixing INSERT_SUBREG:\n"); - DEBUG(MI.print(dbgs())); + DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); TII->moveToVALU(MI); } break; @@ -274,5 +325,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } } } - return false; + + return true; } diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp index 7d116ee..f34c375 100644 --- a/lib/Target/R600/SIFixSGPRLiveRanges.cpp +++ b/lib/Target/R600/SIFixSGPRLiveRanges.cpp @@ -9,18 +9,49 @@ // /// \file /// SALU instructions ignore control flow, so we need to modify the live ranges -/// of the registers they define. +/// of the registers they define in some cases. /// -/// The strategy is to view the entire program as if it were a single basic -/// block and calculate the intervals accordingly. We implement this -/// by walking this list of segments for each LiveRange and setting the -/// end of each segment equal to the start of the segment that immediately -/// follows it. +/// The main case we need to handle is when a def is used in one side of a +/// branch and not another. 
For example: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// +/// Here we need the register allocator to avoid assigning any of the defs +/// inside of the IF to the same register as %def. In traditional live +/// interval analysis %def is not live inside the IF branch, however, since +/// SALU instructions inside of IF will be executed even if the branch is not +/// taken, there is the chance that one of the instructions will overwrite the +/// value of %def, so the use in ELSE will see the wrong value. +/// +/// The strategy we use for solving this is to add an extra use after the ENDIF: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// %use +/// +/// Adding this use will make the def live thoughout the IF branch, which is +/// what we want. #include "AMDGPU.h" +#include "SIInstrInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" @@ -40,16 +71,15 @@ public: initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnMachineFunction(MachineFunction &MF) override; + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const override { + const char *getPassName() const override { return "SI Fix SGPR live ranges"; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LiveIntervals>(); - AU.addPreserved<LiveIntervals>(); - AU.addPreserved<SlotIndexes>(); + AU.addRequired<MachinePostDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -60,6 +90,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) @@ -73,36 +104,86 @@ FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( - MF.getTarget().getRegisterInfo()); + MF.getSubtarget().getRegisterInfo()); LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); + MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); + std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges; + + // First pass, collect all live intervals for SGPRs + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.defs()) { + if (MO.isImplicit()) + continue; + unsigned Def = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Def)) { + if (TRI->isSGPRClass(MRI.getRegClass(Def))) + SGPRLiveRanges.push_back( + std::make_pair(Def, &LIS->getInterval(Def))); + } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { + SGPRLiveRanges.push_back( + std::make_pair(Def, &LIS->getRegUnit(Def))); + } + } + } + } + // Second pass fix the intervals for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - for 
(MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - MachineOperand *ExecUse = MI.findRegisterUseOperand(AMDGPU::EXEC); - if (ExecUse) + if (MBB.succ_size() < 2) + continue; + + // We have structured control flow, so number of succesors should be two. + assert(MBB.succ_size() == 2); + MachineBasicBlock *SuccA = *MBB.succ_begin(); + MachineBasicBlock *SuccB = *(++MBB.succ_begin()); + MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); + + if (!NCD) + continue; + + MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator(); + + if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) { + assert(NCD->succ_size() == 2); + // We want to make sure we insert the Use after the ENDIF, not after + // the ELSE. + NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), + *(++NCD->succ_begin())); + } + assert(SuccA && SuccB); + for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) { + unsigned Reg = RegLR.first; + LiveRange *LR = RegLR.second; + + // FIXME: We could be smarter here. If the register is Live-In to + // one block, but the other doesn't have any SGPR defs, then there + // won't be a conflict. Also, if the branch decision is based on + // a value in an SGPR, then there will be no conflict. + bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); + bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); + + if ((!LiveInToA && !LiveInToB) || + (LiveInToA && LiveInToB)) continue; - for (const MachineOperand &Def : MI.operands()) { - if (!Def.isReg() || !Def.isDef() ||!TargetRegisterInfo::isVirtualRegister(Def.getReg())) - continue; - - const TargetRegisterClass *RC = MRI.getRegClass(Def.getReg()); - - if (!TRI->isSGPRClass(RC)) - continue; - LiveInterval &LI = LIS->getInterval(Def.getReg()); - for (unsigned i = 0, e = LI.size() - 1; i != e; ++i) { - LiveRange::Segment &Seg = LI.segments[i]; - LiveRange::Segment &Next = LI.segments[i + 1]; - Seg.end = Next.start; - } - } + // This interval is live in to one successor, but not the other, so + // we need to update its range so it is live in to both. + DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << + " BB#" << SuccA->getNumber() << ", BB#" << + SuccB->getNumber() << + " with NCD = " << NCD->getNumber() << '\n'); + + // FIXME: Need to figure out how to update LiveRange here so this pass + // will be able to preserve LiveInterval analysis. + BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::SGPR_USE)) + .addReg(Reg, RegState::Implicit); + DEBUG(NCD->getFirstNonPHI()->dump()); } } diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index b13c3b8..8d4164a 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -12,6 +12,12 @@ // //===----------------------------------------------------------------------===// +#ifdef _MSC_VER +// Provide M_PI. 
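+// (Illustration only: MSVC's <cmath> only exposes M_PI when
+// _USE_MATH_DEFINES is defined before the header is included; M_PI is
+// needed by LowerTrig below, which scales the angle by 0.5 / M_PI.)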
+#define _USE_MATH_DEFINES +#include <cmath> +#endif + #include "SIISelLowering.h" #include "AMDGPU.h" #include "AMDGPUIntrinsicInfo.h" @@ -19,6 +25,7 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -46,10 +53,10 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); - addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); - addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); computeRegisterProperties(); @@ -80,8 +87,15 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SUBC, MVT::i32, Legal); setOperationAction(ISD::SUBE, MVT::i32, Legal); + setOperationAction(ISD::FSIN, MVT::f32, Custom); + setOperationAction(ISD::FCOS, MVT::f32, Custom); + + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + // We need to custom lower vector stores from local memory - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::LOAD, MVT::v16i32, Custom); @@ -89,12 +103,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::v16i32, Custom); - // We need to custom lower loads/stores from private memory - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - setOperationAction(ISD::LOAD, MVT::v8i32, Custom); - setOperationAction(ISD::STORE, MVT::i1, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); @@ -114,6 +122,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::SETCC, MVT::v4i1, Expand); + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); @@ -126,8 +136,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom); - + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -179,6 +188,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32 }; + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT, MVT::i1, Promote); + for (MVT VT : VecTypes) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { 
@@ -188,10 +200,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : case ISD::BITCAST: case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: - case ISD::CONCAT_VECTORS: case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: break; + case ISD::CONCAT_VECTORS: + setOperationAction(Op, VT, Custom); + break; default: setOperationAction(Op, VT, Expand); break; @@ -213,16 +227,37 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::FRINT, MVT::f64, Legal); } - // FIXME: These should be removed and handled the same was as f32 fneg. Source - // modifiers also work for the double instructions. - setOperationAction(ISD::FNEG, MVT::f64, Expand); - setOperationAction(ISD::FABS, MVT::f64, Expand); + setOperationAction(ISD::FDIV, MVT::f32, Custom); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::UINT_TO_FP); + // All memory operations. Some folding on the pointer operand is done to help + // matching the constant offsets in the addressing modes. + setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::ATOMIC_LOAD); + setTargetDAGCombine(ISD::ATOMIC_STORE); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); + setTargetDAGCombine(ISD::ATOMIC_SWAP); + setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); + setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); + setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); + setSchedulingPreference(Sched::RegPressure); } @@ -230,15 +265,63 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : // TargetLowering queries //===----------------------------------------------------------------------===// -bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, - unsigned AddrSpace, - bool *IsFast) const { +bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, + EVT) const { + // SI has some legal vector types, but no legal vector operations. Say no + // shuffles are legal in order to prefer scalarizing some vector operations. + return false; +} + +// FIXME: This really needs an address space argument. The immediate offset +// size is different for different sets of memory instruction sets. + +// The single offset DS instructions have a 16-bit unsigned byte offset. +// +// MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r + +// r + i with addr64. 32-bit has more addressing mode options. Depending on the +// resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i). +// +// SMRD instructions have an 8-bit, dword offset. +// +bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + // Allow a 16-bit unsigned immediate field, since this is what DS instructions + // use. + if (!isUInt<16>(AM.BaseOffs)) + return false; + + // Only support r+r, + switch (AM.Scale) { + case 0: // "r+i" or just "i", depending on HasBaseReg. + break; + case 1: + if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. 
+ return false; + // Otherwise we have r+r or r+i. + break; + case 2: + if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. + return false; + // Allow 2*r as r+r. + break; + default: // Don't allow n * r + return false; + } + + return true; +} + +bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *IsFast) const { if (IsFast) *IsFast = false; - // XXX: This depends on the address space and also we may want to revist - // the alignment values we specify in the DataLayout. - // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, // which isn't a simple VT. if (!VT.isSimple() || VT == MVT::Other) @@ -248,28 +331,44 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, // see what for specifically. The wording everywhere else seems to be the // same. - // 3.6.4 - Operations using pairs of VGPRs (for example: double-floats) have - // no alignment restrictions. - if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - // Using any pair of GPRs should be the same as any other pair. - if (IsFast) - *IsFast = true; - return VT.bitsGE(MVT::i64); - } - // XXX - The only mention I see of this in the ISA manual is for LDS direct // reads the "byte address and must be dword aligned". Is it also true for the // normal loads and stores? - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) - return false; + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { + // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte + // aligned, 8 byte access in a single operation using ds_read2/write2_b32 + // with adjacent offsets. + return Align % 4 == 0; + } // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the // byte-address are ignored, thus forcing Dword alignment. + // This applies to private, global, and constant memory. if (IsFast) *IsFast = true; return VT.bitsGT(MVT::i32); } +EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // FIXME: Should account for address space here. + + // The default fallback uses the private pointer size as a guess for a type to + // use. Make sure we switch these to 64-bit accesses. + + if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global + return MVT::v4i32; + + if (Size >= 8 && DstAlign >= 4) + return MVT::v2i32; + + // Use the default. 
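+  // (Illustration only, hypothetical sizes: a 64-byte memcpy with a 4-byte
+  // aligned destination takes the v4i32 case above and is expanded into four
+  // 16-byte accesses, an 8-byte copy becomes a single v2i32 access, and
+  // anything smaller, or with a destination aligned below 4 bytes, falls
+  // through to the MVT::Other default below.)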
+ return MVT::Other; +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -280,25 +379,37 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const { bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); return TII->isInlineConstant(Imm); } SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, - SDLoc DL, SDValue Chain, + SDLoc SL, SDValue Chain, unsigned Offset, bool Signed) const { + const DataLayout *DL = getDataLayout(); + MachineFunction &MF = DAG.getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_ADDRESS); - SDValue BasePtr = DAG.getCopyFromReg(Chain, DL, - MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64); - SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, + MRI.getLiveInVirtReg(InputPtrReg), MVT::i64); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, BasePtr, DAG.getConstant(Offset, MVT::i64)); - return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr, - MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, - false, false, MemVT.getSizeInBits() >> 3); - + SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + + return DAG.getLoad(ISD::UNINDEXED, Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD, + VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, + false, // isVolatile + true, // isNonTemporal + true, // isInvariant + DL->getABITypeAlignment(Ty)); // Alignment } SDValue SITargetLowering::LowerFormalArguments( @@ -309,7 +420,9 @@ SDValue SITargetLowering::LowerFormalArguments( SDLoc DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetMachine &TM = getTargetMachine(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo()); MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); @@ -318,20 +431,20 @@ SDValue SITargetLowering::LowerFormalArguments( assert(CallConv == CallingConv::C); SmallVector<ISD::InputArg, 16> Splits; - uint32_t Skipped = 0; + BitVector Skipped(Ins.size()); for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; // First check if it's a PS input addr - if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() && + if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && !Arg.Flags.isByVal()) { assert((PSInputNum <= 15) && "Too many PS inputs!"); if (!Arg.Used) { // We can savely skip PS inputs - Skipped |= 1 << i; + Skipped.set(i); ++PSInputNum; continue; } @@ -340,7 +453,7 @@ SDValue SITargetLowering::LowerFormalArguments( } // Second split vertices into their elements - if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) { + if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { ISD::InputArg NewArg = Arg; NewArg.Flags.setSplit(); NewArg.VT = Arg.VT.getVectorElementType(); @@ -356,30 +469,51 @@ SDValue SITargetLowering::LowerFormalArguments( NewArg.PartOffset += NewArg.VT.getStoreSize(); } - } else if (Info->ShaderType != ShaderType::COMPUTE) { + } else if (Info->getShaderType() != ShaderType::COMPUTE) { Splits.push_back(Arg); } } SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // At least one interpolation mode must be enabled or else the GPU will hang. 
- if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) { + if (Info->getShaderType() == ShaderType::PIXEL && + (Info->PSInputAddr & 0x7F) == 0) { Info->PSInputAddr |= 1; CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); } // The pointer to the list of arguments is stored in SGPR0, SGPR1 - if (Info->ShaderType == ShaderType::COMPUTE) { - CCInfo.AllocateReg(AMDGPU::SGPR0); - CCInfo.AllocateReg(AMDGPU::SGPR1); - MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass); + // The pointer to the scratch buffer is stored in SGPR2, SGPR3 + if (Info->getShaderType() == ShaderType::COMPUTE) { + Info->NumUserSGPRs = 4; + + unsigned InputPtrReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrRegLo = + TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); + unsigned InputPtrRegHi = + TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); + + unsigned ScratchPtrReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); + unsigned ScratchPtrRegLo = + TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); + unsigned ScratchPtrRegHi = + TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); + + CCInfo.AllocateReg(InputPtrRegLo); + CCInfo.AllocateReg(InputPtrRegHi); + CCInfo.AllocateReg(ScratchPtrRegLo); + CCInfo.AllocateReg(ScratchPtrRegHi); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); } - if (Info->ShaderType == ShaderType::COMPUTE) { + if (Info->getShaderType() == ShaderType::COMPUTE) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); } @@ -389,23 +523,36 @@ SDValue SITargetLowering::LowerFormalArguments( for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; - if (Skipped & (1 << i)) { + if (Skipped[i]) { InVals.push_back(DAG.getUNDEF(Arg.VT)); continue; } CCValAssign &VA = ArgLocs[ArgIdx++]; - EVT VT = VA.getLocVT(); + MVT VT = VA.getLocVT(); if (VA.isMemLoc()) { VT = Ins[i].VT; EVT MemVT = Splits[i].VT; + const unsigned Offset = 36 + VA.getLocMemOffset(); // The first 36 bytes of the input buffer contains information about // thread group and global sizes. SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), - 36 + VA.getLocMemOffset(), - Ins[i].Flags.isSExt()); + Offset, Ins[i].Flags.isSExt()); + + const PointerType *ParamTy = + dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex)); + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + // On SI local pointers are just offsets into LDS, so they are always + // less than 16-bits. On CI and newer they could potentially be + // real pointers, so we can't guarantee their size. 
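+        // For example (illustration only): a kernel argument declared as a
+        // local (LDS) pointer, e.g. "__local int *p" in OpenCL, is loaded
+        // from the argument buffer like any other 32-bit value; on SI the
+        // AssertZext added below records that only the low 16 bits can be
+        // non-zero, so later combines can drop redundant masks or
+        // zero-extensions of the offset.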
+ Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, + DAG.getValueType(MVT::i16)); + } + InVals.push_back(Arg); + Info->ABIArgOffset = Offset + MemVT.getStoreSize(); continue; } assert(VA.isRegLoc() && "Parameter must be in a register!"); @@ -458,39 +605,13 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_ADDR64_RSRC: { - unsigned SuperReg = MI->getOperand(0).getReg(); - unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); - unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); - unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo) - .addOperand(MI->getOperand(1)); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo) - .addImm(0); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi) - .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi) - .addReg(SubRegHiLo) - .addImm(AMDGPU::sub0) - .addReg(SubRegHiHi) - .addImm(AMDGPU::sub1); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg) - .addReg(SubRegLo) - .addImm(AMDGPU::sub0_sub1) - .addReg(SubRegHi) - .addImm(AMDGPU::sub2_sub3); - MI->eraseFromParent(); - break; - } case AMDGPU::V_SUB_F64: { unsigned DestReg = MI->getOperand(0).getReg(); BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg) @@ -498,8 +619,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( .addReg(MI->getOperand(1).getReg()) .addImm(1) // SRC1 modifiers .addReg(MI->getOperand(2).getReg()) - .addImm(0) // SRC2 modifiers - .addImm(0) // src2 .addImm(0) // CLAMP .addImm(0); // OMOD MI->eraseFromParent(); @@ -517,49 +636,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI->eraseFromParent(); break; } - case AMDGPU::FABS_SI: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), - Reg) - .addImm(0x7fffffff); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32), - MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()) - .addReg(Reg); - MI->eraseFromParent(); - break; - } - case AMDGPU::FNEG_SI: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), - Reg) - .addImm(0x80000000); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32), - MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()) - 
.addReg(Reg); - MI->eraseFromParent(); - break; - } - case AMDGPU::FCLAMP_SI: { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64), - MI->getOperand(0).getReg()) - .addImm(0) // SRC0 modifiers - .addOperand(MI->getOperand(1)) - .addImm(0) // SRC1 modifiers - .addImm(0) // SRC1 - .addImm(1) // CLAMP - .addImm(0); // OMOD - MI->eraseFromParent(); - } } return BB; } @@ -598,148 +674,31 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { //===----------------------------------------------------------------------===// SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: { - LoadSDNode *Load = dyn_cast<LoadSDNode>(Op); - EVT VT = Op.getValueType(); - - // These loads are legal. - if (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && - VT.isVector() && VT.getVectorNumElements() == 2 && - VT.getVectorElementType() == MVT::i32) - return SDValue(); - - if (Op.getValueType().isVector() && - (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS || - (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && - Op.getValueType().getVectorNumElements() > 4))) { - return SplitVectorLoad(Op, DAG); - } else { - SDValue Result = LowerLOAD(Op, DAG); - assert((!Result.getNode() || - Result.getNode()->getNumValues() == 2) && - "Load should return a value and a chain"); - return Result; - } + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; } + case ISD::FSIN: + case ISD::FCOS: + return LowerTrig(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::FDIV: return LowerFDIV(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrinsicID = - cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - //XXX: Hardcoded we only use two to store the pointer to the parameters. 
- unsigned NumUserSGPRs = 2; - switch (IntrinsicID) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case Intrinsic::r600_read_ngroups_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false); - case Intrinsic::r600_read_ngroups_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false); - case Intrinsic::r600_read_ngroups_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false); - case Intrinsic::r600_read_global_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false); - case Intrinsic::r600_read_global_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false); - case Intrinsic::r600_read_global_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false); - case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false); - case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false); - case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false); - case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT); - case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT); - case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT); - case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, - AMDGPU::VGPR0, VT); - case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, - AMDGPU::VGPR1, VT); - case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, - AMDGPU::VGPR2, VT); - case AMDGPUIntrinsic::SI_load_const: { - SDValue Ops [] = { - Op.getOperand(1), - Op.getOperand(2) - }; - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, - VT.getSizeInBits() / 8, 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, - Op->getVTList(), Ops, VT, MMO); - } - case AMDGPUIntrinsic::SI_sample: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); - case AMDGPUIntrinsic::SI_sampleb: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); - case AMDGPUIntrinsic::SI_sampled: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); - case AMDGPUIntrinsic::SI_samplel: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); - case AMDGPUIntrinsic::SI_vs_load_input: - return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - } + case ISD::GlobalAddress: { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + return LowerGlobalAddress(MFI, Op, DAG); } - - case ISD::INTRINSIC_VOID: - SDValue Chain = Op.getOperand(0); - unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - - switch (IntrinsicID) { - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDLoc DL(Op); - SDValue Ops [] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - 
Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getSizeInBits() / 8, 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); - } - default: - break; - } + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); } return SDValue(); } @@ -760,6 +719,14 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { return nullptr; } +SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + + FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); + unsigned FrameIndex = FINode->getIndex(); + + return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); +} + /// This transforms the control flow intrinsics to get the branch destination as /// last parameter, also switches branch target with BR if the need arise SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, @@ -810,7 +777,9 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, BR->getOperand(0), BRCOND.getOperand(2) }; - DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops); + SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); + DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); + BR = NewBR.getNode(); } SDValue Chain = SDValue(Result, Result->getNumValues() - 1); @@ -838,56 +807,190 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } -SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - LoadSDNode *Load = cast<LoadSDNode>(Op); - SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG); - if (Lowered.getNode()) - return Lowered; +SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, + SDValue Op, + SelectionDAG &DAG) const { + GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); - if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { - return SDValue(); - } + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); - EVT MemVT = Load->getMemoryVT(); + SDLoc DL(GSD); + const GlobalValue *GV = GSD->getGlobal(); + MVT PtrVT = getPointerTy(GSD->getAddressSpace()); - assert(!MemVT.isVector() && "Private loads should be scalarized"); - assert(!MemVT.isFloatingPoint() && "FP loads should be promoted to int"); + SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), - DAG.getConstant(2, MVT::i32)); + SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, + DAG.getConstant(0, MVT::i32)); + SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, + DAG.getConstant(1, MVT::i32)); + + SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), + PtrLo, GA); + SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), + PtrHi, DAG.getConstant(0, MVT::i32), + SDValue(Lo.getNode(), 1)); + return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); +} - // FIXME: REGISTER_LOAD should probably have a chain result. 
- SDValue Chain = Load->getChain(); - SDValue LoLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Chain, Ptr, - DAG.getTargetConstant(0, MVT::i32), - Op.getOperand(2)); +SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(MF.getSubtarget().getRegisterInfo()); - SDValue Ret = LoLoad.getValue(0); - if (MemVT.getSizeInBits() == 64) { - // TODO: This needs a test to make sure the right thing is happening with - // the chain. That is hard without general function support. + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + + switch (IntrinsicID) { + case Intrinsic::r600_read_ngroups_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_X, false); + case Intrinsic::r600_read_ngroups_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Y, false); + case Intrinsic::r600_read_ngroups_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Z, false); + case Intrinsic::r600_read_global_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + case Intrinsic::r600_read_global_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + case Intrinsic::r600_read_global_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + case Intrinsic::r600_read_local_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_X, false); + case Intrinsic::r600_read_local_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + case Intrinsic::r600_read_local_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_Z, false); + + case Intrinsic::AMDGPU_read_workdim: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset, + false); + + case Intrinsic::r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + case Intrinsic::r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + case Intrinsic::r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + case Intrinsic::r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + case Intrinsic::r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + case Intrinsic::r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + case AMDGPUIntrinsic::SI_load_const: { + SDValue Ops[] = { + Op.getOperand(1), + Op.getOperand(2) + }; - SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(1, MVT::i32)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | 
MachineMemOperand::MOInvariant, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, + Op->getVTList(), Ops, VT, MMO); + } + case AMDGPUIntrinsic::SI_sample: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); + case AMDGPUIntrinsic::SI_sampleb: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); + case AMDGPUIntrinsic::SI_sampled: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); + case AMDGPUIntrinsic::SI_samplel: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); + case AMDGPUIntrinsic::SI_vs_load_input: + return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + default: + return AMDGPUTargetLowering::LowerOperation(Op, DAG); + } +} - SDValue HiLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Chain, IncPtr, - DAG.getTargetConstant(0, MVT::i32), - Op.getOperand(2)); +SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoLoad, HiLoad); - // Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - // LoLoad.getValue(1), HiLoad.getValue(1)); + switch (IntrinsicID) { + case AMDGPUIntrinsic::SI_tbuffer_store: { + SDLoc DL(Op); + SDValue Ops[] = { + Chain, + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4), + Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10), + Op.getOperand(11), + Op.getOperand(12), + Op.getOperand(13), + Op.getOperand(14) + }; + + EVT VT = Op.getOperand(3).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); } + default: + return SDValue(); + } +} - SDValue Ops[] = { - Ret, - Chain - }; +SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast<LoadSDNode>(Op); + + if (Op.getValueType().isVector()) { + assert(Op.getValueType().getVectorElementType() == MVT::i32 && + "Custom lowering for non-i32 vectors hasn't been implemented."); + unsigned NumElements = Op.getValueType().getVectorNumElements(); + assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + switch (Load->getAddressSpace()) { + default: break; + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::PRIVATE_ADDRESS: + // v4 loads are supported for private and global memory. + if (NumElements <= 4) + break; + // fall-through + case AMDGPUAS::LOCAL_ADDRESS: + return ScalarizeVectorLoad(Op, DAG); + } + } - return DAG.getMergeValues(Ops, DL); + return AMDGPUTargetLowering::LowerLOAD(Op, DAG); } SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, @@ -926,6 +1029,100 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); } +// Catch division cases where we can use shortcuts with rcp and rsq +// instructions. 
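+// For example (illustration only, f32 with denormals disabled or with
+// unsafe-fp-math):
+//   fdiv 1.0, x          --> AMDGPUISD::RCP x    (v_rcp_f32)
+//   fdiv 1.0, (fsqrt x)  --> AMDGPUISD::RSQ x    (v_rsq_f32)
+// and under unsafe-fp-math any other fdiv a, b is rewritten as
+//   fmul a, (AMDGPUISD::RCP b)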
+SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + EVT VT = Op.getValueType(); + bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; + + if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { + if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && + CLHS->isExactlyValue(1.0)) { + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + + // 1.0 / sqrt(x) -> rsq(x) + // + // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // error seems really high at 2^29 ULP. + if (RHS.getOpcode() == ISD::FSQRT) + return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); + + // 1.0 / x -> rcp(x) + return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + } + } + + if (Unsafe) { + // Turn into multiply by the reciprocal. + // x / y -> x * (1.0 / y) + SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); + } + + return SDValue(); +} + +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + SDValue FastLowered = LowerFastFDIV(Op, DAG); + if (FastLowered.getNode()) + return FastLowered; + + // This uses v_rcp_f32 which does not handle denormals. Let this hit a + // selection error for now rather than do something incorrect. + if (Subtarget->hasFP32Denormals()) + return SDValue(); + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32); + + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32); + + const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} + +SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(); +} + +SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFDIV32(Op, DAG); + + if (VT == MVT::f64) + return LowerFDIV64(Op, DAG); + + llvm_unreachable("Unexpected type for fdiv"); +} + SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); StoreSDNode *Store = cast<StoreSDNode>(Op); @@ -937,79 +1134,42 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { VT.getVectorElementType() == MVT::i32) return SDValue(); + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (VT.isVector() && VT.getVectorNumElements() > 4) + return ScalarizeVectorStore(Op, DAG); + return SDValue(); + } + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); if (Ret.getNode()) return Ret; if (VT.isVector() && VT.getVectorNumElements() >= 8) - return SplitVectorStore(Op, DAG); + return 
ScalarizeVectorStore(Op, DAG); if (VT == MVT::i1) return DAG.getTruncStore(Store->getChain(), DL, DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), Store->getBasePtr(), MVT::i1, Store->getMemOperand()); - if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) - return SDValue(); + return SDValue(); +} - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Store->getBasePtr(), - DAG.getConstant(2, MVT::i32)); - SDValue Chain = Store->getChain(); - SmallVector<SDValue, 8> Values; - - if (Store->isTruncatingStore()) { - unsigned Mask = 0; - if (Store->getMemoryVT() == MVT::i8) { - Mask = 0xff; - } else if (Store->getMemoryVT() == MVT::i16) { - Mask = 0xffff; - } - SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Chain, Store->getBasePtr(), - DAG.getConstant(0, MVT::i32)); - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(), - DAG.getConstant(0x3, MVT::i32)); - SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, MVT::i32)); - SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getValue(), - DAG.getConstant(Mask, MVT::i32)); - SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, - MaskedValue, ShiftAmt); - SDValue RotrAmt = DAG.getNode(ISD::SUB, DL, MVT::i32, - DAG.getConstant(32, MVT::i32), ShiftAmt); - SDValue DstMask = DAG.getNode(ISD::ROTR, DL, MVT::i32, - DAG.getConstant(Mask, MVT::i32), - RotrAmt); - Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); - Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); - - Values.push_back(Dst); - } else if (VT == MVT::i64) { - for (unsigned i = 0; i < 2; ++i) { - Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, - Store->getValue(), DAG.getConstant(i, MVT::i32))); - } - } else if (VT == MVT::i128) { - for (unsigned i = 0; i < 2; ++i) { - for (unsigned j = 0; j < 2; ++j) { - Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, - Store->getValue(), DAG.getConstant(i, MVT::i32)), - DAG.getConstant(j, MVT::i32))); - } - } - } else { - Values.push_back(Store->getValue()); - } +SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDValue Arg = Op.getOperand(0); + SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT, + DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg, + DAG.getConstantFP(0.5 / M_PI, VT))); - for (unsigned i = 0; i < Values.size(); ++i) { - SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, - Ptr, DAG.getConstant(i, MVT::i32)); - Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Values[i], PartPtr, - DAG.getTargetConstant(0, MVT::i32)); + switch (Op.getOpcode()) { + case ISD::FCOS: + return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); + case ISD::FSIN: + return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); + default: + llvm_unreachable("Wrong trig opcode"); } - return Chain; } //===----------------------------------------------------------------------===// @@ -1106,6 +1266,111 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, return SDValue(); } +// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) + +// This is a variant of +// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), +// +// The normal DAG combiner will do this, but only if the add has one use since +// that would increase the number of instructions. 
+//
+// This prevents us from seeing a constant offset that can be folded into a
+// memory instruction's addressing mode. If we know the resulting add offset of
+// a pointer can be folded into an addressing offset, we can replace the pointer
+// operand with the add of the new constant offset. This eliminates one of the uses,
+// and may allow the remaining use to also be simplified.
+//
+SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
+                                               unsigned AddrSpace,
+                                               DAGCombinerInfo &DCI) const {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (N0.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
+  if (!CN1)
+    return SDValue();
+
+  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (!CAdd)
+    return SDValue();
+
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      getTargetMachine().getSubtargetImpl()->getInstrInfo());
+
+  // If the resulting offset is too large, we can't fold it into the addressing
+  // mode offset.
+  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
+  if (!TII->canFoldOffset(Offset.getZExtValue(), AddrSpace))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  EVT VT = N->getValueType(0);
+
+  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
+  SDValue COffset = DAG.getConstant(Offset, MVT::i32);
+
+  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
+}
+
+static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
+  switch (Opc) {
+  case ISD::FMAXNUM:
+    return AMDGPUISD::FMAX3;
+  case AMDGPUISD::SMAX:
+    return AMDGPUISD::SMAX3;
+  case AMDGPUISD::UMAX:
+    return AMDGPUISD::UMAX3;
+  case ISD::FMINNUM:
+    return AMDGPUISD::FMIN3;
+  case AMDGPUISD::SMIN:
+    return AMDGPUISD::SMIN3;
+  case AMDGPUISD::UMIN:
+    return AMDGPUISD::UMIN3;
+  default:
+    llvm_unreachable("Not a min/max opcode");
+  }
+}
+
+SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  unsigned Opc = N->getOpcode();
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Only do this if the inner op has one use since this will just increase
+  // register pressure for no benefit.
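// Illustrative aside, not part of the original change: the scalar identity the
// min3/max3 fold below relies on. Two chained binary max operations compute the
// same value as one three-operand max, which the hardware can issue as a single
// V_MAX3-style instruction (ignoring NaN-ordering subtleties). The fold is only
// worthwhile when the inner node has a single use; otherwise the two-operand
// instruction must be emitted anyway and the combine would only add register
// pressure. The helper name below is hypothetical.
static float max3Reference(float A, float B, float C) {
  float Inner = (A > B) ? A : B;    // max(A, B)
  return (Inner > C) ? Inner : C;   // max(max(A, B), C) == max3(A, B, C)
}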
+ + // max(max(a, b), c) + if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0.getOperand(0), + Op0.getOperand(1), + Op1); + } + + // max(a, max(b, c)) + if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0, + Op1.getOperand(0), + Op1.getOperand(1)); + } + + return SDValue(); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -1114,20 +1379,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - case ISD::SELECT_CC: { - ConstantSDNode *True, *False; - // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) - if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2))) - && (False = dyn_cast<ConstantSDNode>(N->getOperand(3))) - && True->isAllOnesValue() - && False->isNullValue() - && VT == MVT::i1) { - return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0), - N->getOperand(1), N->getOperand(4)); - - } - break; - } case ISD::SETCC: { SDValue Arg0 = N->getOperand(0); SDValue Arg1 = N->getOperand(1); @@ -1147,6 +1398,17 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } break; } + case ISD::FMAXNUM: // TODO: What about fmax_legacy? + case ISD::FMINNUM: + case AMDGPUISD::SMAX: + case AMDGPUISD::SMIN: + case AMDGPUISD::UMAX: + case AMDGPUISD::UMIN: { + if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && + getTargetMachine().getOptLevel() > CodeGenOpt::None) + return performMin3Max3Combine(N, DCI); + break; + } case AMDGPUISD::CVT_F32_UBYTE0: case AMDGPUISD::CVT_F32_UBYTE1: @@ -1171,16 +1433,151 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: { return performUCharToFloatCombine(N, DCI); + + case ISD::FADD: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + EVT VT = N->getValueType(0); + if (VT != MVT::f32) + break; + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // These should really be instruction patterns, but writing patterns with + // source modiifiers is a pain. + + // fadd (fadd (a, a), b) -> mad 2.0, a, b + if (LHS.getOpcode() == ISD::FADD) { + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32); + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS); + } + } + + // fadd (b, fadd (a, a)) -> mad 2.0, a, b + if (RHS.getOpcode() == ISD::FADD) { + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32); + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS); + } + } + + break; + } + case ISD::FSUB: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + EVT VT = N->getValueType(0); + + // Try to get the fneg to fold into the source modifier. This undoes generic + // DAG combines and folds them into the mad. 
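// Illustrative aside, not part of the original change: scalar forms of the mad
// folds performed in this combine, written with a plain a * b + c helper that
// stands in for V_MAD_F32. Negating an operand is exact in IEEE arithmetic, so
// feeding (fneg c) or a -2.0 constant into the mad matches the original fsub.
// All helper names are hypothetical.
static float madReference(float A, float B, float C) {
  return A * B + C;
}

// fadd (fadd a, a), b  ->  mad 2.0, a, b
static float doubleAPlusB(float A, float B) {
  return madReference(2.0f, A, B);
}

// fsub (fmul a, b), c  ->  mad a, b, (fneg c)
static float mulThenSub(float A, float B, float C) {
  return madReference(A, B, -C);
}

// fsub c, (fadd a, a)  ->  mad -2.0, a, c
static float subDoubleA(float C, float A) {
  return madReference(-2.0f, A, C);
}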
+ if (VT == MVT::f32) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if (LHS.getOpcode() == ISD::FMUL) { + // (fsub (fmul a, b), c) -> mad a, b, (fneg c) + + SDValue A = LHS.getOperand(0); + SDValue B = LHS.getOperand(1); + SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS); + + return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); + } + + if (RHS.getOpcode() == ISD::FMUL) { + // (fsub c, (fmul a, b)) -> mad (fneg a), b, c + + SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0)); + SDValue B = RHS.getOperand(1); + SDValue C = LHS; + + return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); + } + + if (LHS.getOpcode() == ISD::FADD) { + // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) + + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32); + SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); + + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS); + } + } + + if (RHS.getOpcode() == ISD::FADD) { + // (fsub c, (fadd a, a)) -> mad -2.0, a, c + + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue NegTwo = DAG.getTargetConstantFP(-2.0, MVT::f32); + return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS); + } + } + } + + break; } } + case ISD::LOAD: + case ISD::STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: + case ISD::ATOMIC_CMP_SWAP: + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. + if (DCI.isBeforeLegalize()) + break; + + MemSDNode *MemNode = cast<MemSDNode>(N); + SDValue Ptr = MemNode->getBasePtr(); + // TODO: We could also do this for multiplies. + unsigned AS = MemNode->getAddressSpace(); + if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); + if (NewPtr) { + SmallVector<SDValue, 8> NewOps; + for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I) + NewOps.push_back(MemNode->getOperand(I)); + + NewOps[N->getOpcode() == ISD::STORE ? 
2 : 1] = NewPtr; + return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); + } + } + break; + } + } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } /// \brief Test if RegClass is one of the VSrc classes static bool isVSrc(unsigned RegClass) { - return AMDGPU::VSrc_32RegClassID == RegClass || - AMDGPU::VSrc_64RegClassID == RegClass; + switch(RegClass) { + default: return false; + case AMDGPU::VSrc_32RegClassID: + case AMDGPU::VCSrc_32RegClassID: + case AMDGPU::VSrc_64RegClassID: + case AMDGPU::VCSrc_64RegClassID: + return true; + } } /// \brief Test if RegClass is one of the SSrc classes @@ -1227,8 +1624,8 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, bool &ScalarSlotUsed) const { MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); if (!Mov || !TII->isMov(Mov->getMachineOpcode())) return false; @@ -1262,8 +1659,8 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, const TargetRegisterClass *SITargetLowering::getRegClassForNode( SelectionDAG &DAG, const SDValue &Op) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); const SIRegisterInfo &TRI = TII->getRegisterInfo(); if (!Op->isMachineOpcode()) { @@ -1292,10 +1689,9 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode( // If the COPY_TO_REGCLASS instruction is copying to a VSrc register // class, then the register class for the value could be either a // VReg or and SReg. In order to get a more accurate - if (OpClassID == AMDGPU::VSrc_32RegClassID || - OpClassID == AMDGPU::VSrc_64RegClassID) { + if (isVSrc(OpClassID)) return getRegClassForNode(DAG, Op.getOperand(0)); - } + return TRI.getRegClass(OpClassID); case AMDGPU::EXTRACT_SUBREG: { int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); @@ -1315,7 +1711,8 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode( /// \brief Does "Op" fit into register class "RegClass" ? bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op, unsigned RegClass) const { - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); const TargetRegisterClass *RC = getRegClassForNode(DAG, Op); if (!RC) { return false; @@ -1323,37 +1720,6 @@ bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op, return TRI->getRegClass(RegClass)->hasSubClassEq(RC); } -/// \brief Make sure that we don't exeed the number of allowed scalars -void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, - unsigned RegClass, - bool &ScalarSlotUsed) const { - - // First map the operands register class to a destination class - if (RegClass == AMDGPU::VSrc_32RegClassID) - RegClass = AMDGPU::VReg_32RegClassID; - else if (RegClass == AMDGPU::VSrc_64RegClassID) - RegClass = AMDGPU::VReg_64RegClassID; - else - return; - - // Nothing to do if they fit naturally - if (fitsRegClass(DAG, Operand, RegClass)) - return; - - // If the scalar slot isn't used yet use it now - if (!ScalarSlotUsed) { - ScalarSlotUsed = true; - return; - } - - // This is a conservative aproach. 
It is possible that we can't determine the - // correct register class and copy too often, but better safe than sorry. - SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32); - SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), - Operand.getValueType(), Operand, RC); - Operand = SDValue(Node, 0); -} - /// \returns true if \p Node's operands are different from the SDValue list /// \p Ops static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) { @@ -1365,14 +1731,15 @@ static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) { return false; } -/// \brief Try to fold the Nodes operands into the Node -SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, - SelectionDAG &DAG) const { - +/// TODO: This needs to be removed. It's current primary purpose is to fold +/// immediates into operands when legal. The legalization parts are redundant +/// with SIInstrInfo::legalizeOperands which is called in a post-isel hook. +SDNode *SITargetLowering::legalizeOperands(MachineSDNode *Node, + SelectionDAG &DAG) const { // Original encoding (either e32 or e64) int Opcode = Node->getMachineOpcode(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); const MCInstrDesc *Desc = &TII->get(Opcode); unsigned NumDefs = Desc->getNumDefs(); @@ -1385,13 +1752,6 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, assert(!DescRev || DescRev->getNumDefs() == NumDefs); assert(!DescRev || DescRev->getNumOperands() == NumOps); - // e64 version if available, -1 otherwise - int OpcodeE64 = AMDGPU::getVOPe64(Opcode); - const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64); - int InputModifiers[3] = {0}; - - assert(!DescE64 || DescE64->getNumDefs() == NumDefs); - int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; bool HaveVSrc = false, HaveSSrc = false; @@ -1421,9 +1781,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, // No scalar allowed when we have both VSrc and SSrc bool ScalarSlotUsed = HaveVSrc && HaveSSrc; + // If this instruction has an implicit use of VCC, then it can't use the + // constant bus. + for (unsigned i = 0, e = Desc->getNumImplicitUses(); i != e; ++i) { + if (Desc->ImplicitUses[i] == AMDGPU::VCC) { + ScalarSlotUsed = true; + break; + } + } + // Second go over the operands and try to fold them std::vector<SDValue> Ops; - bool Promote2e64 = false; for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; i != e && Op < NumOps; ++i, ++Op) { @@ -1438,11 +1806,9 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, // Is this a VSrc or SSrc operand? unsigned RegClass = Desc->OpInfo[Op].RegClass; if (isVSrc(RegClass) || isSSrc(RegClass)) { - // Try to fold the immediates - if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { - // Folding didn't work, make sure we don't hit the SReg limit. - ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); - } + // Try to fold the immediates. If this ends up with multiple constant bus + // uses, it will be legalized later. 
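// Illustrative aside, not part of the original change: a rough model of the
// "scalar slot" tracked by ScalarSlotUsed. The working assumption is that a
// VALU instruction can read at most one value over the constant bus (one SGPR,
// one literal constant, or an implicit VCC use), so once that slot is claimed
// any further scalar operand has to stay in a vector register or be fixed up by
// the later legalization step. The type below is hypothetical.
struct ScalarSlot {
  bool Used = false;

  // Claim the single constant-bus slot; returns false if it is already taken.
  bool tryClaim() {
    if (Used)
      return false;
    Used = true;
    return true;
  }
};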
+ foldImm(Ops[i], Immediate, ScalarSlotUsed); continue; } @@ -1464,66 +1830,6 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, continue; } } - - if (Immediate) - continue; - - if (DescE64) { - // Test if it makes sense to switch to e64 encoding - unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; - if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass)) - continue; - - int32_t TmpImm = -1; - if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) || - (!fitsRegClass(DAG, Ops[i], RegClass) && - fitsRegClass(DAG, Ops[1], OtherRegClass))) { - - // Switch to e64 encoding - Immediate = -1; - Promote2e64 = true; - Desc = DescE64; - DescE64 = nullptr; - } - } - - if (!DescE64 && !Promote2e64) - continue; - if (!Operand.isMachineOpcode()) - continue; - if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) { - Ops.pop_back(); - Ops.push_back(Operand.getOperand(0)); - InputModifiers[i] = 1; - Promote2e64 = true; - if (!DescE64) - continue; - Desc = DescE64; - DescE64 = nullptr; - } - else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) { - Ops.pop_back(); - Ops.push_back(Operand.getOperand(0)); - InputModifiers[i] = 2; - Promote2e64 = true; - if (!DescE64) - continue; - Desc = DescE64; - DescE64 = nullptr; - } - } - - if (Promote2e64) { - std::vector<SDValue> OldOps(Ops); - Ops.clear(); - for (unsigned i = 0; i < OldOps.size(); ++i) { - // src_modifier - Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32)); - Ops.push_back(OldOps[i]); - } - // Add the modifier flags while promoting - for (unsigned i = 0; i < 2; ++i) - Ops.push_back(DAG.getTargetConstant(0, MVT::i32)); } // Add optional chain and glue @@ -1632,46 +1938,182 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, } } +/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) +/// with frame index operands. +/// LLVM assumes that inputs are to these instructions are registers. +void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, + SelectionDAG &DAG) const { + + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < Node->getNumOperands(); ++i) { + if (!isa<FrameIndexSDNode>(Node->getOperand(i))) { + Ops.push_back(Node->getOperand(i)); + continue; + } + + SDLoc DL(Node); + Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, + Node->getOperand(i).getValueType(), + Node->getOperand(i)), 0)); + } + + DAG.UpdateNodeOperands(Node, Ops); +} + /// \brief Fold the instructions after selecting them. 
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); Node = AdjustRegClass(Node, DAG); if (TII->isMIMG(Node->getMachineOpcode())) adjustWritemask(Node, DAG); - return foldOperands(Node, DAG); + if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || + Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { + legalizeTargetIndependentNode(Node, DAG); + return Node; + } + + return legalizeOperands(Node, DAG); } /// \brief Assign the register class depending on the number of /// bits set in the writemask void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - if (!TII->isMIMG(MI->getOpcode())) + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); + + TII->legalizeOperands(MI); + + if (TII->isMIMG(MI->getOpcode())) { + unsigned VReg = MI->getOperand(0).getReg(); + unsigned Writemask = MI->getOperand(1).getImm(); + unsigned BitsSet = 0; + for (unsigned i = 0; i < 4; ++i) + BitsSet += Writemask & (1 << i) ? 1 : 0; + + const TargetRegisterClass *RC; + switch (BitsSet) { + default: return; + case 1: RC = &AMDGPU::VReg_32RegClass; break; + case 2: RC = &AMDGPU::VReg_64RegClass; break; + case 3: RC = &AMDGPU::VReg_96RegClass; break; + } + + unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); + MI->setDesc(TII->get(NewOpcode)); + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MRI.setRegClass(VReg, RC); + return; + } + + // Replace unused atomics with the no return version. + int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); + if (NoRetAtomicOp != -1) { + if (!Node->hasAnyUseOfValue(0)) { + MI->setDesc(TII->get(NoRetAtomicOp)); + MI->RemoveOperand(0); + } + return; + } +} + +static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { + SDValue K = DAG.getTargetConstant(Val, MVT::i32); + return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); +} - unsigned VReg = MI->getOperand(0).getReg(); - unsigned Writemask = MI->getOperand(1).getImm(); - unsigned BitsSet = 0; - for (unsigned i = 0; i < 4; ++i) - BitsSet += Writemask & (1 << i) ? 1 : 0; +MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const { +#if 1 + // XXX - Workaround for moveToVALU not handling different register class + // inserts for REG_SEQUENCE. + + // Build the half of the subregister with the constants. + const SDValue Ops0[] = { + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub0, MVT::i32), + buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32), + DAG.getTargetConstant(AMDGPU::sub1, MVT::i32) + }; + + SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, Ops0), 0); + + // Combine the constants and the pointer. 
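// Illustrative aside, not part of the original change: the 128-bit buffer
// resource assembled by the REG_SEQUENCE below, viewed as plain data. The low
// 64 bits (sub0_sub1) carry the base pointer, and the high 64 bits (sub2_sub3)
// carry the constant half built above, with the upper half of RSRC_DATA_FORMAT
// in the last dword. Struct and field names are hypothetical; uint32_t and
// uint64_t come from <cstdint>.
struct Addr64Rsrc {
  uint64_t BasePtr; // sub0_sub1: 64-bit base address
  uint32_t Word2;   // sub2: zero in this descriptor
  uint32_t Word3;   // sub3: RSRC_DATA_FORMAT >> 32
};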
+ const SDValue Ops1[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), + SubRegHi, + DAG.getTargetConstant(AMDGPU::sub2_sub3, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); +#else + const SDValue Ops[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), + buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32), + DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); - const TargetRegisterClass *RC; - switch (BitsSet) { - default: return; - case 1: RC = &AMDGPU::VReg_32RegClass; break; - case 2: RC = &AMDGPU::VReg_64RegClass; break; - case 3: RC = &AMDGPU::VReg_96RegClass; break; +#endif +} + +/// \brief Return a resource descriptor with the 'Add TID' bit enabled +/// The TID (Thread ID) is multipled by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, which is added to the +/// resource ponter. +MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr, + uint32_t RsrcDword1, + uint64_t RsrcDword2And3) const { + SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); + SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); + if (RsrcDword1) { + PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, + DAG.getConstant(RsrcDword1, MVT::i32)), 0); } - unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); - MI->setDesc(TII->get(NewOpcode)); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - MRI.setRegClass(VReg, RC); + SDValue DataLo = buildSMovImm32(DAG, DL, + RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); + SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); + + const SDValue Ops[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + PtrLo, + DAG.getTargetConstant(AMDGPU::sub0, MVT::i32), + PtrHi, + DAG.getTargetConstant(AMDGPU::sub1, MVT::i32), + DataLo, + DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), + DataHi, + DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); +} + +MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const { + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + return buildRSRC(DAG, DL, Ptr, 0, Rsrc); } MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N, @@ -1699,12 +2141,21 @@ MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N, return N; } ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1)); - SDValue Ops[] = { - SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128, - DAG.getConstant(0, MVT::i64)), 0), - N->getOperand(0), - DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32) - }; + + const SDValue Zero64 = DAG.getTargetConstant(0, MVT::i64); + SDValue Ptr(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Zero64), 0); + MachineSDNode *RSrc = wrapAddr64Rsrc(DAG, DL, Ptr); + + SmallVector<SDValue, 8> Ops; + Ops.push_back(SDValue(RSrc, 0)); + Ops.push_back(N->getOperand(0)); + Ops.push_back(DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)); + + // Copy remaining operands so we keep any chain and glue nodes that follow + // the 
normal operands. + for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I) + Ops.push_back(N->getOperand(I)); + return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops); } } diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index e25323a..7bf406e 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef SIISELLOWERING_H -#define SIISELLOWERING_H +#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H +#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H #include "AMDGPUISelLowering.h" #include "SIInstrInfo.h" @@ -25,9 +25,21 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue Chain, unsigned Offset, bool Signed) const; SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const override; + + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; bool foldImm(SDValue &Operand, int32_t &Immediate, @@ -36,20 +48,37 @@ class SITargetLowering : public AMDGPUTargetLowering { const SDValue &Op) const; bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op, unsigned RegClass) const; - void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, - unsigned RegClass, bool &ScalarSlotUsed) const; - SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const; + SDNode *legalizeOperands(MachineSDNode *N, SelectionDAG &DAG) const; void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const; static SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI); + SDValue performSHLPtrCombine(SDNode *N, + unsigned AS, + DAGCombinerInfo &DCI) const; + + SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; public: SITargetLowering(TargetMachine &tm); - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, - bool *IsFast) const override; + + bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, + EVT /*VT*/) const override; + + bool isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const override; + + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, + unsigned Align, + bool *IsFast) const override; + + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -77,8 +106,19 @@ public: int32_t analyzeImmediate(const SDNode *N) const; SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const override; + 
void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; + + MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const; + MachineSDNode *buildRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr, + uint32_t RsrcDword1, + uint64_t RsrcDword2And3) const; + MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const; }; } // End namespace llvm -#endif //SIISELLOWERING_H +#endif diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 1733326..712d97d 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -17,6 +17,8 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -273,17 +275,17 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, continue; NeedWait = true; - + if (Ordered[i]) { unsigned Value = LastIssued.Array[i] - Required.Array[i]; - // adjust the value to the real hardware posibilities + // Adjust the value to the real hardware possibilities. Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); } else Counts.Array[i] = 0; - // Remember on what we have waited on + // Remember on what we have waited on. WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; } @@ -346,8 +348,9 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { bool Changes = false; - TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo()); - TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo()); + TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); MRI = &MF.getRegInfo(); diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index 7cae9fc..10e0a3f 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -24,7 +24,11 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : field bits<1> VOP3 = 0; field bits<1> VOPC = 0; field bits<1> SALU = 0; + field bits<1> MUBUF = 0; + field bits<1> MTBUF = 0; + field bits<1> FLAT = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; let TSFlags{2} = LGKM_CNT; @@ -35,38 +39,60 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : let TSFlags{7} = VOP3; let TSFlags{8} = VOPC; let TSFlags{9} = SALU; + let TSFlags{10} = MUBUF; + let TSFlags{11} = MTBUF; + let TSFlags{12} = FLAT; + + // Most instructions require adjustments after selection to satisfy + // operand requirements. 
+ let hasPostISelHook = 1; } -class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> { +class Enc32 { field bits<32> Inst; - let Size = 4; + int Size = 4; } -class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> { +class Enc64 { field bits<64> Inst; - let Size = 8; + int Size = 8; +} + +class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VOP1 = 1; } class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : - Enc64 <outs, ins, asm, pattern> { + InstSI <outs, ins, asm, pattern> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let UseNamedOperandTable = 1; + // Using complex patterns gives VOP3 patterns a very high complexity rating, + // but standalone patterns are almost always prefered, so we need to adjust the + // priority lower. The goal is to use a high number to reduce complexity to + // zero (or less than zero). + let AddedComplexity = -1000; + let VOP3 = 1; + + int Size = 8; + let Uses = [EXEC]; } //===----------------------------------------------------------------------===// // Scalar operations //===----------------------------------------------------------------------===// -class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32<outs, ins, asm, pattern> { +class SOP1e <bits<8> op> : Enc32 { bits<7> SDST; bits<8> SSRC0; @@ -75,16 +101,10 @@ class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{15-8} = op; let Inst{22-16} = SDST; let Inst{31-23} = 0x17d; //encoding; - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; } -class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32 <outs, ins, asm, pattern> { - +class SOP2e <bits<7> op> : Enc32 { + bits<7> SDST; bits<8> SSRC0; bits<8> SSRC1; @@ -94,15 +114,9 @@ class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{22-16} = SDST; let Inst{29-23} = op; let Inst{31-30} = 0x2; // encoding - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; } -class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32<outs, ins, asm, pattern> { +class SOPCe <bits<7> op> : Enc32 { bits<8> SSRC0; bits<8> SSRC1; @@ -111,113 +125,137 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{15-8} = SSRC1; let Inst{22-16} = op; let Inst{31-23} = 0x17e; - - let DisableEncoding = "$dst"; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; } -class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32 <outs, ins , asm, pattern> { +class SOPKe <bits<5> op> : Enc32 { bits <7> SDST; bits <16> SIMM16; - + let Inst{15-0} = SIMM16; let Inst{22-16} = SDST; let Inst{27-23} = op; let Inst{31-28} = 0xb; //encoding - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; } -class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 < - (outs), - ins, - asm, - pattern > { +class SOPPe <bits<7> op> : Enc32 { - bits <16> SIMM16; + bits <16> simm16; - let Inst{15-0} = SIMM16; + let Inst{15-0} = simm16; let Inst{22-16} = op; let Inst{31-23} = 0x17f; // encoding - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; } -class SMRD <bits<5> op, bits<1> imm, dag outs, 
dag ins, string asm, - list<dag> pattern> : Enc32<outs, ins, asm, pattern> { +class SMRDe <bits<5> op, bits<1> imm> : Enc32 { bits<7> SDST; bits<7> SBASE; bits<8> OFFSET; - + let Inst{7-0} = OFFSET; let Inst{8} = imm; let Inst{14-9} = SBASE{6-1}; let Inst{21-15} = SDST; let Inst{26-22} = op; let Inst{31-27} = 0x18; //encoding +} + +class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern>, SOP1e <op> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; +} + +class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern>, SOP2e<op> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + + let UseNamedOperandTable = 1; +} + +class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern>, SOPCe <op> { + + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + + let UseNamedOperandTable = 1; +} + +class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins , asm, pattern>, SOPKe<op> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + + let UseNamedOperandTable = 1; +} + +class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : + InstSI <(outs), ins, asm, pattern >, SOPPe <op> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 0; + let SALU = 1; + + let UseNamedOperandTable = 1; +} + +class SMRD <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { let LGKM_CNT = 1; let SMRD = 1; + let mayStore = 0; + let mayLoad = 1; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; } //===----------------------------------------------------------------------===// // Vector ALU operations //===----------------------------------------------------------------------===// - -let Uses = [EXEC] in { -class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32 <outs, ins, asm, pattern> { +class VOP1e <bits<8> op> : Enc32 { bits<8> VDST; bits<9> SRC0; - + let Inst{8-0} = SRC0; let Inst{16-9} = op; let Inst{24-17} = VDST; let Inst{31-25} = 0x3f; //encoding - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOP1 = 1; } -class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32 <outs, ins, asm, pattern> { +class VOP2e <bits<6> op> : Enc32 { bits<8> VDST; bits<9> SRC0; bits<8> VSRC1; - + let Inst{8-0} = SRC0; let Inst{16-9} = VSRC1; let Inst{24-17} = VDST; let Inst{30-25} = op; let Inst{31} = 0x0; //encoding - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOP2 = 1; } -class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : - VOP3Common <outs, ins, asm, pattern> { +class VOP3e <bits<9> op> : Enc64 { bits<8> dst; bits<2> src0_modifiers; @@ -243,11 +281,9 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{61} = src0_modifiers{0}; let Inst{62} = src1_modifiers{0}; let Inst{63} = src2_modifiers{0}; - } -class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : - VOP3Common <outs, ins, asm, pattern> { +class VOP3be <bits<9> op> : Enc64 { bits<8> dst; bits<2> src0_modifiers; @@ -270,11 +306,9 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> 
: let Inst{61} = src0_modifiers{0}; let Inst{62} = src1_modifiers{0}; let Inst{63} = src2_modifiers{0}; - } -class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : - Enc32 <(outs VCCReg:$dst), ins, asm, pattern> { +class VOPCe <bits<8> op> : Enc32 { bits<9> SRC0; bits<8> VSRC1; @@ -283,16 +317,9 @@ class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : let Inst{16-9} = VSRC1; let Inst{24-17} = op; let Inst{31-25} = 0x3e; - - let DisableEncoding = "$dst"; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let VOPC = 1; } -class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32 <outs, ins, asm, pattern> { +class VINTRPe <bits<2> op> : Enc32 { bits<8> VDST; bits<8> VSRC; @@ -305,22 +332,9 @@ class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{17-16} = op; let Inst{25-18} = VDST; let Inst{31-26} = 0x32; // encoding - - let neverHasSideEffects = 1; - let mayLoad = 1; - let mayStore = 0; } -} // End Uses = [EXEC] - -//===----------------------------------------------------------------------===// -// Vector I/O operations -//===----------------------------------------------------------------------===// - -let Uses = [EXEC] in { - -class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc64 <outs, ins, asm, pattern> { +class DSe <bits<8> op> : Enc64 { bits<8> vdst; bits<1> gds; @@ -339,12 +353,9 @@ class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{47-40} = data0; let Inst{55-48} = data1; let Inst{63-56} = vdst; - - let LGKM_CNT = 1; } -class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc64<outs, ins, asm, pattern> { +class MUBUFe <bits<7> op> : Enc64 { bits<12> offset; bits<1> offen; @@ -373,16 +384,9 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{54} = slc; let Inst{55} = tfe; let Inst{63-56} = soffset; - - let VM_CNT = 1; - let EXP_CNT = 1; - - let neverHasSideEffects = 1; - let UseNamedOperandTable = 1; } -class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc64<outs, ins, asm, pattern> { +class MTBUFe <bits<3> op> : Enc64 { bits<8> VDATA; bits<12> OFFSET; @@ -413,15 +417,9 @@ class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{54} = SLC; let Inst{55} = TFE; let Inst{63-56} = SOFFSET; - - let VM_CNT = 1; - let EXP_CNT = 1; - - let neverHasSideEffects = 1; } -class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc64 <outs, ins, asm, pattern> { +class MIMGe <bits<7> op> : Enc64 { bits<8> VDATA; bits<4> DMASK; @@ -434,7 +432,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : bits<1> SLC; bits<8> VADDR; bits<7> SRSRC; - bits<7> SSAMP; + bits<7> SSAMP; let Inst{11-8} = DMASK; let Inst{12} = UNORM; @@ -450,19 +448,29 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{47-40} = VDATA; let Inst{52-48} = SRSRC{6-2}; let Inst{57-53} = SSAMP{6-2}; - - let VM_CNT = 1; - let EXP_CNT = 1; - let MIMG = 1; } -def EXP : Enc64< - (outs), - (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, - VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), - "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", - [] > { +class FLATe<bits<7> op> : Enc64 { + bits<8> addr; + bits<8> data; + bits<8> vdst; + bits<1> slc; + bits<1> glc; + bits<1> tfe; + + // 15-0 is reserved. 
+ let Inst{16} = glc; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x37; // Encoding. + let Inst{39-32} = addr; + let Inst{47-40} = data; + // 54-48 is reserved. + let Inst{55} = tfe; + let Inst{63-56} = vdst; +} +class EXPe : Enc64 { bits<4> EN; bits<6> TGT; bits<1> COMPR; @@ -483,8 +491,110 @@ def EXP : Enc64< let Inst{47-40} = VSRC1; let Inst{55-48} = VSRC2; let Inst{63-56} = VSRC3; +} + +let Uses = [EXEC] in { + +class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : + VOP1Common <outs, ins, asm, pattern>, + VOP1e<op>; + +class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern>, VOP2e<op> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VOP2 = 1; +} + +class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : + VOP3Common <outs, ins, asm, pattern>, VOP3e<op>; + +class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : + VOP3Common <outs, ins, asm, pattern>, VOP3be<op>; + +class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : + InstSI <(outs VCCReg:$dst), ins, asm, pattern>, VOPCe <op> { + + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VOPC = 1; +} + +class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern>, VINTRPe<op> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; +} + +} // End Uses = [EXEC] + +//===----------------------------------------------------------------------===// +// Vector I/O operations +//===----------------------------------------------------------------------===// + +let Uses = [EXEC] in { + +class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> , DSe<op> { + + let LGKM_CNT = 1; + let UseNamedOperandTable = 1; +} +class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern>, MUBUFe <op> { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MUBUF = 1; + + let hasSideEffects = 0; + let UseNamedOperandTable = 1; +} + +class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { + + let VM_CNT = 1; let EXP_CNT = 1; + let MTBUF = 1; + + let neverHasSideEffects = 1; + let UseNamedOperandTable = 1; } +class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern>, FLATe <op> { + let FLAT = 1; + // Internally, FLAT instruction are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. + let VM_CNT = 1; + let LGKM_CNT = 1; + + let Uses = [EXEC, FLAT_SCR]; // M0 + + let UseNamedOperandTable = 1; + let hasSideEffects = 0; +} + +class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern>, MIMGe <op> { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MIMG = 1; + + let hasSideEffects = 0; // XXX ???? 
+} + + + } // End Uses = [EXEC] diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 455c890..8343362 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -17,10 +17,13 @@ #include "AMDGPUTargetMachine.h" #include "SIDefines.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -32,6 +35,259 @@ SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) // TargetInstrInfo callbacks //===----------------------------------------------------------------------===// +static unsigned getNumOperandsNoGlue(SDNode *Node) { + unsigned N = Node->getNumOperands(); + while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) + --N; + return N; +} + +static SDValue findChainOperand(SDNode *Load) { + SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); + assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); + return LastOp; +} + +/// \brief Returns true if both nodes have the same value for the given +/// operand \p Op, or if both nodes do not have this operand. +static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { + unsigned Opc0 = N0->getMachineOpcode(); + unsigned Opc1 = N1->getMachineOpcode(); + + int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); + int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); + + if (Op0Idx == -1 && Op1Idx == -1) + return true; + + + if ((Op0Idx == -1 && Op1Idx != -1) || + (Op1Idx == -1 && Op0Idx != -1)) + return false; + + // getNamedOperandIdx returns the index for the MachineInstr's operands, + // which includes the result as the first operand. We are indexing into the + // MachineSDNode's operands, so we need to skip the result operand to get + // the real index. + --Op0Idx; + --Op1Idx; + + return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); +} + +bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, + int64_t &Offset0, + int64_t &Offset1) const { + if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) + return false; + + unsigned Opc0 = Load0->getMachineOpcode(); + unsigned Opc1 = Load1->getMachineOpcode(); + + // Make sure both are actually loads. + if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) + return false; + + if (isDS(Opc0) && isDS(Opc1)) { + + // FIXME: Handle this case: + if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) + return false; + + // Check base reg. + if (Load0->getOperand(1) != Load1->getOperand(1)) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + // Skip read2 / write2 variants for simplicity. + // TODO: We should report true if the used offsets are adjacent (excluded + // st64 versions). + if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || + AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) + return false; + + Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue(); + return true; + } + + if (isSMRD(Opc0) && isSMRD(Opc1)) { + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); + + // Check base reg. + if (Load0->getOperand(0) != Load1->getOperand(0)) + return false; + + // Check chain. 
+ if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue(); + return true; + } + + // MUBUF and MTBUF can access the same addresses. + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { + + // MUBUF and MTBUF have vaddr at different indices. + if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || + findChainOperand(Load0) != findChainOperand(Load1) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) + return false; + + int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); + int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); + + if (OffIdx0 == -1 || OffIdx1 == -1) + return false; + + // getNamedOperandIdx returns the index for MachineInstrs. Since they + // inlcude the output in the operand list, but SDNodes don't, we need to + // subtract the index by one. + --OffIdx0; + --OffIdx1; + + SDValue Off0 = Load0->getOperand(OffIdx0); + SDValue Off1 = Load1->getOperand(OffIdx1); + + // The offset might be a FrameIndexSDNode. + if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) + return false; + + Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); + return true; + } + + return false; +} + +static bool isStride64(unsigned Opc) { + switch (Opc) { + case AMDGPU::DS_READ2ST64_B32: + case AMDGPU::DS_READ2ST64_B64: + case AMDGPU::DS_WRITE2ST64_B32: + case AMDGPU::DS_WRITE2ST64_B64: + return true; + default: + return false; + } +} + +bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, + unsigned &BaseReg, unsigned &Offset, + const TargetRegisterInfo *TRI) const { + unsigned Opc = LdSt->getOpcode(); + if (isDS(Opc)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (OffsetImm) { + // Normal, single offset LDS instruction. + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + // The 2 offset instructions use offset0 and offset1 instead. We can treat + // these as a load with a single offset if the 2 offsets are consecutive. We + // will use this for some partially aligned loads. + const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset0); + const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset1); + + uint8_t Offset0 = Offset0Imm->getImm(); + uint8_t Offset1 = Offset1Imm->getImm(); + assert(Offset1 > Offset0); + + if (Offset1 - Offset0 == 1) { + // Each of these offsets is in element sized units, so we need to convert + // to bytes of the individual reads. 
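// Illustrative aside, not part of the original change: the byte-offset
// computation the code below performs for a read2/write2 pair with consecutive
// offsets. The element size comes from the register class (half the size of the
// combined destination for a read2, the data operand for a write2), st64
// variants stride by 64 elements, and the pair is reported as one access at
// element 0's byte offset. The helper name is hypothetical.
static unsigned combinedByteOffset(unsigned Offset0, unsigned EltSizeInBytes,
                                   bool IsStride64) {
  if (IsStride64)
    EltSizeInBytes *= 64;
  return EltSizeInBytes * Offset0;
}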
+ + unsigned EltSize; + if (LdSt->mayLoad()) + EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2; + else { + assert(LdSt->mayStore()); + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize(); + } + + if (isStride64(Opc)) + EltSize *= 64; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + BaseReg = AddrReg->getReg(); + Offset = EltSize * Offset0; + return true; + } + + return false; + } + + if (isMUBUF(Opc) || isMTBUF(Opc)) { + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) + return false; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::vaddr); + if (!AddrReg) + return false; + + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + if (isSMRD(Opc)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (!OffsetImm) + return false; + + const MachineOperand *SBaseReg = getNamedOperand(*LdSt, + AMDGPU::OpName::sbase); + BaseReg = SBaseReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + return false; +} + +bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { + unsigned Opc0 = FirstLdSt->getOpcode(); + unsigned Opc1 = SecondLdSt->getOpcode(); + + // TODO: This needs finer tuning + if (NumLoads > 4) + return false; + + if (isDS(Opc0) && isDS(Opc1)) + return true; + + if (isSMRD(Opc0) && isSMRD(Opc1)) + return true; + + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) + return true; + + return false; +} + void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, @@ -182,6 +438,19 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const { return Opcode; } +static bool shouldTryToSpillVGPRs(MachineFunction *MF) { + + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const TargetMachine &TM = MF->getTarget(); + + // FIXME: Even though it can cause problems, we need to enable + // spilling at -O0, since the fast register allocator always + // spills registers that are live at the end of blocks. + return MFI->getShaderType() == ShaderType::COMPUTE && + TM.getOptLevel() == CodeGenOpt::None; + +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, @@ -189,50 +458,43 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - unsigned KillFlag = isKill ? 
RegState::Kill : 0; + int Opcode = -1; - if (RI.hasVGPRs(RC)) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!"); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0) - .addReg(SrcReg); - } else if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) { - unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MF); - unsigned TgtReg = MFI->SpillTracker.LaneVGPR; - - BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), TgtReg) - .addReg(SrcReg, KillFlag) - .addImm(Lane); - MFI->SpillTracker.addSpilledReg(FrameIndex, TgtReg, Lane); - } else if (RI.isSGPRClass(RC)) { + if (RI.isSGPRClass(RC)) { // We are only allowed to create one new instruction when spilling - // registers, so we need to use pseudo instruction for vector - // registers. - // - // Reserve a spot in the spill tracker for each sub-register of - // the vector register. - unsigned NumSubRegs = RC->getSize() / 4; - unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MF, NumSubRegs); - MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, - FirstLane); - - unsigned Opcode; + // registers, so we need to use pseudo instruction for spilling + // SGPRs. switch (RC->getSize() * 8) { - case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; - default: llvm_unreachable("Cannot spill register class"); + case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; } + } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) { + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; + } + } - BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR) + if (Opcode != -1) { + FrameInfo->setObjectAlignment(FrameIndex, 4); + BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg) - .addImm(FrameIndex); + .addFrameIndex(FrameIndex); } else { - llvm_unreachable("VGPR spilling not supported"); + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" + " spill register"); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0) + .addReg(SrcReg); } } @@ -242,55 +504,138 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); + int Opcode = -1; - if (RI.hasVGPRs(RC)) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!"); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) - .addImm(0); - } else if (RI.isSGPRClass(RC)){ - unsigned Opcode; + if (RI.isSGPRClass(RC)){ switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; 
break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; - default: llvm_unreachable("Cannot spill register class"); + case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; } + } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) { + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; + } + } - SIMachineFunctionInfo::SpilledReg Spill = - MFI->SpillTracker.getSpilledReg(FrameIndex); - + if (Opcode != -1) { + FrameInfo->setObjectAlignment(FrameIndex, 4); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addReg(Spill.VGPR) - .addImm(FrameIndex); + .addFrameIndex(FrameIndex); } else { - llvm_unreachable("VGPR spilling not supported"); + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" + " restore register"); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addReg(AMDGPU::VGPR0); } } -static unsigned getNumSubRegsForSpillOp(unsigned Op) { - - switch (Op) { - case AMDGPU::SI_SPILL_S512_SAVE: - case AMDGPU::SI_SPILL_S512_RESTORE: - return 16; - case AMDGPU::SI_SPILL_S256_SAVE: - case AMDGPU::SI_SPILL_S256_RESTORE: - return 8; - case AMDGPU::SI_SPILL_S128_SAVE: - case AMDGPU::SI_SPILL_S128_RESTORE: - return 4; - case AMDGPU::SI_SPILL_S64_SAVE: - case AMDGPU::SI_SPILL_S64_RESTORE: - return 2; - case AMDGPU::SI_SPILL_S32_RESTORE: - return 1; - default: llvm_unreachable("Invalid spill opcode"); +/// \param @Offset Offset in bytes of the FrameIndex being spilled +unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, unsigned TmpReg, + unsigned FrameOffset, + unsigned Size) const { + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF->getTarget().getSubtarget<AMDGPUSubtarget>(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(ST.getRegisterInfo()); + DebugLoc DL = MBB.findDebugLoc(MI); + unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); + unsigned WavefrontSize = ST.getWavefrontSize(); + + unsigned TIDReg = MFI->getTIDReg(); + if (!MFI->hasCalculatedTID()) { + MachineBasicBlock &Entry = MBB.getParent()->front(); + MachineBasicBlock::iterator Insert = Entry.front(); + DebugLoc DL = Insert->getDebugLoc(); + + TIDReg = RI.findUnusedVGPR(MF->getRegInfo()); + if (TIDReg == AMDGPU::NoRegister) + return TIDReg; + + + if (MFI->getShaderType() == ShaderType::COMPUTE && + WorkGroupSize > WavefrontSize) { + + unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); + unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); + unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned InputPtrReg = + TRI->getPreloadedValue(*MF, 
SIRegisterInfo::INPUT_PTR); + static const unsigned TIDIGRegs[3] = { + TIDIGXReg, TIDIGYReg, TIDIGZReg + }; + for (unsigned Reg : TIDIGRegs) { + if (!Entry.isLiveIn(Reg)) + Entry.addLiveIn(Reg); + } + + RS->enterBasicBlock(&Entry); + unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Z); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Y); + + // NGROUPS.X * NGROUPS.Y + BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) + .addReg(STmp1) + .addReg(STmp0); + // (NGROUPS.X * NGROUPS.Y) * TIDIG.X + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) + .addReg(STmp1) + .addReg(TIDIGXReg); + // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X) + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) + .addReg(STmp0) + .addReg(TIDIGYReg) + .addReg(TIDReg); + // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z + BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) + .addReg(TIDReg) + .addReg(TIDIGZReg); + } else { + // Get the wave id + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), + TIDReg) + .addImm(-1) + .addImm(0); + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32), + TIDReg) + .addImm(-1) + .addReg(TIDReg); + } + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), + TIDReg) + .addImm(2) + .addReg(TIDReg); + MFI->setTIDReg(TIDReg); } + + // Add FrameIndex to LDS offset + unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) + .addImm(LDSOffset) + .addReg(TIDReg); + + return TmpReg; } void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, @@ -308,95 +653,102 @@ void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, } bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - SIMachineFunctionInfo *MFI = - MI->getParent()->getParent()->getInfo<SIMachineFunctionInfo>(); MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MBB.findDebugLoc(MI); switch (MI->getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - // SGPR register spill - case AMDGPU::SI_SPILL_S512_SAVE: - case AMDGPU::SI_SPILL_S256_SAVE: - case AMDGPU::SI_SPILL_S128_SAVE: - case AMDGPU::SI_SPILL_S64_SAVE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - unsigned FrameIndex = MI->getOperand(2).getImm(); - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - SIMachineFunctionInfo::SpilledReg Spill; - unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(), - &AMDGPU::SGPR_32RegClass, i); - Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); - - BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), - MI->getOperand(0).getReg()) - .addReg(SubReg) - .addImm(Spill.Lane + i); - } + case AMDGPU::SI_CONSTDATA_PTR: { + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); + + BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); + + // Add 32-bit offset from this instruction to the start of the constant data. 
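// The S_ADD_U32 / S_ADDC_U32 pair emitted below adds the 32-bit
// "start of constant data" offset onto the 64-bit PC captured by
// S_GETPC_B64, carrying from the low half into the high half through SCC.
// A minimal standalone sketch of that arithmetic (plain C++; the names
// are illustrative, this is not the LLVM MachineInstr API):
#include <cstdint>

// lo/hi play the role of RegLo/RegHi, the two halves of the result.
static void addOffsetToPC(uint32_t &lo, uint32_t &hi, uint32_t offset) {
  uint32_t oldLo = lo;
  lo += offset;                      // S_ADD_U32: low half, carry out -> SCC
  uint32_t carry = (lo < oldLo) ? 1u : 0u;
  hi += carry;                       // S_ADDC_U32 with immediate 0: adds SCC in
}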
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addTargetIndex(AMDGPU::TI_CONSTDATA_START) + .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0) + .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) + .addReg(AMDGPU::SCC, RegState::Implicit); MI->eraseFromParent(); break; } - - // SGPR register restore - case AMDGPU::SI_SPILL_S512_RESTORE: - case AMDGPU::SI_SPILL_S256_RESTORE: - case AMDGPU::SI_SPILL_S128_RESTORE: - case AMDGPU::SI_SPILL_S64_RESTORE: - case AMDGPU::SI_SPILL_S32_RESTORE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - SIMachineFunctionInfo::SpilledReg Spill; - unsigned FrameIndex = MI->getOperand(2).getImm(); - unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(), - &AMDGPU::SGPR_32RegClass, i); - Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); - - BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg) - .addReg(MI->getOperand(1).getReg()) - .addImm(Spill.Lane + i); - } - insertNOPs(MI, 3); + case AMDGPU::SGPR_USE: + // This is just a placeholder for register allocation. MI->eraseFromParent(); break; } - } return true; } MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { + if (MI->getNumOperands() < 3) + return nullptr; - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg()) + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src0); + assert(Src0Idx != -1 && "Should always have src0 operand"); + + MachineOperand &Src0 = MI->getOperand(Src0Idx); + if (!Src0.isReg()) + return nullptr; + + int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src1); + if (Src1Idx == -1) return nullptr; - // Cannot commute VOP2 if src0 is SGPR. - if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() && - RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg()))) - return nullptr; + MachineOperand &Src1 = MI->getOperand(Src1Idx); + + // Make sure it's legal to commute operands for VOP2. + if (isVOP2(MI->getOpcode()) && + (!isOperandLegal(MI, Src0Idx, &Src1) || + !isOperandLegal(MI, Src1Idx, &Src0))) + return nullptr; - if (!MI->getOperand(2).isReg()) { - // XXX: Commute instructions with FPImm operands - if (NewMI || MI->getOperand(2).isFPImm() || + if (!Src1.isReg()) { + // Allow commuting instructions with Imm or FPImm operands. + if (NewMI || (!Src1.isImm() && !Src1.isFPImm()) || (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { return nullptr; } - // XXX: Commute VOP3 instructions with abs and neg set. - if (isVOP3(MI->getOpcode()) && - (MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::abs)).getImm() || - MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::neg)).getImm())) - return nullptr; + // Be sure to copy the source modifiers to the right place. 
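// commuteInstruction swaps an immediate into src0 and the register into
// src1; any src0/src1 source-modifier immediates have to move with their
// operands, which is what the code that follows does. A hypothetical
// sketch of that pairwise swap (not the real MachineOperand API):
#include <cstdint>
#include <utility>

// A VOP source modelled as a payload (register id or immediate bits)
// plus its modifier bits (neg/abs); 0 means "no modifiers".
struct SrcOperand {
  uint64_t payload;
  uint32_t modifiers;
};

static void commuteSources(SrcOperand &src0, SrcOperand &src1) {
  std::swap(src0.payload, src1.payload);     // swap the operand values
  std::swap(src0.modifiers, src1.modifiers); // keep each modifier with its value
}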
+ if (MachineOperand *Src0Mods + = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { + MachineOperand *Src1Mods + = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); - unsigned Reg = MI->getOperand(1).getReg(); - unsigned SubReg = MI->getOperand(1).getSubReg(); - MI->getOperand(1).ChangeToImmediate(MI->getOperand(2).getImm()); - MI->getOperand(2).ChangeToRegister(Reg, false); - MI->getOperand(2).setSubReg(SubReg); + int Src0ModsVal = Src0Mods->getImm(); + if (!Src1Mods && Src0ModsVal != 0) + return nullptr; + + // XXX - This assert might be a lie. It might be useful to have a neg + // modifier with 0.0. + int Src1ModsVal = Src1Mods->getImm(); + assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); + + Src1Mods->setImm(Src0ModsVal); + Src0Mods->setImm(Src1ModsVal); + } + + unsigned Reg = Src0.getReg(); + unsigned SubReg = Src0.getSubReg(); + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else if (Src1.isFPImm()) + Src0.ChangeToFPImmediate(Src1.getFPImm()); + else + llvm_unreachable("Should only have immediates"); + + Src1.ChangeToRegister(Reg, false); + Src1.setSubReg(SubReg); } else { MI = TargetInstrInfo::commuteInstruction(MI, NewMI); } @@ -407,6 +759,44 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, return MI; } +// This needs to be implemented because the source modifiers may be inserted +// between the true commutable operands, and the base +// TargetInstrInfo::commuteInstruction uses it. +bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + const MCInstrDesc &MCID = MI->getDesc(); + if (!MCID.isCommutable()) + return false; + + unsigned Opc = MI->getOpcode(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx == -1) + return false; + + // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on + // immediate. + if (!MI->getOperand(Src0Idx).isReg()) + return false; + + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) + return false; + + if (!MI->getOperand(Src1Idx).isReg()) + return false; + + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. + if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + + SrcOpIdx1 = Src0Idx; + SrcOpIdx2 = Src1Idx; + return true; +} + MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned DstReg, @@ -443,10 +833,92 @@ SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, } } +static bool offsetsDoNotOverlap(int WidthA, int OffsetA, + int WidthB, int OffsetB) { + int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; + int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; + int LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB; + return LowOffset + LowWidth <= HighOffset; +} + +bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, + MachineInstr *MIb) const { + unsigned BaseReg0, Offset0; + unsigned BaseReg1, Offset1; + + if (getLdStBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && + getLdStBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { + assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() && + "read2 / write2 not expected here yet"); + unsigned Width0 = (*MIa->memoperands_begin())->getSize(); + unsigned Width1 = (*MIb->memoperands_begin())->getSize(); + if (BaseReg0 == BaseReg1 && + offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { + return true; + } + } + + return false; +} + +bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, + MachineInstr *MIb, + AliasAnalysis *AA) const { + unsigned Opc0 = MIa->getOpcode(); + unsigned Opc1 = MIb->getOpcode(); + + assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && + "MIa must load from or modify a memory location"); + assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && + "MIb must load from or modify a memory location"); + + if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) + return false; + + // XXX - Can we relax this between address spaces? + if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + return false; + + // TODO: Should we check the address space from the MachineMemOperand? That + // would allow us to distinguish objects we know don't alias based on the + // underlying addres space, even if it was lowered to a different one, + // e.g. private accesses lowered to use MUBUF instructions on a scratch + // buffer. + if (isDS(Opc0)) { + if (isDS(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(Opc1); + } + + if (isMUBUF(Opc0) || isMTBUF(Opc0)) { + if (isMUBUF(Opc1) || isMTBUF(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(Opc1) && !isSMRD(Opc1); + } + + if (isSMRD(Opc0)) { + if (isSMRD(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0); + } + + if (isFLAT(Opc0)) { + if (isFLAT(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return false; + } + + return false; +} + namespace llvm { namespace AMDGPU { // Helper function generated by tablegen. We are wrapping this with -// an SIInstrInfo function that reutrns bool rather than int. +// an SIInstrInfo function that returns bool rather than int. 
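// offsetsDoNotOverlap above treats each access as a half-open byte
// interval [offset, offset + width) from a common base register; two
// accesses are trivially disjoint when the lower interval ends at or
// before the higher one starts. A self-contained restatement of the test:

// Returns true if [offsetA, offsetA+widthA) and [offsetB, offsetB+widthB)
// do not overlap.
static bool bytesDoNotOverlap(int offsetA, int widthA, int offsetB, int widthB) {
  int lowOffset  = (offsetA < offsetB) ? offsetA : offsetB;
  int highOffset = (offsetA < offsetB) ? offsetB : offsetA;
  int lowWidth   = (lowOffset == offsetA) ? widthA : widthB;
  return lowOffset + lowWidth <= highOffset;
}
// e.g. a 4-byte store at +0 and a 4-byte load at +4 are disjoint,
// while an 8-byte store at +0 and a 4-byte load at +4 are not.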
int isDS(uint16_t Opcode); } } @@ -455,14 +927,26 @@ bool SIInstrInfo::isDS(uint16_t Opcode) const { return ::AMDGPU::isDS(Opcode) != -1; } -int SIInstrInfo::isMIMG(uint16_t Opcode) const { +bool SIInstrInfo::isMIMG(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MIMG; } -int SIInstrInfo::isSMRD(uint16_t Opcode) const { +bool SIInstrInfo::isSMRD(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SMRD; } +bool SIInstrInfo::isMUBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MUBUF; +} + +bool SIInstrInfo::isMTBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MTBUF; +} + +bool SIInstrInfo::isFLAT(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FLAT; +} + bool SIInstrInfo::isVOP1(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP1; } @@ -541,9 +1025,99 @@ static bool compareMachineOp(const MachineOperand &Op0, } } +bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const { + const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; + + assert(MO.isImm() || MO.isFPImm() || MO.isTargetIndex() || MO.isFI()); + + if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) + return true; + + if (OpInfo.RegClass < 0) + return false; + + if (isLiteralConstant(MO)) + return RI.regClassCanUseLiteralConstant(OpInfo.RegClass); + + return RI.regClassCanUseInlineConstant(OpInfo.RegClass); +} + +bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) { + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: { + // MUBUF instructions a 12-bit offset in bytes. + return isUInt<12>(OffsetSize); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // SMRD instructions have an 8-bit offset in dwords. + return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); + } + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // The single offset versions have a 16-bit offset in bytes. + return isUInt<16>(OffsetSize); + } + case AMDGPUAS::PRIVATE_ADDRESS: + // Indirect register addressing does not use any offsets. + default: + return 0; + } +} + +bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { + return AMDGPU::getVOPe32(Opcode) != -1; +} + +bool SIInstrInfo::hasModifiers(unsigned Opcode) const { + // The src0_modifier operand is present on all instructions + // that have modifiers. + + return AMDGPU::getNamedOperandIdx(Opcode, + AMDGPU::OpName::src0_modifiers) != -1; +} + +bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, + unsigned OpName) const { + const MachineOperand *Mods = getNamedOperand(MI, OpName); + return Mods && Mods->getImm(); +} + +bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, + const MachineOperand &MO) const { + // Literal constants use the constant bus. + if (isLiteralConstant(MO)) + return true; + + if (!MO.isReg() || !MO.isUse()) + return false; + + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); + + // FLAT_SCR is just an SGPR pair. + if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) + return true; + + // EXEC register uses the constant bus. 
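// canFoldOffset above encodes how far each memory unit's immediate offset
// reaches: MUBUF takes a 12-bit byte offset, SMRD an 8-bit offset counted
// in dwords, and DS a 16-bit byte offset. A standalone sketch with a
// couple of worked values (the enum below is illustrative only, not the
// AMDGPUAS definition):
#include <cstdint>

enum class AddrSpace { Global, Constant, Local };

static bool offsetFoldsIntoImmediate(uint64_t offsetBytes, AddrSpace as) {
  switch (as) {
  case AddrSpace::Global:   // MUBUF: 12-bit byte offset
    return offsetBytes < (1u << 12);
  case AddrSpace::Constant: // SMRD: 8-bit offset in dwords
    return (offsetBytes % 4 == 0) && (offsetBytes / 4) < (1u << 8);
  case AddrSpace::Local:    // DS: 16-bit byte offset
    return offsetBytes < (1u << 16);
  }
  return false;
}
// e.g. 4095 folds for Global but 4096 does not; 1020 folds for Constant,
// while 1021 is rejected because it is not dword aligned.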
+ if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) + return true; + + // SGPRs use the constant bus + if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || + (!MO.isImplicit() && + (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || + AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { + return true; + } + + return false; +} + bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const { uint16_t Opcode = MI->getOpcode(); + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); @@ -557,19 +1131,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } // Make sure the register classes are correct - for (unsigned i = 0, e = Desc.getNumOperands(); i != e; ++i) { + for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { switch (Desc.OpInfo[i].OperandType) { case MCOI::OPERAND_REGISTER: { - int RegClass = Desc.OpInfo[i].RegClass; - if (!RI.regClassCanUseImmediate(RegClass) && - (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm())) { - ErrInfo = "Expected register, but got immediate"; - return false; + if ((MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm()) && + !isImmOperandLegal(MI, i, MI->getOperand(i))) { + ErrInfo = "Illegal immediate value for operand."; + return false; + } } - } break; case MCOI::OPERAND_IMMEDIATE: - if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) { + // Check if this operand is an immediate. + // FrameIndex operands will be replaced by immediates, so they are + // allowed. + if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() && + !MI->getOperand(i).isFI()) { ErrInfo = "Expected immediate, but got non-immediate"; return false; } @@ -602,27 +1179,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, unsigned SGPRUsed = AMDGPU::NoRegister; for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.isUse() && - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - - // EXEC register uses the constant bus. - if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) - ++ConstantBusCount; - - // SGPRs use the constant bus - if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || - (!MO.isImplicit() && - (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || - AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { - if (SGPRUsed != MO.getReg()) { + if (usesConstantBus(MRI, MO)) { + if (MO.isReg()) { + if (MO.getReg() != SGPRUsed) ++ConstantBusCount; - SGPRUsed = MO.getReg(); - } + SGPRUsed = MO.getReg(); + } else { + ++ConstantBusCount; } } - // Literal constants use the constant bus. - if (isLiteralConstant(MO)) - ++ConstantBusCount; } if (ConstantBusCount > 1) { ErrInfo = "VOP* instruction uses the constant bus more than once"; @@ -658,11 +1223,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify misc. restrictions on specific instructions. 
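// The ConstantBusCount loop above enforces the VOP rule that at most one
// source value may travel over the constant bus: every literal costs a
// slot, an SGPR costs a slot the first time it is read, and re-reading
// the same SGPR is free. A hypothetical counting sketch (BusUse is an
// illustrative summary, not MachineOperand):
#include <vector>

struct BusUse {
  bool usesBus;  // literal constant, SGPR, M0, VCC, EXEC, FLAT_SCR, ...
  bool isReg;
  unsigned reg;  // meaningful only when isReg is true
};

static bool constantBusOk(const std::vector<BusUse> &ops) {
  int count = 0;
  unsigned lastSgpr = ~0u;          // sentinel: no SGPR seen yet
  for (const BusUse &op : ops) {
    if (!op.usesBus)
      continue;
    if (op.isReg) {
      if (op.reg != lastSgpr)
        ++count;                    // a new SGPR takes the single slot
      lastSgpr = op.reg;
    } else {
      ++count;                      // every literal takes a slot
    }
  }
  return count <= 1;
}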
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { - MI->dump(); - - const MachineOperand &Src0 = MI->getOperand(2); - const MachineOperand &Src1 = MI->getOperand(3); - const MachineOperand &Src2 = MI->getOperand(4); + const MachineOperand &Src0 = MI->getOperand(Src0Idx); + const MachineOperand &Src1 = MI->getOperand(Src1Idx); + const MachineOperand &Src2 = MI->getOperand(Src2Idx); if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { if (!compareMachineOp(Src0, Src1) && !compareMachineOp(Src0, Src2)) { @@ -685,10 +1248,13 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_MOV_B32: return MI.getOperand(1).isReg() ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; - case AMDGPU::S_ADD_I32: return AMDGPU::V_ADD_I32_e32; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; - case AMDGPU::S_SUB_I32: return AMDGPU::V_SUB_I32_e32; + case AMDGPU::S_SUB_I32: + case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; + case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; @@ -757,21 +1323,28 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { MachineBasicBlock::iterator I = MI; + MachineBasicBlock *MBB = MI->getParent(); MachineOperand &MO = MI->getOperand(OpIdx); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); unsigned Opcode = AMDGPU::V_MOV_B32_e32; - if (MO.isReg()) { + if (MO.isReg()) Opcode = AMDGPU::COPY; - } else if (RI.isSGPRClass(RC)) { + else if (RI.isSGPRClass(RC)) Opcode = AMDGPU::S_MOV_B32; - } + const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); + if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) + VRC = &AMDGPU::VReg_64RegClass; + else + VRC = &AMDGPU::VReg_32RegClass; + unsigned Reg = MRI.createVirtualRegister(VRC); - BuildMI(*MI->getParent(), I, MI->getParent()->findDebugLoc(I), get(Opcode), - Reg).addOperand(MO); + DebugLoc DL = MBB->findDebugLoc(I); + BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) + .addOperand(MO); MO.ChangeToRegister(Reg, false); } @@ -791,13 +1364,15 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, // value so we don't need to worry about merging its subreg index with the // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. 
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY), - NewSuperReg) - .addOperand(SuperReg); + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) + .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(NewSuperReg, 0, SubIdx); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY), - SubReg) - .addReg(NewSuperReg, 0, SubIdx); return SubReg; } @@ -853,8 +1428,59 @@ unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, return Dst; } +bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + const MachineOperand *MO) const { + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const MCInstrDesc &InstDesc = get(MI->getOpcode()); + const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; + const TargetRegisterClass *DefinedRC = + OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; + if (!MO) + MO = &MI->getOperand(OpIdx); + + if (usesConstantBus(MRI, *MO)) { + unsigned SGPRUsed = + MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (i == OpIdx) + continue; + if (usesConstantBus(MRI, MI->getOperand(i)) && + MI->getOperand(i).isReg() && MI->getOperand(i).getReg() != SGPRUsed) { + return false; + } + } + } + + if (MO->isReg()) { + assert(DefinedRC); + const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg()); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + } + + + // Handle non-register types that are treated like immediates. + assert(MO->isImm() || MO->isFPImm() || MO->isTargetIndex() || MO->isFI()); + + if (!DefinedRC) { + // This operand expects an immediate. + return true; + } + + return isImmOperandLegal(MI, OpIdx, *MO); +} + void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), @@ -864,45 +1490,40 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // Legalize VOP2 if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { - MachineOperand &Src0 = MI->getOperand(Src0Idx); - MachineOperand &Src1 = MI->getOperand(Src1Idx); - - // If the instruction implicitly reads VCC, we can't have any SGPR operands, - // so move any. - bool ReadsVCC = MI->readsRegister(AMDGPU::VCC, &RI); - if (ReadsVCC && Src0.isReg() && - RI.isSGPRClass(MRI.getRegClass(Src0.getReg()))) { + // Legalize src0 + if (!isOperandLegal(MI, Src0Idx)) legalizeOpWithMove(MI, Src0Idx); - return; - } - if (ReadsVCC && Src1.isReg() && - RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { - legalizeOpWithMove(MI, Src1Idx); + // Legalize src1 + if (isOperandLegal(MI, Src1Idx)) return; - } - // Legalize VOP2 instructions where src1 is not a VGPR. An SGPR input must - // be the first operand, and there can only be one. 
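// The rewritten VOP2 path below legalizes src0 with a move when needed,
// then, if src1 is still illegal, prefers commuting the instruction
// (src0 accepts more operand kinds than src1) and only falls back to
// copying src1 into a VGPR. A hypothetical model of that decision order
// (the booleans stand in for isOperandLegal / commuteInstruction /
// legalizeOpWithMove):
struct Vop2Fixup {
  bool movedSrc0 = false;
  bool commuted = false;
  bool movedSrc1 = false;
};

static Vop2Fixup legalizeVop2(bool src0Legal, bool src1Legal,
                              bool commuteSucceeds) {
  Vop2Fixup fix;
  if (!src0Legal)
    fix.movedSrc0 = true;   // legalizeOpWithMove on src0
  if (src1Legal)
    return fix;             // nothing left to do
  if (commuteSucceeds) {
    fix.commuted = true;    // cheaper than inserting a V_MOV for src1
    return fix;
  }
  fix.movedSrc1 = true;     // last resort: copy src1 into a VGPR
  return fix;
}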
- if (Src1.isImm() || Src1.isFPImm() || - (Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())))) { - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - return; - } - legalizeOpWithMove(MI, Src1Idx); + // Usually src0 of VOP2 instructions allow more types of inputs + // than src1, so try to commute the instruction to decrease our + // chances of having to insert a MOV instruction to legalize src1. + if (MI->isCommutable()) { + if (commuteInstruction(MI)) + // If we are successful in commuting, then we know MI is legal, so + // we are done. + return; } + + legalizeOpWithMove(MI, Src1Idx); + return; } // XXX - Do any VOP3 instructions read VCC? // Legalize VOP3 if (isVOP3(MI->getOpcode())) { - int VOP3Idx[3] = {Src0Idx, Src1Idx, Src2Idx}; - unsigned SGPRReg = AMDGPU::NoRegister; + int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + for (unsigned i = 0; i < 3; ++i) { int Idx = VOP3Idx[i]; if (Idx == -1) - continue; + break; MachineOperand &MO = MI->getOperand(Idx); if (MO.isReg()) { @@ -1002,106 +1623,212 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // Legalize MUBUF* instructions // FIXME: If we start using the non-addr64 instructions for compute, we // may need to legalize them here. + int SRsrcIdx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); + if (SRsrcIdx != -1) { + // We have an MUBUF instruction + MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx); + unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass; + if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), + RI.getRegClass(SRsrcRC))) { + // The operands are legal. + // FIXME: We may need to legalize operands besided srsrc. + return; + } - int SRsrcIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::srsrc); - int VAddrIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::vaddr); - if (SRsrcIdx != -1 && VAddrIdx != -1) { - const TargetRegisterClass *VAddrRC = - RI.getRegClass(get(MI->getOpcode()).OpInfo[VAddrIdx].RegClass); - - if(VAddrRC->getSize() == 8 && - MRI.getRegClass(MI->getOperand(SRsrcIdx).getReg()) != VAddrRC) { - // We have a MUBUF instruction that uses a 64-bit vaddr register and - // srsrc has the incorrect register class. In order to fix this, we - // need to extract the pointer from the resource descriptor (srsrc), - // add it to the value of vadd, then store the result in the vaddr - // operand. Then, we need to set the pointer field of the resource - // descriptor to zero. + MachineBasicBlock &MBB = *MI->getParent(); + // Extract the the ptr from the resource descriptor. 
- MachineBasicBlock &MBB = *MI->getParent(); - MachineOperand &SRsrcOp = MI->getOperand(SRsrcIdx); - MachineOperand &VAddrOp = MI->getOperand(VAddrIdx); - unsigned SRsrcPtrLo, SRsrcPtrHi, VAddrLo, VAddrHi; - unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); - unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); - unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - - // SRsrcPtrLo = srsrc:sub0 - SRsrcPtrLo = buildExtractSubReg(MI, MRI, SRsrcOp, - &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass); - - // SRsrcPtrHi = srsrc:sub1 - SRsrcPtrHi = buildExtractSubReg(MI, MRI, SRsrcOp, - &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass); - - // VAddrLo = vaddr:sub0 - VAddrLo = buildExtractSubReg(MI, MRI, VAddrOp, - &AMDGPU::VReg_64RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass); - - // VAddrHi = vaddr:sub1 - VAddrHi = buildExtractSubReg(MI, MRI, VAddrOp, - &AMDGPU::VReg_64RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass); - - // NewVaddrLo = SRsrcPtrLo + VAddrLo + // SRsrcPtrLo = srsrc:sub0 + unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass); + + // SRsrcPtrHi = srsrc:sub1 + unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass); + + // Create an empty resource descriptor + unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + + // Zero64 = 0 + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), + Zero64) + .addImm(0); + + // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + SRsrcFormatLo) + .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); + + // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + SRsrcFormatHi) + .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); + + // NewSRsrc = {Zero64, SRsrcFormat} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); + + MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + unsigned NewVAddrLo; + unsigned NewVAddrHi; + if (VAddr) { + // This is already an ADDR64 instruction so we need to add the pointer + // extracted from the resource descriptor to the current value of VAddr. 
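// The REG_SEQUENCE above assembles the replacement 128-bit buffer resource
// as { base pointer = 0, RSRC_DATA_FORMAT }, and the code below folds the
// original base pointer into the 64-bit vaddr instead. A plain-struct
// sketch of the descriptor being built (field names are illustrative):
#include <cstdint>

struct BufferRsrc {
  uint32_t basePtrLo; // sub0: zeroed
  uint32_t basePtrHi; // sub1: zeroed
  uint32_t formatLo;  // sub2: RSRC_DATA_FORMAT bits 31..0
  uint32_t formatHi;  // sub3: RSRC_DATA_FORMAT bits 63..32
};

static BufferRsrc makeZeroBaseRsrc(uint64_t rsrcDataFormat) {
  BufferRsrc r;
  r.basePtrLo = 0;
  r.basePtrHi = 0;
  r.formatLo = static_cast<uint32_t>(rsrcDataFormat & 0xFFFFFFFFu);
  r.formatHi = static_cast<uint32_t>(rsrcDataFormat >> 32);
  return r;
}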
+ NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + + // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) .addReg(SRsrcPtrLo) - .addReg(VAddrLo) - .addReg(AMDGPU::VCC, RegState::Define | RegState::Implicit); + .addReg(VAddr->getReg(), 0, AMDGPU::sub0) + .addReg(AMDGPU::VCC, RegState::ImplicitDefine); - // NewVaddrHi = SRsrcPtrHi + VAddrHi + // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) .addReg(SRsrcPtrHi) - .addReg(VAddrHi) + .addReg(VAddr->getReg(), 0, AMDGPU::sub1) .addReg(AMDGPU::VCC, RegState::ImplicitDefine) .addReg(AMDGPU::VCC, RegState::Implicit); - // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewVAddr) - .addReg(NewVAddrLo) - .addImm(AMDGPU::sub0) - .addReg(NewVAddrHi) - .addImm(AMDGPU::sub1); + } else { + // This instructions is the _OFFSET variant, so we need to convert it to + // ADDR64. + MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); + MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); + MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); + assert(SOffset->isImm() && SOffset->getImm() == 0 && "Legalizing MUBUF " + "with non-zero soffset is not implemented"); + (void)SOffset; + + // Create the new instruction. + unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); + MachineInstr *Addr64 = + BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addOperand(*SRsrc) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. 
+ .addOperand(*Offset); + + MI->removeFromParent(); + MI = Addr64; + + NewVAddrLo = SRsrcPtrLo; + NewVAddrHi = SRsrcPtrHi; + VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); + } - // Zero64 = 0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), - Zero64) - .addImm(0); + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewVAddr) + .addReg(NewVAddrLo) + .addImm(AMDGPU::sub0) + .addReg(NewVAddrHi) + .addImm(AMDGPU::sub1); - // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatLo) - .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); - // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatHi) - .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); + // Update the instruction to use NewVaddr + VAddr->setReg(NewVAddr); + // Update the instruction to use NewSRsrc + SRsrc->setReg(NewSRsrc); + } +} - // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); +void SIInstrInfo::splitSMRD(MachineInstr *MI, + const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const { - // Update the instruction to use NewVaddr - MI->getOperand(VAddrIdx).setReg(NewVAddr); - // Update the instruction to use NewSRsrc - MI->getOperand(SRsrcIdx).setReg(NewSRsrc); + DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned RegLo = MRI.createVirtualRegister(HalfRC); + unsigned RegHi = MRI.createVirtualRegister(HalfRC); + unsigned HalfSize = HalfRC->getSize(); + const MachineOperand *OffOp = + getNamedOperand(*MI, AMDGPU::OpName::offset); + const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); + + if (OffOp) { + // Handle the _IMM variant + unsigned LoOffset = OffOp->getImm(); + unsigned HiOffset = LoOffset + (HalfSize / 4); + Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) + .addOperand(*SBase) + .addImm(LoOffset); + + if (!isUInt<8>(HiOffset)) { + unsigned OffsetSGPR = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) + .addImm(HiOffset << 2); // The immediate offset is in dwords, + // but offset in register is in bytes. 
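// splitSMRD computes the second half's immediate as HiOffset =
// LoOffset + HalfSize/4, in dword units; when that no longer fits the
// 8-bit immediate field, the offset is moved into an SGPR, and the
// register form counts bytes, hence the << 2 above. A small worked
// sketch of that offset math (names are illustrative):
#include <cstdint>

struct HalfOffsets {
  uint32_t hiDwords;   // candidate immediate for the high half
  bool needsRegister;  // true when it does not fit in 8 bits
  uint32_t regBytes;   // value to materialize in an SGPR if needed
};

static HalfOffsets splitOffsets(uint32_t loDwords, uint32_t halfSizeBytes) {
  HalfOffsets h;
  h.hiDwords = loDwords + halfSizeBytes / 4;
  h.needsRegister = h.hiDwords > 0xFF;  // !isUInt<8>(HiOffset)
  h.regBytes = h.hiDwords << 2;         // dwords -> bytes
  return h;
}
// e.g. splitting a 32-byte (DWORDX8) load at immediate offset 250 gives a
// high-half immediate of 254; at offset 252 it would be 256, which no
// longer fits and is materialized as 1024 bytes in an SGPR.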
+ Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) + .addOperand(*SBase) + .addReg(OffsetSGPR); + } else { + Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) + .addOperand(*SBase) + .addImm(HiOffset); } + } else { + // Handle the _SGPR variant + MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); + Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) + .addOperand(*SBase) + .addOperand(*SOff); + unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) + .addOperand(*SOff) + .addImm(HalfSize); + Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) + .addOperand(*SBase) + .addReg(OffsetSGPR); + } + + unsigned SubLo, SubHi; + switch (HalfSize) { + case 4: + SubLo = AMDGPU::sub0; + SubHi = AMDGPU::sub1; + break; + case 8: + SubLo = AMDGPU::sub0_sub1; + SubHi = AMDGPU::sub2_sub3; + break; + case 16: + SubLo = AMDGPU::sub0_sub1_sub2_sub3; + SubHi = AMDGPU::sub4_sub5_sub6_sub7; + break; + case 32: + SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; + SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; + break; + default: + llvm_unreachable("Unhandled HalfSize"); } + + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) + .addOperand(MI->getOperand(0)) + .addReg(RegLo) + .addImm(SubLo) + .addReg(RegHi) + .addImm(SubHi); } void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { @@ -1112,7 +1839,7 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con case AMDGPU::S_LOAD_DWORDX2_IMM: case AMDGPU::S_LOAD_DWORDX2_SGPR: case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: + case AMDGPU::S_LOAD_DWORDX4_SGPR: { unsigned NewOpcode = getVALUOp(*MI); unsigned RegOffset; unsigned ImmOffset; @@ -1159,14 +1886,44 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con .addImm(AMDGPU::sub2) .addReg(DWord3) .addImm(AMDGPU::sub3); - MI->setDesc(get(NewOpcode)); - if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(MI->getOperand(1).getReg()); - } else { - MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); - } - MI->getOperand(1).setReg(SRsrc); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + MI->setDesc(get(NewOpcode)); + if (MI->getOperand(2).isReg()) { + MI->getOperand(2).setReg(MI->getOperand(1).getReg()); + } else { + MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); + } + MI->getOperand(1).setReg(SRsrc); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + + const TargetRegisterClass *NewDstRC = + RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + break; + } + case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORDX8_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, + AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } + + case AMDGPU::S_LOAD_DWORDX16_IMM: + case AMDGPU::S_LOAD_DWORDX16_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, + AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } } } @@ -1238,8 +1995,13 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 
Inst->eraseFromParent(); continue; + case AMDGPU::S_BFE_I64: { + splitScalar64BitBFE(Worklist, Inst); + Inst->eraseFromParent(); + continue; + } + case AMDGPU::S_BFE_U64: - case AMDGPU::S_BFE_I64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); } @@ -1268,17 +2030,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // We are converting these to a BFE, so we need to add the missing // operands for the size and offset. unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; - Inst->addOperand(Inst->getOperand(1)); - Inst->getOperand(1).ChangeToImmediate(0); - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(0)); Inst->addOperand(MachineOperand::CreateImm(0)); Inst->addOperand(MachineOperand::CreateImm(Size)); - // XXX - Other pointless operands. There are 4, but it seems you only need - // 3 to not hit an assertion later in MCInstLower. - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(0)); } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { // The VALU version adds the second operand to the result, so insert an // extra 0 operand. @@ -1297,16 +2051,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - Inst->RemoveOperand(2); // Remove old immediate. - Inst->addOperand(Inst->getOperand(1)); - Inst->getOperand(1).ChangeToImmediate(0); - Inst->addOperand(MachineOperand::CreateImm(0)); Inst->addOperand(MachineOperand::CreateImm(Offset)); - Inst->addOperand(MachineOperand::CreateImm(0)); Inst->addOperand(MachineOperand::CreateImm(BitWidth)); - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(0)); } // Update the destination register class. @@ -1519,6 +2266,67 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist Worklist.push_back(Second); } +void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + uint32_t Imm = Inst->getOperand(2).getImm(); + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + + (void) Offset; + + // Only sext_inreg cases handled. 
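// The scalar BFE immediate packs the bit offset in bits [5:0] and the
// field width in bits [22:16]; the VALU form takes them as separate
// operands, which is why the decoded Offset and BitWidth are appended
// as individual operands below. A small decode sketch of that packing:
#include <cstdint>

struct BfeField {
  uint32_t offset; // bits [5:0]
  uint32_t width;  // bits [22:16]
};

static BfeField decodeScalarBfeImm(uint32_t imm) {
  BfeField f;
  f.offset = imm & 0x3f;
  f.width = (imm & 0x7f0000) >> 16;
  return f;
}
// e.g. imm = 0x100000 selects offset 0 with a 16-bit field, i.e. an
// extend of the low 16 bits.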
+ assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && + BitWidth <= 32 && + Offset == 0 && + "Not implemented"); + + if (BitWidth < 32) { + unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) + .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) + .addImm(0) + .addImm(BitWidth); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) + .addImm(31) + .addReg(MidRegLo); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) + .addReg(MidRegLo) + .addImm(AMDGPU::sub0) + .addReg(MidRegHi) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + return; + } + + MachineOperand &Src = Inst->getOperand(1); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) + .addImm(31) + .addReg(Src.getReg(), 0, AMDGPU::sub0); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) + .addReg(Src.getReg(), 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(TmpReg) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); +} + void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, MachineInstr *Inst) const { // Add the implict and explicit register definitions. @@ -1537,6 +2345,74 @@ void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, } } +unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, + int OpIndices[3]) const { + const MCInstrDesc &Desc = get(MI->getOpcode()); + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = AMDGPU::NoRegister; + + // First we need to consider the instruction's operand requirements before + // legalizing. Some operands are required to be SGPRs, such as implicit uses + // of VCC, but we are still bound by the constant bus requirement to only use + // one. + // + // If the operand's class is an SGPR, we can never move it. + + for (const MachineOperand &MO : MI->implicit_operands()) { + // We only care about reads. + if (MO.isDef()) + continue; + + if (MO.getReg() == AMDGPU::VCC) + return AMDGPU::VCC; + + if (MO.getReg() == AMDGPU::FLAT_SCR) + return AMDGPU::FLAT_SCR; + } + + unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = OpIndices[i]; + if (Idx == -1) + break; + + const MachineOperand &MO = MI->getOperand(Idx); + if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) + SGPRReg = MO.getReg(); + + if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + UsedSGPRs[i] = MO.getReg(); + } + + if (SGPRReg != AMDGPU::NoRegister) + return SGPRReg; + + // We don't have a required SGPR operand, so we have a bit more freedom in + // selecting operands to move. + + // Try to select the most used SGPR. If an SGPR is equal to one of the + // others, we choose that. + // + // e.g. 
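// splitScalar64BitBFE handles the sext_inreg cases below with 32-bit VALU
// pieces: sign-extend the field inside the low dword with V_BFE_I32, then
// fill the high dword with copies of the new sign bit via an arithmetic
// shift right by 31. A plain-integer sketch of the same computation,
// assuming offset 0 and width <= 32:
#include <cstdint>

static uint64_t sextInReg64(uint64_t src, unsigned width) {
  uint32_t lo = static_cast<uint32_t>(src);
  if (width < 32) {
    // V_BFE_I32: sign-extend the low `width` bits of the low half.
    int32_t field =
        static_cast<int32_t>(lo << (32 - width)) >> (32 - width);
    lo = static_cast<uint32_t>(field);
  }
  // V_ASHRREV_I32 by 31: broadcast the sign bit into the high half.
  uint32_t hi = static_cast<uint32_t>(static_cast<int32_t>(lo) >> 31);
  return (static_cast<uint64_t>(hi) << 32) | lo;
}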
+ // V_FMA_F32 v0, s0, s0, s0 -> No moves + // V_FMA_F32 v0, s0, s1, s0 -> Move s1 + + if (UsedSGPRs[0] != AMDGPU::NoRegister) { + if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[0]; + } + + if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { + if (UsedSGPRs[1] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[1]; + } + + return SGPRReg; +} + MachineInstrBuilder SIInstrInfo::buildIndirectWrite( MachineBasicBlock *MBB, MachineBasicBlock::iterator I, @@ -1600,3 +2476,12 @@ void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); } + +MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, + unsigned OperandName) const { + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); + if (Idx == -1) + return nullptr; + + return &MI.getOperand(Idx); +} diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 4c204d8..3bdbc9b 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// -#ifndef SIINSTRINFO_H -#define SIINSTRINFO_H +#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H +#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H #include "AMDGPUInstrInfo.h" #include "SIRegisterInfo.h" @@ -52,9 +52,16 @@ private: void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst) const; + void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; + bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, + MachineInstr *MIb) const; + + unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; + public: explicit SIInstrInfo(const AMDGPUSubtarget &st); @@ -62,11 +69,30 @@ public: return RI; } + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const override; + + bool getLdStBaseRegImmOfs(MachineInstr *LdSt, + unsigned &BaseReg, unsigned &Offset, + const TargetRegisterInfo *TRI) const final; + + bool shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const final; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; + unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, + unsigned TmpReg, + unsigned Offset, + unsigned Size) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -79,19 +105,22 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; unsigned commuteOpcode(unsigned Opcode) const; MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI=false) const override; + bool NewMI = false) const override; + bool findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const override; bool isTriviallyReMaterializable(const MachineInstr *MI, AliasAnalysis *AA = nullptr) const; - unsigned getIEQOpcode() const override { - llvm_unreachable("Unimplemented"); - } + bool 
areMemAccessesTriviallyDisjoint( + MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) const override; MachineInstr *buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, @@ -100,16 +129,42 @@ public: bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; bool isDS(uint16_t Opcode) const; - int isMIMG(uint16_t Opcode) const; - int isSMRD(uint16_t Opcode) const; + bool isMIMG(uint16_t Opcode) const; + bool isSMRD(uint16_t Opcode) const; + bool isMUBUF(uint16_t Opcode) const; + bool isMTBUF(uint16_t Opcode) const; + bool isFLAT(uint16_t Opcode) const; bool isVOP1(uint16_t Opcode) const; bool isVOP2(uint16_t Opcode) const; bool isVOP3(uint16_t Opcode) const; bool isVOPC(uint16_t Opcode) const; + bool isInlineConstant(const APInt &Imm) const; bool isInlineConstant(const MachineOperand &MO) const; bool isLiteralConstant(const MachineOperand &MO) const; + bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const; + + /// \brief Return true if the given offset Size in bytes can be folded into + /// the immediate offsets of a memory instruction for the given address space. + static bool canFoldOffset(unsigned OffsetSize, unsigned AS) LLVM_READNONE; + + /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. + /// This function will return false if you pass it a 32-bit instruction. + bool hasVALU32BitEncoding(unsigned Opcode) const; + + /// \brief Returns true if this operand uses the constant bus. + bool usesConstantBus(const MachineRegisterInfo &MRI, + const MachineOperand &MO) const; + + /// \brief Return true if this instruction has any modifiers. + /// e.g. src[012]_mod, omod, clamp. + bool hasModifiers(unsigned Opcode) const; + + bool hasModifiersSet(const MachineInstr &MI, + unsigned OpName) const; + bool verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const override; @@ -141,10 +196,21 @@ public: /// instead of MOV. void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const; + /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand + /// for \p MI. + bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + const MachineOperand *MO = nullptr) const; + /// \brief Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr *MI) const; + /// \brief Split an SMRD instruction into two smaller loads of half the + // size storing the results in \p Lo and \p Hi. + void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const; + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; /// \brief Replace this instruction's opcode with the equivalent VALU @@ -175,29 +241,52 @@ public: unsigned SavReg, unsigned IndexReg) const; void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; + + /// \brief Returns the operand named \p Op. If \p MI does not have an + /// operand named \c Op, this function returns nullptr. 
+ MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; + + const MachineOperand *getNamedOperand(const MachineInstr &MI, + unsigned OpName) const { + return getNamedOperand(const_cast<MachineInstr &>(MI), OpName); + } }; namespace AMDGPU { int getVOPe64(uint16_t Opcode); + int getVOPe32(uint16_t Opcode); int getCommuteRev(uint16_t Opcode); int getCommuteOrig(uint16_t Opcode); int getMCOpcode(uint16_t Opcode, unsigned Gen); + int getAddr64Inst(uint16_t Opcode); + int getAtomicRetOp(uint16_t Opcode); + int getAtomicNoRetOp(uint16_t Opcode); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; - + const uint64_t RSRC_TID_ENABLE = 1LL << 55; } // End namespace AMDGPU -} // End namespace llvm +namespace SI { +namespace KernelInputOffsets { + +/// Offsets in bytes from the start of the input buffer +enum Offsets { + NGROUPS_X = 0, + NGROUPS_Y = 4, + NGROUPS_Z = 8, + GLOBAL_SIZE_X = 12, + GLOBAL_SIZE_Y = 16, + GLOBAL_SIZE_Z = 20, + LOCAL_SIZE_X = 24, + LOCAL_SIZE_Y = 28, + LOCAL_SIZE_Z = 32 +}; + +} // End namespace KernelInputOffsets +} // End namespace SI -namespace SIInstrFlags { - enum Flags { - // First 4 bits are the instruction encoding - VM_CNT = 1 << 0, - EXP_CNT = 1 << 1, - LGKM_CNT = 1 << 2 - }; -} +} // End namespace llvm -#endif //SIINSTRINFO_H +#endif diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 774c9d1..713e84e 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -7,6 +7,32 @@ // //===----------------------------------------------------------------------===// +class vop { + field bits<9> SI3; +} + +class vopc <bits<8> si> : vop { + field bits<8> SI = si; + + field bits<9> SI3 = {0, si{7-0}}; +} + +class vop1 <bits<8> si> : vop { + field bits<8> SI = si; + + field bits<9> SI3 = {1, 1, si{6-0}}; +} + +class vop2 <bits<6> si> : vop { + field bits<6> SI = si; + + field bits<9> SI3 = {1, 0, 0, si{5-0}}; +} + +class vop3 <bits<9> si> : vop { + field bits<9> SI3 = si; +} + // Execpt for the NONE field, this must be kept in sync with the SISubtarget enum // in AMDGPUMCInstLower.h def SISubtarget { @@ -57,6 +83,10 @@ def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; +def SIconstdata_ptr : SDNode< + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> +>; + // Transformation function, extract the lower 32bit of a 64bit immediate def LO32 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32); @@ -132,7 +162,7 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{ return false; } const SIRegisterInfo *SIRI = - static_cast<const SIRegisterInfo*>(TM.getRegisterInfo()); + static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo()); for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); U != E; ++U) { if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { @@ -142,15 +172,81 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{ return false; }]>; +//===----------------------------------------------------------------------===// +// Custom Operands +//===----------------------------------------------------------------------===// + def FRAMEri32 : Operand<iPTR> { let MIOperandInfo = (ops i32:$ptr, i32imm:$index); } +def sopp_brtarget : Operand<OtherVT> { + let EncoderMethod = "getSOPPBrEncoding"; + let OperandType = "OPERAND_PCREL"; +} + +include "SIInstrFormats.td" + +let OperandType = "OPERAND_IMMEDIATE" in { + +def offen : 
Operand<i1> { + let PrintMethod = "printOffen"; +} +def idxen : Operand<i1> { + let PrintMethod = "printIdxen"; +} +def addr64 : Operand<i1> { + let PrintMethod = "printAddr64"; +} +def mbuf_offset : Operand<i16> { + let PrintMethod = "printMBUFOffset"; +} +def ds_offset : Operand<i16> { + let PrintMethod = "printDSOffset"; +} +def ds_offset0 : Operand<i8> { + let PrintMethod = "printDSOffset0"; +} +def ds_offset1 : Operand<i8> { + let PrintMethod = "printDSOffset1"; +} +def glc : Operand <i1> { + let PrintMethod = "printGLC"; +} +def slc : Operand <i1> { + let PrintMethod = "printSLC"; +} +def tfe : Operand <i1> { + let PrintMethod = "printTFE"; +} + +def omod : Operand <i32> { + let PrintMethod = "printOModSI"; +} + +def ClampMod : Operand <i1> { + let PrintMethod = "printClampSI"; +} + +} // End OperandType = "OPERAND_IMMEDIATE" + //===----------------------------------------------------------------------===// // Complex patterns //===----------------------------------------------------------------------===// +def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; +def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; + +def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">; +def MUBUFAddr64Atomic : ComplexPattern<i64, 4, "SelectMUBUFAddr64">; +def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; +def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; +def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; + +def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; +def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; +def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; //===----------------------------------------------------------------------===// // SI assembler operands @@ -159,9 +255,20 @@ def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">; def SIOperand { int ZERO = 0x80; int VCC = 0x6A; + int FLAT_SCR = 0x68; } -include "SIInstrFormats.td" +def SRCMODS { + int NONE = 0; +} + +def DSTCLAMP { + int NONE = 0; +} + +def DSTOMOD { + int NONE = 0; +} //===----------------------------------------------------------------------===// // @@ -179,6 +286,35 @@ include "SIInstrFormats.td" // //===----------------------------------------------------------------------===// +class SIMCInstr <string pseudo, int subtarget> { + string PseudoInstr = pseudo; + int Subtarget = subtarget; +} + +//===----------------------------------------------------------------------===// +// EXP classes +//===----------------------------------------------------------------------===// + +class EXPCommon : InstSI< + (outs), + (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, + VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), + "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", + [] > { + + let EXP_CNT = 1; + let Uses = [EXEC]; +} + +multiclass EXP_m { + + let isPseudo = 1 in { + def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; + } + + def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; +} + //===----------------------------------------------------------------------===// // Scalar classes //===----------------------------------------------------------------------===// @@ -204,11 +340,21 @@ class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 < opName#" $dst, $src0, $src1", pattern >; +class SOP2_SELECT_32 <bits<7> op, string opName, list<dag> pattern> : 
SOP2 < + op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), + opName#" $dst, $src0, $src1 [$scc]", pattern +>; + class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 < op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), opName#" $dst, $src0, $src1", pattern >; +class SOP2_64_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 < + op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; + class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 < op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern @@ -227,27 +373,52 @@ class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper<op, SSrc_64, i64, opName, cond>; class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK < - op, (outs SReg_32:$dst), (ins i16imm:$src0), + op, (outs SReg_32:$dst), (ins u16imm:$src0), opName#" $dst, $src0", pattern >; class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK < - op, (outs SReg_64:$dst), (ins i16imm:$src0), + op, (outs SReg_64:$dst), (ins u16imm:$src0), opName#" $dst, $src0", pattern >; -multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass, +//===----------------------------------------------------------------------===// +// SMRD classes +//===----------------------------------------------------------------------===// + +class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SMRD <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; +} + +class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, + string asm> : + SMRD <outs, ins, asm, []>, + SMRDe <op, imm>, + SIMCInstr<opName, SISubtarget.SI>; + +multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, + string asm, list<dag> pattern> { + + def "" : SMRD_Pseudo <opName, outs, ins, pattern>; + + def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>; + +} + +multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass, RegisterClass dstClass> { - def _IMM : SMRD < - op, 1, (outs dstClass:$dst), + defm _IMM : SMRD_m < + op, opName#"_IMM", 1, (outs dstClass:$dst), (ins baseClass:$sbase, u32imm:$offset), - asm#" $dst, $sbase, $offset", [] + opName#" $dst, $sbase, $offset", [] >; - def _SGPR : SMRD < - op, 0, (outs dstClass:$dst), + defm _SGPR : SMRD_m < + op, opName#"_SGPR", 0, (outs dstClass:$dst), (ins baseClass:$sbase, SReg_32:$soff), - asm#" $dst, $sbase, $soff", [] + opName#" $dst, $sbase, $soff", [] >; } @@ -255,6 +426,197 @@ multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass, // Vector ALU classes //===----------------------------------------------------------------------===// +// This must always be right before the operand being input modified. +def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> { + let PrintMethod = "printOperandAndMods"; +} +def InputModsNoDefault : Operand <i32> { + let PrintMethod = "printOperandAndMods"; +} + +class getNumSrcArgs<ValueType Src1, ValueType Src2> { + int ret = + !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 + !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 + 3)); // VOP3 +} + +// Returns the register class to use for the destination of VOP[123C] +// instructions for the given VT. 
+class getVALUDstForVT<ValueType VT> { + RegisterClass ret = !if(!eq(VT.Size, 32), VReg_32, VReg_64); +} + +// Returns the register class to use for source 0 of VOP[12C] +// instructions for the given VT. +class getVOPSrc0ForVT<ValueType VT> { + RegisterClass ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); +} + +// Returns the register class to use for source 1 of VOP[12C] for the +// given VT. +class getVOPSrc1ForVT<ValueType VT> { + RegisterClass ret = !if(!eq(VT.Size, 32), VReg_32, VReg_64); +} + +// Returns the register classes for the source arguments of a VOP[12C] +// instruction for the given SrcVTs. +class getInRC32 <list<ValueType> SrcVT> { + list<RegisterClass> ret = [ + getVOPSrc0ForVT<SrcVT[0]>.ret, + getVOPSrc1ForVT<SrcVT[1]>.ret + ]; +} + +// Returns the register class to use for sources of VOP3 instructions for the +// given VT. +class getVOP3SrcForVT<ValueType VT> { + RegisterClass ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); +} + +// Returns the register classes for the source arguments of a VOP3 +// instruction for the given SrcVTs. +class getInRC64 <list<ValueType> SrcVT> { + list<RegisterClass> ret = [ + getVOP3SrcForVT<SrcVT[0]>.ret, + getVOP3SrcForVT<SrcVT[1]>.ret, + getVOP3SrcForVT<SrcVT[2]>.ret + ]; +} + +// Returns 1 if the source arguments have modifiers, 0 if they do not. +class hasModifiers<ValueType SrcVT> { + bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, 0)); +} + +// Returns the input arguments for VOP[12C] instructions for the given SrcVT. +class getIns32 <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs> { + dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 + !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 + (ins))); +} + +// Returns the input arguments for VOP3 instructions for the given SrcVT. +class getIns64 <RegisterClass Src0RC, RegisterClass Src1RC, + RegisterClass Src2RC, int NumSrcArgs, + bit HasModifiers> { + + dag ret = + !if (!eq(NumSrcArgs, 1), + !if (!eq(HasModifiers, 1), + // VOP1 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP1 without modifiers + (ins Src0RC:$src0) + /* endif */ ), + !if (!eq(NumSrcArgs, 2), + !if (!eq(HasModifiers, 1), + // VOP 2 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP2 without modifiers + (ins Src0RC:$src0, Src1RC:$src1) + /* endif */ ) + /* NumSrcArgs == 3 */, + !if (!eq(HasModifiers, 1), + // VOP3 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + InputModsNoDefault:$src2_modifiers, Src2RC:$src2, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP3 without modifiers + (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) + /* endif */ ))); +} + +// Returns the assembly string for the inputs and outputs of a VOP[12C] +// instruction. This does not add the _e32 suffix, so it can be reused +// by getAsm64. +class getAsm32 <int NumSrcArgs> { + string src1 = ", $src1"; + string src2 = ", $src2"; + string ret = " $dst, $src0"# + !if(!eq(NumSrcArgs, 1), "", src1)# + !if(!eq(NumSrcArgs, 3), src2, ""); +} + +// Returns the assembly string for the inputs and outputs of a VOP3 +// instruction. 
+class getAsm64 <int NumSrcArgs, bit HasModifiers> { + string src0 = "$src0_modifiers,"; + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + string ret = + !if(!eq(HasModifiers, 0), + getAsm32<NumSrcArgs>.ret, + " $dst, "#src0#src1#src2#"$clamp"#"$omod"); +} + + +class VOPProfile <list<ValueType> _ArgVT> { + + field list<ValueType> ArgVT = _ArgVT; + + field ValueType DstVT = ArgVT[0]; + field ValueType Src0VT = ArgVT[1]; + field ValueType Src1VT = ArgVT[2]; + field ValueType Src2VT = ArgVT[3]; + field RegisterClass DstRC = getVALUDstForVT<DstVT>.ret; + field RegisterClass Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; + field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret; + field RegisterClass Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; + field RegisterClass Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; + field RegisterClass Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; + + field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret; + field bit HasModifiers = hasModifiers<Src0VT>.ret; + + field dag Outs = (outs DstRC:$dst); + + field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; + field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, + HasModifiers>.ret; + + field string Asm32 = "_e32"#getAsm32<NumSrcArgs>.ret; + field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret; +} + +def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; +def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; +def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; +def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>; +def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>; +def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; +def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; +def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; +def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; + +def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; +def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; +def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; +def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; +def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { + let Src0RC32 = VCSrc_32; +} +def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; +def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; + +def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; +def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; +def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; +def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; + + class VOP <string opName> { string OpName = opName; } @@ -264,197 +626,310 @@ class VOP2_REV <string revOp, bit isOrig> { bit IsOrig = isOrig; } -class SIMCInstr <string pseudo, int subtarget> { - string PseudoInstr = pseudo; - int Subtarget = subtarget; +class AtomicNoRet <string noRetOp, bit isRet> { + string NoRetOp = noRetOp; + bit IsRet = isRet; +} + +class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOP1Common <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; } -multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern, +multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, string opName> 
{ + def "" : VOP1_Pseudo <outs, ins, pattern, opName>; - def "" : VOP3Common <outs, ins, "", pattern>, VOP <opName>, - SIMCInstr<OpName, SISubtarget.NONE> { - let isPseudo = 1; - } + def _si : VOP1<op.SI, outs, ins, asm, []>, + SIMCInstr <opName, SISubtarget.SI>; +} - def _si : VOP3 <op, outs, ins, asm, []>, SIMCInstr<opName, SISubtarget.SI>; +class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> { + bits<2> src0_modifiers = !if(HasModifiers, ?, 0); + bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); + bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ? ,0) ,0); + bits<2> omod = !if(HasModifiers, ?, 0); + bits<1> clamp = !if(HasModifiers, ?, 0); + bits<9> src1 = !if(HasSrc1, ?, 0); + bits<9> src2 = !if(HasSrc2, ?, 0); } -// This must always be right before the operand being input modified. -def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> { - let PrintMethod = "printOperandAndMods"; +class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOP3Common <outs, ins, "", pattern>, + VOP <opName>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; } -multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src, - string opName, list<dag> pattern> { +class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : + VOP3 <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.SI>; - def _e32 : VOP1 < - op, (outs drc:$dst), (ins src:$src0), - opName#"_e32 $dst, $src0", pattern - >, VOP <opName>; +multiclass VOP3_m <vop3 op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, int NumSrcArgs, bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), + !if(!eq(NumSrcArgs, 2), 0, 1), + HasMods>; - def _e64 : VOP3 < - {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - (outs drc:$dst), - (ins InputMods:$src0_modifiers, src:$src0, i32imm:$clamp, i32imm:$omod), - opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", [] - >, VOP <opName> { - let src1 = SIOperand.ZERO; - let src2 = SIOperand.ZERO; - } } -multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern> - : VOP1_Helper <op, VReg_32, VSrc_32, opName, pattern>; +multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, bit HasMods = 1> { -multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern> - : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>; + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; -multiclass VOP1_32_64 <bits<8> op, string opName, list<dag> pattern> - : VOP1_Helper <op, VReg_32, VSrc_64, opName, pattern>; + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; +} -multiclass VOP1_64_32 <bits<8> op, string opName, list<dag> pattern> - : VOP1_Helper <op, VReg_64, VSrc_32, opName, pattern>; +multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { -multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc, - string opName, list<dag> pattern, string revOp> { - def _e32 : VOP2 < - op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), - opName#"_e32 $dst, $src0, $src1", pattern - >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - def _e64 : VOP3 < - {1, 0, 0, op{5}, op{4}, op{3}, 
op{2}, op{1}, op{0}}, - (outs vrc:$dst), - (ins InputMods:$src0_modifiers, arc:$src0, - InputMods:$src1_modifiers, arc:$src1, - i32imm:$clamp, i32imm:$omod), - opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [] - >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { - let src2 = SIOperand.ZERO; - } + def _si : VOP3_Real_si <op.SI3, + outs, ins, asm, opName>, + VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>, + VOP3DisableFields<1, 0, HasMods>; } -multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern, - string revOp = opName> - : VOP2_Helper <op, VReg_32, VSrc_32, opName, pattern, revOp>; - -multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern, - string revOp = opName> - : VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern, revOp>; - -multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern, - RegisterClass src0_rc, string revOp = opName> { - - def _e32 : VOP2 < - op, (outs VReg_32:$dst), (ins src0_rc:$src0, VReg_32:$src1), - opName#"_e32 $dst, $src0, $src1", pattern - >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - - def _e64 : VOP3b < - {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - (outs VReg_32:$dst), - (ins InputMods: $src0_modifiers, VSrc_32:$src0, - InputMods:$src1_modifiers, VSrc_32:$src1, - i32imm:$clamp, i32imm:$omod), - opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [] - >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { - let src2 = SIOperand.ZERO; - /* the VOP2 variant puts the carry out into VCC, the VOP3 variant - can write it into any SGPR. We currently don't use the carry out, - so for now hardcode it to VCC as well */ - let sdst = SIOperand.VCC; - } +multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + + // The VOP2 variant puts the carry out into VCC, the VOP3 variant + // can write it into any SGPR. We currently don't use the carry out, + // so for now hardcode it to VCC as well. 
+ let sdst = SIOperand.VCC, Defs = [VCC] in { + def _si : VOP3b <op.SI3, outs, ins, asm, pattern>, + VOP3DisableFields<1, 0, HasMods>, + SIMCInstr<opName, SISubtarget.SI>, + VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>; + } // End sdst = SIOperand.VCC, Defs = [VCC] } -multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, - string opName, ValueType vt, PatLeaf cond, bit defExec = 0> { - def _e32 : VOPC < - op, (ins arc:$src0, vrc:$src1), - opName#"_e32 $dst, $src0, $src1", [] - >, VOP <opName> { - let Defs = !if(defExec, [VCC, EXEC], [VCC]); - } +multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, + bit HasMods, bit defExec> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - def _e64 : VOP3 < - {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - (outs SReg_64:$dst), - (ins InputMods:$src0_modifiers, arc:$src0, - InputMods:$src1_modifiers, arc:$src1, - InstFlag:$clamp, InstFlag:$omod), - opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", - !if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>, - [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))] - ) - >, VOP <opName> { + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); - let src2 = SIOperand.ZERO; - let src2_modifiers = 0; } } -multiclass VOPC_32 <bits<8> op, string opName, - ValueType vt = untyped, PatLeaf cond = COND_NULL> - : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond>; +multiclass VOP1_Helper <vop1 op, string opName, dag outs, + dag ins32, string asm32, list<dag> pat32, + dag ins64, string asm64, list<dag> pat64, + bit HasMods> { -multiclass VOPC_64 <bits<8> op, string opName, - ValueType vt = untyped, PatLeaf cond = COND_NULL> - : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>; + def _e32 : VOP1 <op.SI, outs, ins32, opName#asm32, pat32>, VOP<opName>; -multiclass VOPCX_32 <bits<8> op, string opName, - ValueType vt = untyped, PatLeaf cond = COND_NULL> - : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond, 1>; + defm _e64 : VOP3_1_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, HasMods>; +} + +multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> : VOP1_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), + P.HasModifiers +>; + +class VOP2_e32 <bits<6> op, string opName, dag outs, dag ins, string asm, + list<dag> pattern, string revOp> : + VOP2 <op, outs, ins, opName#asm, pattern>, + VOP <opName>, + VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; -multiclass VOPCX_64 <bits<8> op, string opName, - ValueType vt = untyped, PatLeaf cond = COND_NULL> - : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond, 1>; +multiclass VOP2_Helper <vop2 op, string opName, dag outs, + dag ins32, string asm32, list<dag> pat32, + dag ins64, string asm64, list<dag> pat64, + string revOp, bit HasMods> { + def _e32 : VOP2_e32 <op.SI, opName, outs, ins32, asm32, pat32, revOp>; -multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m < - op, (outs VReg_32:$dst), - (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers, - VSrc_32:$src1, InputMods:$src2_modifiers, VSrc_32:$src2, - InstFlag:$clamp, InstFlag:$omod), - opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, 
$omod", pattern, opName + defm _e64 : VOP3_2_m <op, + outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods + >; +} + +multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> : VOP2_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers >; -class VOP3_64_32 <bits <9> op, string opName, list<dag> pattern> : VOP3 < - op, (outs VReg_64:$dst), - (ins VSrc_64:$src0, VSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->, VOP <opName> { +multiclass VOP2b_Helper <vop2 op, string opName, dag outs, + dag ins32, string asm32, list<dag> pat32, + dag ins64, string asm64, list<dag> pat64, + string revOp, bit HasMods> { - let src2 = SIOperand.ZERO; - let src0_modifiers = 0; - let clamp = 0; - let omod = 0; + def _e32 : VOP2_e32 <op.SI, opName, outs, ins32, asm32, pat32, revOp>; + + defm _e64 : VOP3b_2_m <op, + outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods + >; } -class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 < - op, (outs VReg_64:$dst), - (ins InputMods:$src0_modifiers, VSrc_64:$src0, - InputMods:$src1_modifiers, VSrc_64:$src1, - InputMods:$src2_modifiers, VSrc_64:$src2, - InstFlag:$clamp, InstFlag:$omod), - opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern ->, VOP <opName>; +multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> : VOP2b_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers +>; + +multiclass VOPC_Helper <vopc op, string opName, + dag ins32, string asm32, list<dag> pat32, + dag out64, dag ins64, string asm64, list<dag> pat64, + bit HasMods, bit DefExec> { + def _e32 : VOPC <op.SI, ins32, opName#asm32, pat32>, VOP <opName> { + let Defs = !if(DefExec, [EXEC], []); + } + + defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64, opName, + HasMods, DefExec>; +} + +multiclass VOPCInst <vopc op, string opName, + VOPProfile P, PatLeaf cond = COND_NULL, + bit DefExec = 0> : VOPC_Helper < + op, opName, + P.Ins32, P.Asm32, [], + (outs SReg_64:$dst), P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set i1:$dst, + (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), + P.HasModifiers, DefExec +>; +multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCInst <op, opName, VOP_F32_F32_F32, cond>; -class VOP3b_Helper <bits<9> op, RegisterClass vrc, RegisterClass arc, - string opName, list<dag> pattern> : VOP3 < - op, (outs vrc:$dst0, SReg_64:$dst1), - (ins arc:$src0, arc:$src1, arc:$src2, - InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), - opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern ->, VOP <opName>; +multiclass VOPC_F64 
<vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCInst <op, opName, VOP_F64_F64_F64, cond>; +multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCInst <op, opName, VOP_I32_I32_I32, cond>; -class VOP3b_64 <bits<9> op, string opName, list<dag> pattern> : +multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCInst <op, opName, VOP_I64_I64_I64, cond>; + + +multiclass VOPCX <vopc op, string opName, VOPProfile P, + PatLeaf cond = COND_NULL> + : VOPCInst <op, opName, P, cond, 1>; + +multiclass VOPCX_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCX <op, opName, VOP_F32_F32_F32, cond>; + +multiclass VOPCX_F64 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCX <op, opName, VOP_F64_F64_F64, cond>; + +multiclass VOPCX_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCX <op, opName, VOP_I32_I32_I32, cond>; + +multiclass VOPCX_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCX <op, opName, VOP_I64_I64_I64, cond>; + +multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, + list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m < + op, outs, ins, opName#asm, pat, opName, NumSrcArgs, HasMods +>; + +multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> : VOP3_Helper < + op, opName, P.Outs, P.Ins64, P.Asm64, + !if(!eq(P.NumSrcArgs, 3), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, + P.Src2VT:$src2))]), + !if(!eq(P.NumSrcArgs, 2), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) + /* P.NumSrcArgs == 1 */, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), + P.NumSrcArgs, P.HasModifiers +>; + +multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterClass arc, + string opName, list<dag> pattern> : + VOP3b_2_m < + op, (outs vrc:$vdst, SReg_64:$sdst), + (ins InputModsNoDefault:$src0_modifiers, arc:$src0, + InputModsNoDefault:$src1_modifiers, arc:$src1, + InputModsNoDefault:$src2_modifiers, arc:$src2, + ClampMod:$clamp, omod:$omod), + opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, + opName, opName, 1, 1 +>; + +multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> : VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>; -class VOP3b_32 <bits<9> op, string opName, list<dag> pattern> : +multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> : VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>; + +class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))), + (Inst i32:$src0_modifiers, P.Src0VT:$src0, + i32:$src1_modifiers, P.Src1VT:$src1, + i32:$src2_modifiers, P.Src2VT:$src2, + i1:$clamp, + i32:$omod)>; + 
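A minimal usage sketch, not taken from this patch: roughly how the profile-driven VOP multiclasses above are meant to be instantiated over in SIInstructions.td. The V_EXAMPLE_* names and the opcode values are placeholders, and the vop1/vop2/vop3 encoding wrappers are assumed to be supplied by SIInstrFormats.td.

// Hypothetical instantiations; each defm derives its operand lists, register
// classes, asm strings and source-modifier patterns from the VOPProfile.
defm V_EXAMPLE_UNOP  : VOP1Inst <vop1<0x33>, "v_example_unop",  VOP_F32_F32, fsqrt>;
defm V_EXAMPLE_BINOP : VOP2Inst <vop2<0x03>, "v_example_binop", VOP_F32_F32_F32, fadd>;
defm V_EXAMPLE_TEROP : VOP3Inst <vop3<0x14b>, "v_example_terop", VOP_F32_F32_F32_F32, fma>;

For VOP1 and VOP2 such a defm expands to both the compact _e32 encoding and a VOP3-encoded _e64 form carrying source modifiers, clamp and omod; the VOP3 side is emitted as a subtarget-neutral pseudo plus an SI-encoded _si instruction, linked through SIMCInstr and resolved later via the getMCOpcode mapping defined further down.
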
//===----------------------------------------------------------------------===// // Vector I/O classes //===----------------------------------------------------------------------===// @@ -466,13 +941,15 @@ class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> : // Single load interpret the 2 i8imm operands as a single i16 offset. let offset0 = offset{7-0}; let offset1 = offset{15-8}; + + let hasSideEffects = 0; } class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < op, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, u16imm:$offset), - asm#" $vdst, $addr, $offset, [M0]", + (ins i1imm:$gds, VReg_32:$addr, ds_offset:$offset), + asm#" $vdst, $addr"#"$offset"#" [M0]", []> { let data0 = 0; let data1 = 0; @@ -483,20 +960,21 @@ class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS < op, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, u8imm:$offset0, u8imm:$offset1), - asm#" $gds, $vdst, $addr, $offset0, $offset1, [M0]", + (ins i1imm:$gds, VReg_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1), + asm#" $vdst, $addr"#"$offset0"#"$offset1 [M0]", []> { let data0 = 0; let data1 = 0; let mayLoad = 1; let mayStore = 0; + let hasSideEffects = 0; } class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u16imm:$offset), - asm#" $addr, $data0, $offset [M0]", + (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, ds_offset:$offset), + asm#" $addr, $data0"#"$offset"#" [M0]", []> { let data1 = 0; let mayStore = 1; @@ -504,76 +982,204 @@ class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < let vdst = 0; } -class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < +class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u8imm:$offset0, u8imm:$offset1), - asm#" $addr, $data0, $data1, $offset0, $offset1 [M0]", + (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, regClass:$data1, + ds_offset0:$offset0, ds_offset1:$offset1), + asm#" $addr, $data0, $data1"#"$offset0"#"$offset1 [M0]", []> { let mayStore = 1; let mayLoad = 0; + let hasSideEffects = 0; let vdst = 0; } // 1 address, 1 data. -class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A < +class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A < op, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset), - asm#" $vdst, $addr, $data0, $offset, [M0]", - []> { + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset), + asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", []>, + AtomicNoRet<noRetOp, 1> { let data1 = 0; let mayStore = 1; let mayLoad = 1; + + let hasPostISelHook = 1; // Adjusted to no return version. } // 1 address, 2 data. -class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A < +class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A < op, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset), - asm#" $vdst, $addr, $data0, $data1, $offset, [M0]", - []> { + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset), + asm#" $vdst, $addr, $data0, $data1"#"$offset"#" [M0]", + []>, + AtomicNoRet<noRetOp, 1> { let mayStore = 1; let mayLoad = 1; + let hasPostISelHook = 1; // Adjusted to no return version. 
} // 1 address, 2 data. -class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A < +class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset), - asm#" $addr, $data0, $data1, $offset, [M0]", - []> { + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset), + asm#" $addr, $data0, $data1"#"$offset"#" [M0]", + []>, + AtomicNoRet<noRetOp, 0> { let mayStore = 1; let mayLoad = 1; } // 1 address, 1 data. -class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A < +class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset), - asm#" $addr, $data0, $offset, [M0]", - []> { + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset), + asm#" $addr, $data0"#"$offset"#" [M0]", + []>, + AtomicNoRet<noRetOp, 0> { let data1 = 0; let mayStore = 1; let mayLoad = 1; } -class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < - op, - (outs), +//===----------------------------------------------------------------------===// +// MTBUF classes +//===----------------------------------------------------------------------===// + +class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + MTBUF <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; +} + +class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins, + string asm> : + MTBUF <outs, ins, asm, []>, + MTBUFe <op>, + SIMCInstr<opName, SISubtarget.SI>; + +multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : MTBUF_Pseudo <opName, outs, ins, pattern>; + + def _si : MTBUF_Real_si <op, opName, outs, ins, asm>; + +} + +let mayStore = 1, mayLoad = 0 in { + +multiclass MTBUF_Store_Helper <bits<3> op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs), (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), - asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", - []> { - let mayStore = 1; - let mayLoad = 0; + opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayStore = 1, mayLoad = 0 + +let mayLoad = 1, mayStore = 0 in { + +multiclass MTBUF_Load_Helper <bits<3> op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs regClass:$dst), + (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, + i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), + opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayLoad = 1, mayStore = 0 + +class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { + + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + +class MUBUFAtomicAddr64 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> + : MUBUF <op, outs, ins, asm, pattern> { + + let offen = 0; + let idxen = 0; + let addr64 = 1; + let tfe = 0; + let lds = 0; + let soffset = 128; +} + +class MUBUFAtomicOffset <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> + : MUBUF <op, 
outs, ins, asm, pattern> { + + let offen = 0; + let idxen = 0; + let addr64 = 0; + let tfe = 0; + let lds = 0; + let vaddr = 0; +} + +multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc, + ValueType vt, SDPatternOperator atomic> { + + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { + + // No return variants + let glc = 0 in { + + def _ADDR64 : MUBUFAtomicAddr64 < + op, (outs), + (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#"$slc", [] + >, MUBUFAddr64Table<1>, AtomicNoRet<NAME#"_ADDR64", 0>; + + def _OFFSET : MUBUFAtomicOffset < + op, (outs), + (ins rc:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, + SSrc_32:$soffset, slc:$slc), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [] + >, MUBUFAddr64Table<0>, AtomicNoRet<NAME#"_OFFSET", 0>; + } // glc = 0 + + // Variant that return values + let glc = 1, Constraints = "$vdata = $vdata_in", + DisableEncoding = "$vdata_in" in { + + def _RTN_ADDR64 : MUBUFAtomicAddr64 < + op, (outs rc:$vdata), + (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#" glc"#"$slc", + [(set vt:$vdata, + (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i16:$offset, + i1:$slc), vt:$vdata_in))] + >, MUBUFAddr64Table<1, "_RTN">, AtomicNoRet<NAME#"_ADDR64", 1>; + + def _RTN_OFFSET : MUBUFAtomicOffset < + op, (outs rc:$vdata), + (ins rc:$vdata_in, SReg_128:$srsrc, mbuf_offset:$offset, + SSrc_32:$soffset, slc:$slc), + name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", + [(set vt:$vdata, + (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, + i1:$slc), vt:$vdata_in))] + >, MUBUFAddr64Table<0, "_RTN">, AtomicNoRet<NAME#"_OFFSET", 1>; + + } // glc = 1 + + } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 } multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass, @@ -584,81 +1190,137 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass, let addr64 = 0 in { - let offen = 0, idxen = 0 in { + let offen = 0, idxen = 0, vaddr = 0 in { def _OFFSET : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_32:$vaddr, - u16imm:$offset, SSrc_32:$soffset, i1imm:$glc, - i1imm:$slc, i1imm:$tfe), - asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; + (ins SReg_128:$srsrc, + mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc, + slc:$slc, tfe:$tfe), + asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, + i32:$soffset, i16:$offset, + i1:$glc, i1:$slc, i1:$tfe)))]>, + MUBUFAddr64Table<0>; } - let offen = 1, idxen = 0, offset = 0 in { + let offen = 1, idxen = 0 in { def _OFFEN : MUBUF <op, (outs regClass:$vdata), (ins SReg_128:$srsrc, VReg_32:$vaddr, - SSrc_32:$soffset, i1imm:$glc, i1imm:$slc, - i1imm:$tfe), - asm#" $vdata, $srsrc + $vaddr + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; + SSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, + tfe:$tfe), + asm#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; } let offen = 0, idxen = 1 in { def _IDXEN : MUBUF <op, (outs regClass:$vdata), (ins SReg_128:$srsrc, VReg_32:$vaddr, - u16imm:$offset, SSrc_32:$soffset, i1imm:$glc, - i1imm:$slc, i1imm:$tfe), - asm#" $vdata, $srsrc[$vaddr] + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; + mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc, + slc:$slc, tfe:$tfe), + asm#" $vdata, $vaddr, $srsrc, 
$soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; } let offen = 1, idxen = 1 in { def _BOTHEN : MUBUF <op, (outs regClass:$vdata), (ins SReg_128:$srsrc, VReg_64:$vaddr, - SSrc_32:$soffset, i1imm:$glc, - i1imm:$slc, i1imm:$tfe), - asm#" $vdata, $srsrc[$vaddr[0]] + $vaddr[1] + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; + SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), + asm#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>; } } let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in { def _ADDR64 : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset), - asm#" $vdata, $srsrc + $vaddr + $offset", + (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), + asm#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, - i64:$vaddr, u16imm:$offset)))]>; + i64:$vaddr, i16:$offset)))]>, MUBUFAddr64Table<1>; } } } -class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass, - ValueType store_vt, SDPatternOperator st> : - MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, - u16imm:$offset), - name#" $vdata, $srsrc + $vaddr + $offset", - [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> { +multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass, + ValueType store_vt, SDPatternOperator st> { + + let addr64 = 0, lds = 0 in { + + def "" : MUBUF < + op, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset, + mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc, + tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"# + "$glc"#"$slc"#"$tfe", + [] + >; + + let offen = 0, idxen = 0, vaddr = 0 in { + def _OFFSET : MUBUF < + op, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, + SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, + i1:$tfe))] + >, MUBUFAddr64Table<0>; + } // offen = 0, idxen = 0, vaddr = 0 + + let offen = 1, idxen = 0 in { + def _OFFEN : MUBUF < + op, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"# + "$glc"#"$slc"#"$tfe", + [] + >; + } // end offen = 1, idxen = 0 + + } // End addr64 = 0, lds = 0 + + def _ADDR64 : MUBUF < + op, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), + name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", + [(st store_vt:$vdata, + (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>, MUBUFAddr64Table<1> + { + + let mayLoad = 0; + let mayStore = 1; + + // Encoding + let offen = 0; + let idxen = 0; + let glc = 0; + let addr64 = 1; + let lds = 0; + let slc = 0; + let tfe = 0; + let soffset = 128; // ZERO + } +} + +class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : + FLAT <op, (outs regClass:$data), + (ins VReg_64:$addr), + asm#" $data, $addr, [M0, FLAT_SCRATCH]", []> { + let glc = 0; + let slc = 0; + let tfe = 0; + let mayLoad = 1; +} + +class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : + FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr), + name#" $data, $addr, [M0, FLAT_SCRATCH]", + []> { let mayLoad = 0; let mayStore = 1; // 
Encoding - let offen = 0; - let idxen = 0; let glc = 0; - let addr64 = 1; - let lds = 0; let slc = 0; let tfe = 0; - let soffset = 128; // ZERO -} - -class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < - op, - (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), - asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", - []> { - let mayLoad = 1; - let mayStore = 0; } class MIMG_Mask <string op, int channels> { @@ -799,6 +1461,15 @@ def getVOPe64 : InstrMapping { let ValueCols = [["8"]]; } +// Maps an opcode in e64 form to its e32 equivalent +def getVOPe32 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["Size"]; + let KeyCol = ["8"]; + let ValueCols = [["4"]]; +} + // Maps an original opcode to its commuted version def getCommuteRev : InstrMapping { let FilterClass = "VOP2_REV"; @@ -841,4 +1512,30 @@ def getMCOpcode : InstrMapping { let ValueCols = [[!cast<string>(SISubtarget.SI)]]; } +def getAddr64Inst : InstrMapping { + let FilterClass = "MUBUFAddr64Table"; + let RowFields = ["OpName"]; + let ColFields = ["IsAddr64"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an atomic opcode to its version with a return value. +def getAtomicRetOp : InstrMapping { + let FilterClass = "AtomicNoRet"; + let RowFields = ["NoRetOp"]; + let ColFields = ["IsRet"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an atomic opcode to its returnless version. +def getAtomicNoRetOp : InstrMapping { + let FilterClass = "AtomicNoRet"; + let RowFields = ["NoRetOp"]; + let ColFields = ["IsRet"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + include "SIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index b3b44e2..90da7a9 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -31,13 +31,25 @@ def isSI : Predicate<"Subtarget.getGeneration() " def isCI : Predicate<"Subtarget.getGeneration() " ">= AMDGPUSubtarget::SEA_ISLANDS">; +def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">; -def isCFDepth0 : Predicate<"isCFDepth0()">; +def SWaitMatchClass : AsmOperandClass { + let Name = "SWaitCnt"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSWaitCntOps"; +} -def WAIT_FLAG : InstFlag<"printWaitFlag">; +def WAIT_FLAG : InstFlag<"printWaitFlag"> { + let ParserMatchClass = SWaitMatchClass; +} let SubtargetPredicate = isSI in { -let OtherPredicates = [isCFDepth0] in { + +//===----------------------------------------------------------------------===// +// EXP Instructions +//===----------------------------------------------------------------------===// + +defm EXP : EXP_m; //===----------------------------------------------------------------------===// // SMRD Instructions @@ -48,125 +60,126 @@ let mayLoad = 1 in { // We are using the SGPR_32 and not the SReg_32 register class for 32-bit // SMRD instructions, because the SGPR_32 register class does not include M0 // and writing to M0 from an SMRD instruction will hang the GPU. 
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>; +defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32 + 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 >; defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64 + 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 >; defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128 + 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 >; defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256 + 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 >; defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512 + 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 >; } // mayLoad = 1 -//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; +//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; //===----------------------------------------------------------------------===// // SOP1 Instructions //===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in { - let isMoveImm = 1 in { -def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>; -def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>; -def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>; -def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>; +def S_MOV_B32 : SOP1_32 <0x00000003, "s_mov_b32", []>; +def S_MOV_B64 : SOP1_64 <0x00000004, "s_mov_b64", []>; +def S_CMOV_B32 : SOP1_32 <0x00000005, "s_cmov_b32", []>; +def S_CMOV_B64 : SOP1_64 <0x00000006, "s_cmov_b64", []>; } // End isMoveImm = 1 -def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", +def S_NOT_B32 : SOP1_32 <0x00000007, "s_not_b32", [(set i32:$dst, (not i32:$src0))] >; -def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", +def S_NOT_B64 : SOP1_64 <0x00000008, "s_not_b64", [(set i64:$dst, (not i64:$src0))] >; -def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>; -def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>; -def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", +def S_WQM_B32 : SOP1_32 <0x00000009, "s_wqm_b32", []>; +def S_WQM_B64 : SOP1_64 <0x0000000a, "s_wqm_b64", []>; +def S_BREV_B32 : SOP1_32 <0x0000000b, "s_brev_b32", [(set i32:$dst, (AMDGPUbrev i32:$src0))] >; -def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; -} // End neverHasSideEffects = 1 +def S_BREV_B64 : SOP1_64 <0x0000000c, "s_brev_b64", []>; -////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>; -////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>; -def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32", +////def S_BCNT0_I32_B32 : 
SOP1_BCNT0 <0x0000000d, "s_bcnt0_i32_b32", []>; +////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "s_bcnt0_i32_b64", []>; +def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "s_bcnt1_i32_b32", [(set i32:$dst, (ctpop i32:$src0))] >; -def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>; +def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "s_bcnt1_i32_b64", []>; -////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "S_FF0_I32_B32", []>; -////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>; -def S_FF1_I32_B32 : SOP1_32 <0x00000013, "S_FF1_I32_B32", +////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "s_ff0_i32_b32", []>; +////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "s_ff0_i32_b64", []>; +def S_FF1_I32_B32 : SOP1_32 <0x00000013, "s_ff1_i32_b32", [(set i32:$dst, (cttz_zero_undef i32:$src0))] >; -////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>; +////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "s_ff1_i32_b64", []>; -def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", +def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "s_flbit_i32_b32", [(set i32:$dst, (ctlz_zero_undef i32:$src0))] >; -//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>; -def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>; -//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>; -def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", +//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "s_flbit_i32_b64", []>; +def S_FLBIT_I32 : SOP1_32 <0x00000017, "s_flbit_i32", []>; +//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "s_flbit_i32_i64", []>; +def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "s_sext_i32_i8", [(set i32:$dst, (sext_inreg i32:$src0, i8))] >; -def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", +def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "s_sext_i32_i16", [(set i32:$dst, (sext_inreg i32:$src0, i16))] >; -////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>; -////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>; -////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>; -////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>; -def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>; -def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>; -def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>; -def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>; +////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "s_bitset0_b32", []>; +////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "s_bitset0_b64", []>; +////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "s_bitset1_b32", []>; +////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "s_bitset1_b64", []>; +def S_GETPC_B64 : SOP1 < + 0x0000001f, (outs SReg_64:$dst), (ins), "s_getpc_b64 $dst", [] +> { + let SSRC0 = 0; +} +def S_SETPC_B64 : SOP1_64 <0x00000020, "s_setpc_b64", []>; +def S_SWAPPC_B64 : SOP1_64 <0x00000021, "s_swappc_b64", []>; +def S_RFE_B64 : SOP1_64 <0x00000022, "s_rfe_b64", []>; let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in { -def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>; -def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>; -def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>; -def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>; -def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>; -def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>; -def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", 
[]>; -def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>; +def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "s_and_saveexec_b64", []>; +def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "s_or_saveexec_b64", []>; +def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "s_xor_saveexec_b64", []>; +def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "s_andn2_saveexec_b64", []>; +def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "s_orn2_saveexec_b64", []>; +def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "s_nand_saveexec_b64", []>; +def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "s_nor_saveexec_b64", []>; +def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "s_xnor_saveexec_b64", []>; } // End hasSideEffects = 1 -def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>; -def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>; -def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>; -def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>; -def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>; -def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>; -//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>; -def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>; -def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>; -def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>; +def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "s_quadmask_b32", []>; +def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "s_quadmask_b64", []>; +def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "s_movrels_b32", []>; +def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "s_movrels_b64", []>; +def S_MOVRELD_B32 : SOP1_32 <0x00000030, "s_movreld_b32", []>; +def S_MOVRELD_B64 : SOP1_64 <0x00000031, "s_movreld_b64", []>; +//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "s_cbranch_join", []>; +def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "s_mov_regrd_b32", []>; +def S_ABS_I32 : SOP1_32 <0x00000034, "s_abs_i32", []>; +def S_MOV_FED_B32 : SOP1_32 <0x00000035, "s_mov_fed_b32", []>; //===----------------------------------------------------------------------===// // SOP2 Instructions @@ -174,145 +187,150 @@ def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>; let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { -def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; -def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", +def S_ADD_U32 : SOP2_32 <0x00000000, "s_add_u32", []>; +def S_ADD_I32 : SOP2_32 <0x00000002, "s_add_i32", [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] >; } // End isCommutable = 1 -def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; -def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", +def S_SUB_U32 : SOP2_32 <0x00000001, "s_sub_u32", []>; +def S_SUB_I32 : SOP2_32 <0x00000003, "s_sub_i32", [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] >; let Uses = [SCC] in { // Carry in comes from SCC let isCommutable = 1 in { -def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", +def S_ADDC_U32 : SOP2_32 <0x00000004, "s_addc_u32", [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End isCommutable = 1 -def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", +def S_SUBB_U32 : SOP2_32 <0x00000005, "s_subb_u32", [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End Uses = [SCC] } // End Defs = [SCC] -def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", +def S_MIN_I32 : SOP2_32 <0x00000006, "s_min_i32", [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))] >; -def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", +def S_MIN_U32 : SOP2_32 
<0x00000007, "s_min_u32", [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))] >; -def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", +def S_MAX_I32 : SOP2_32 <0x00000008, "s_max_i32", [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))] >; -def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", +def S_MAX_U32 : SOP2_32 <0x00000009, "s_max_u32", [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))] >; -def S_CSELECT_B32 : SOP2 < - 0x0000000a, (outs SReg_32:$dst), - (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", +def S_CSELECT_B32 : SOP2_SELECT_32 < + 0x0000000a, "s_cselect_b32", [] >; -def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; +def S_CSELECT_B64 : SOP2_64 <0x0000000b, "s_cselect_b64", []>; -def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", +def S_AND_B32 : SOP2_32 <0x0000000e, "s_and_b32", [(set i32:$dst, (and i32:$src0, i32:$src1))] >; -def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", +def S_AND_B64 : SOP2_64 <0x0000000f, "s_and_b64", [(set i64:$dst, (and i64:$src0, i64:$src1))] >; -def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", +def S_OR_B32 : SOP2_32 <0x00000010, "s_or_b32", [(set i32:$dst, (or i32:$src0, i32:$src1))] >; -def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", +def S_OR_B64 : SOP2_64 <0x00000011, "s_or_b64", [(set i64:$dst, (or i64:$src0, i64:$src1))] >; -def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", +def S_XOR_B32 : SOP2_32 <0x00000012, "s_xor_b32", [(set i32:$dst, (xor i32:$src0, i32:$src1))] >; -def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", +def S_XOR_B64 : SOP2_64 <0x00000013, "s_xor_b64", [(set i64:$dst, (xor i64:$src0, i64:$src1))] >; -def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; -def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>; -def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>; -def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>; -def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; -def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; -def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; -def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>; -def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; -def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; +def S_ANDN2_B32 : SOP2_32 <0x00000014, "s_andn2_b32", []>; +def S_ANDN2_B64 : SOP2_64 <0x00000015, "s_andn2_b64", []>; +def S_ORN2_B32 : SOP2_32 <0x00000016, "s_orn2_b32", []>; +def S_ORN2_B64 : SOP2_64 <0x00000017, "s_orn2_b64", []>; +def S_NAND_B32 : SOP2_32 <0x00000018, "s_nand_b32", []>; +def S_NAND_B64 : SOP2_64 <0x00000019, "s_nand_b64", []>; +def S_NOR_B32 : SOP2_32 <0x0000001a, "s_nor_b32", []>; +def S_NOR_B64 : SOP2_64 <0x0000001b, "s_nor_b64", []>; +def S_XNOR_B32 : SOP2_32 <0x0000001c, "s_xnor_b32", []>; +def S_XNOR_B64 : SOP2_64 <0x0000001d, "s_xnor_b64", []>; // Use added complexity so these patterns are preferred to the VALU patterns. 
let AddedComplexity = 1 in { -def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", +def S_LSHL_B32 : SOP2_32 <0x0000001e, "s_lshl_b32", [(set i32:$dst, (shl i32:$src0, i32:$src1))] >; -def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64", +def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "s_lshl_b64", [(set i64:$dst, (shl i64:$src0, i32:$src1))] >; -def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", +def S_LSHR_B32 : SOP2_32 <0x00000020, "s_lshr_b32", [(set i32:$dst, (srl i32:$src0, i32:$src1))] >; -def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64", +def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "s_lshr_b64", [(set i64:$dst, (srl i64:$src0, i32:$src1))] >; -def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", +def S_ASHR_I32 : SOP2_32 <0x00000022, "s_ashr_i32", [(set i32:$dst, (sra i32:$src0, i32:$src1))] >; -def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64", +def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "s_ashr_i64", [(set i64:$dst, (sra i64:$src0, i32:$src1))] >; + +def S_BFM_B32 : SOP2_32 <0x00000024, "s_bfm_b32", []>; +def S_BFM_B64 : SOP2_64 <0x00000025, "s_bfm_b64", []>; +def S_MUL_I32 : SOP2_32 <0x00000026, "s_mul_i32", + [(set i32:$dst, (mul i32:$src0, i32:$src1))] +>; + } // End AddedComplexity = 1 -def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; -def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; -def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; -def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; -def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; -def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; -def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; -//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; -def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; +def S_BFE_U32 : SOP2_32 <0x00000027, "s_bfe_u32", []>; +def S_BFE_I32 : SOP2_32 <0x00000028, "s_bfe_i32", []>; +def S_BFE_U64 : SOP2_64 <0x00000029, "s_bfe_u64", []>; +def S_BFE_I64 : SOP2_64_32 <0x0000002a, "s_bfe_i64", []>; +//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "s_cbranch_g_fork", []>; +def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "s_absdiff_i32", []>; //===----------------------------------------------------------------------===// // SOPC Instructions //===----------------------------------------------------------------------===// -def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32">; -def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32">; -def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32">; -def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32">; -def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32">; -def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32">; -def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32">; -def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32">; -def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32">; -def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32">; -def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32">; -def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32">; -////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>; -////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; -////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; -////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; -//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; +def S_CMP_GE_I32 : SOPC_32 
<0x00000003, "s_cmp_ge_i32">; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>; //===----------------------------------------------------------------------===// // SOPK Instructions //===----------------------------------------------------------------------===// -def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>; -def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>; +let isReMaterializable = 1 in { +def S_MOVK_I32 : SOPK_32 <0x00000000, "s_movk_i32", []>; +} // End isReMaterializable = 1 +def S_CMOVK_I32 : SOPK_32 <0x00000002, "s_cmovk_i32", []>; /* This instruction is disabled for now until we can figure out how to teach @@ -328,94 +346,87 @@ VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 def S_CMPK_EQ_I32 : SOPK < 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1), - "S_CMPK_EQ_I32", + "s_cmpk_eq_i32", [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] >; */ let isCompare = 1, Defs = [SCC] in { -def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>; -def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>; -def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>; -def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>; -def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>; -def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>; -def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>; -def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>; -def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>; -def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>; -def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>; +def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "s_cmpk_lg_i32", []>; +def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "s_cmpk_gt_i32", []>; +def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "s_cmpk_ge_i32", []>; +def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "s_cmpk_lt_i32", []>; +def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "s_cmpk_le_i32", []>; +def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "s_cmpk_eq_u32", []>; +def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "s_cmpk_lg_u32", []>; +def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "s_cmpk_gt_u32", []>; +def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "s_cmpk_ge_u32", []>; +def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "s_cmpk_lt_u32", []>; +def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "s_cmpk_le_u32", []>; } // End isCompare = 1, Defs = [SCC] let Defs = [SCC], isCommutable = 1 in { - def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>; - def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>; + def S_ADDK_I32 : SOPK_32 <0x0000000f, "s_addk_i32", []>; + def S_MULK_I32 : SOPK_32 <0x00000010, "s_mulk_i32", []>; } -//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>; -def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>; -def 
S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>; -def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>; -//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>; -//def EXP : EXP_ <0x00000000, "EXP", []>; - -} // End let OtherPredicates = [isCFDepth0] +//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "s_cbranch_i_fork", []>; +def S_GETREG_B32 : SOPK_32 <0x00000012, "s_getreg_b32", []>; +def S_SETREG_B32 : SOPK_32 <0x00000013, "s_setreg_b32", []>; +def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "s_getreg_regrd_b32", []>; +//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "s_setreg_imm32_b32", []>; +//def EXP : EXP_ <0x00000000, "exp", []>; //===----------------------------------------------------------------------===// // SOPP Instructions //===----------------------------------------------------------------------===// -def S_NOP : SOPP <0x00000000, (ins i16imm:$SIMM16), "S_NOP $SIMM16", []>; +def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; let isTerminator = 1 in { -def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", +def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", [(IL_retflag)]> { - let SIMM16 = 0; + let simm16 = 0; let isBarrier = 1; let hasCtrlDep = 1; } let isBranch = 1 in { def S_BRANCH : SOPP < - 0x00000002, (ins brtarget:$target), "S_BRANCH $target", - [(br bb:$target)]> { + 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", + [(br bb:$simm16)]> { let isBarrier = 1; } let DisableEncoding = "$scc" in { def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC0 $target", [] + 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), + "s_cbranch_scc0 $simm16" >; def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC1 $target", - [] + 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), + "s_cbranch_scc1 $simm16" >; } // End DisableEncoding = "$scc" def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCZ $target", - [] + 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + "s_cbranch_vccz $simm16" >; def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCNZ $target", - [] + 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + "s_cbranch_vccnz $simm16" >; let DisableEncoding = "$exec" in { def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECZ $target", - [] + 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), + "s_cbranch_execz $simm16" >; def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECNZ $target", - [] + 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), + "s_cbranch_execnz $simm16" >; } // End DisableEncoding = "$exec" @@ -424,37 +435,39 @@ def S_CBRANCH_EXECNZ : SOPP < } // End isTerminator = 1 let hasSideEffects = 1 in { -def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER", +def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", [(int_AMDGPU_barrier_local)] > { - let SIMM16 = 0; + let simm16 = 0; let isBarrier = 1; let hasCtrlDep = 1; let mayLoad = 1; let mayStore = 1; } -def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16", - [] ->; -//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; -//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; -//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; +def S_SETHALT : SOPP <0x0000000d, (ins 
i16imm:$simm16), "s_sethalt $simm16">; +def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">; +def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">; let Uses = [EXEC] in { - def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16", + def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "s_sendmsg $simm16", [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)] > { let DisableEncoding = "$m0"; } } // End Uses = [EXEC] -//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; -//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; -//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; -//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; -//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>; -//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; +def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; +def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; +def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { + let simm16 = 0; +} +def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">; +def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">; +def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { + let simm16 = 0; +} } // End hasSideEffects //===----------------------------------------------------------------------===// @@ -463,256 +476,256 @@ let Uses = [EXEC] in { let isCompare = 1 in { -defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">; -defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_OLT>; -defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_OEQ>; -defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_OLE>; -defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_OGT>; -defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32">; -defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_OGE>; -defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", f32, COND_O>; -defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", f32, COND_UO>; -defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32">; -defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32">; -defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32">; -defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32">; -defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>; -defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">; -defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">; +defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0>, "v_cmp_f_f32">; +defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1>, "v_cmp_lt_f32", COND_OLT>; +defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2>, "v_cmp_eq_f32", COND_OEQ>; +defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3>, "v_cmp_le_f32", COND_OLE>; +defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4>, "v_cmp_gt_f32", COND_OGT>; +defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5>, "v_cmp_lg_f32">; +defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6>, "v_cmp_ge_f32", COND_OGE>; +defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7>, "v_cmp_o_f32", COND_O>; +defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8>, "v_cmp_u_f32", COND_UO>; +defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9>, "v_cmp_nge_f32">; +defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa>, "v_cmp_nlg_f32">; +defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb>, "v_cmp_ngt_f32">; +defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc>, "v_cmp_nle_f32">; +defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd>, "v_cmp_neq_f32", COND_UNE>; +defm V_CMP_NLT_F32 : 
VOPC_F32 <vopc<0xe>, "v_cmp_nlt_f32">; +defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf>, "v_cmp_tru_f32">; let hasSideEffects = 1 in { -defm V_CMPX_F_F32 : VOPCX_32 <0x00000010, "V_CMPX_F_F32">; -defm V_CMPX_LT_F32 : VOPCX_32 <0x00000011, "V_CMPX_LT_F32">; -defm V_CMPX_EQ_F32 : VOPCX_32 <0x00000012, "V_CMPX_EQ_F32">; -defm V_CMPX_LE_F32 : VOPCX_32 <0x00000013, "V_CMPX_LE_F32">; -defm V_CMPX_GT_F32 : VOPCX_32 <0x00000014, "V_CMPX_GT_F32">; -defm V_CMPX_LG_F32 : VOPCX_32 <0x00000015, "V_CMPX_LG_F32">; -defm V_CMPX_GE_F32 : VOPCX_32 <0x00000016, "V_CMPX_GE_F32">; -defm V_CMPX_O_F32 : VOPCX_32 <0x00000017, "V_CMPX_O_F32">; -defm V_CMPX_U_F32 : VOPCX_32 <0x00000018, "V_CMPX_U_F32">; -defm V_CMPX_NGE_F32 : VOPCX_32 <0x00000019, "V_CMPX_NGE_F32">; -defm V_CMPX_NLG_F32 : VOPCX_32 <0x0000001a, "V_CMPX_NLG_F32">; -defm V_CMPX_NGT_F32 : VOPCX_32 <0x0000001b, "V_CMPX_NGT_F32">; -defm V_CMPX_NLE_F32 : VOPCX_32 <0x0000001c, "V_CMPX_NLE_F32">; -defm V_CMPX_NEQ_F32 : VOPCX_32 <0x0000001d, "V_CMPX_NEQ_F32">; -defm V_CMPX_NLT_F32 : VOPCX_32 <0x0000001e, "V_CMPX_NLT_F32">; -defm V_CMPX_TRU_F32 : VOPCX_32 <0x0000001f, "V_CMPX_TRU_F32">; +defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10>, "v_cmpx_f_f32">; +defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11>, "v_cmpx_lt_f32">; +defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12>, "v_cmpx_eq_f32">; +defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13>, "v_cmpx_le_f32">; +defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14>, "v_cmpx_gt_f32">; +defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15>, "v_cmpx_lg_f32">; +defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16>, "v_cmpx_ge_f32">; +defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17>, "v_cmpx_o_f32">; +defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18>, "v_cmpx_u_f32">; +defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19>, "v_cmpx_nge_f32">; +defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a>, "v_cmpx_nlg_f32">; +defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b>, "v_cmpx_ngt_f32">; +defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c>, "v_cmpx_nle_f32">; +defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d>, "v_cmpx_neq_f32">; +defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e>, "v_cmpx_nlt_f32">; +defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f>, "v_cmpx_tru_f32">; } // End hasSideEffects = 1 -defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">; -defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_OLT>; -defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", f64, COND_OEQ>; -defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", f64, COND_OLE>; -defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", f64, COND_OGT>; -defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64">; -defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", f64, COND_OGE>; -defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", f64, COND_O>; -defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", f64, COND_UO>; -defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64">; -defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64">; -defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64">; -defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64">; -defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>; -defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">; -defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">; +defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20>, "v_cmp_f_f64">; +defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21>, "v_cmp_lt_f64", COND_OLT>; +defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22>, "v_cmp_eq_f64", COND_OEQ>; +defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23>, "v_cmp_le_f64", COND_OLE>; +defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24>, 
"v_cmp_gt_f64", COND_OGT>; +defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25>, "v_cmp_lg_f64">; +defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26>, "v_cmp_ge_f64", COND_OGE>; +defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27>, "v_cmp_o_f64", COND_O>; +defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28>, "v_cmp_u_f64", COND_UO>; +defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29>, "v_cmp_nge_f64">; +defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a>, "v_cmp_nlg_f64">; +defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b>, "v_cmp_ngt_f64">; +defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c>, "v_cmp_nle_f64">; +defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d>, "v_cmp_neq_f64", COND_UNE>; +defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e>, "v_cmp_nlt_f64">; +defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f>, "v_cmp_tru_f64">; let hasSideEffects = 1 in { -defm V_CMPX_F_F64 : VOPCX_64 <0x00000030, "V_CMPX_F_F64">; -defm V_CMPX_LT_F64 : VOPCX_64 <0x00000031, "V_CMPX_LT_F64">; -defm V_CMPX_EQ_F64 : VOPCX_64 <0x00000032, "V_CMPX_EQ_F64">; -defm V_CMPX_LE_F64 : VOPCX_64 <0x00000033, "V_CMPX_LE_F64">; -defm V_CMPX_GT_F64 : VOPCX_64 <0x00000034, "V_CMPX_GT_F64">; -defm V_CMPX_LG_F64 : VOPCX_64 <0x00000035, "V_CMPX_LG_F64">; -defm V_CMPX_GE_F64 : VOPCX_64 <0x00000036, "V_CMPX_GE_F64">; -defm V_CMPX_O_F64 : VOPCX_64 <0x00000037, "V_CMPX_O_F64">; -defm V_CMPX_U_F64 : VOPCX_64 <0x00000038, "V_CMPX_U_F64">; -defm V_CMPX_NGE_F64 : VOPCX_64 <0x00000039, "V_CMPX_NGE_F64">; -defm V_CMPX_NLG_F64 : VOPCX_64 <0x0000003a, "V_CMPX_NLG_F64">; -defm V_CMPX_NGT_F64 : VOPCX_64 <0x0000003b, "V_CMPX_NGT_F64">; -defm V_CMPX_NLE_F64 : VOPCX_64 <0x0000003c, "V_CMPX_NLE_F64">; -defm V_CMPX_NEQ_F64 : VOPCX_64 <0x0000003d, "V_CMPX_NEQ_F64">; -defm V_CMPX_NLT_F64 : VOPCX_64 <0x0000003e, "V_CMPX_NLT_F64">; -defm V_CMPX_TRU_F64 : VOPCX_64 <0x0000003f, "V_CMPX_TRU_F64">; +defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30>, "v_cmpx_f_f64">; +defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31>, "v_cmpx_lt_f64">; +defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32>, "v_cmpx_eq_f64">; +defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33>, "v_cmpx_le_f64">; +defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34>, "v_cmpx_gt_f64">; +defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35>, "v_cmpx_lg_f64">; +defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36>, "v_cmpx_ge_f64">; +defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37>, "v_cmpx_o_f64">; +defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38>, "v_cmpx_u_f64">; +defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39>, "v_cmpx_nge_f64">; +defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a>, "v_cmpx_nlg_f64">; +defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b>, "v_cmpx_ngt_f64">; +defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c>, "v_cmpx_nle_f64">; +defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d>, "v_cmpx_neq_f64">; +defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e>, "v_cmpx_nlt_f64">; +defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f>, "v_cmpx_tru_f64">; } // End hasSideEffects = 1 -defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">; -defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">; -defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32">; -defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32">; -defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32">; -defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32">; -defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32">; -defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32">; -defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32">; -defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32">; -defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32">; -defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32">; -defm 
V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32">; -defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">; -defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">; -defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">; +defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">; +defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32">; +defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">; +defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32">; +defm V_CMPS_GT_F32 : VOPC_F32 <vopc<0x44>, "v_cmps_gt_f32">; +defm V_CMPS_LG_F32 : VOPC_F32 <vopc<0x45>, "v_cmps_lg_f32">; +defm V_CMPS_GE_F32 : VOPC_F32 <vopc<0x46>, "v_cmps_ge_f32">; +defm V_CMPS_O_F32 : VOPC_F32 <vopc<0x47>, "v_cmps_o_f32">; +defm V_CMPS_U_F32 : VOPC_F32 <vopc<0x48>, "v_cmps_u_f32">; +defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32">; +defm V_CMPS_NLG_F32 : VOPC_F32 <vopc<0x4a>, "v_cmps_nlg_f32">; +defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32">; +defm V_CMPS_NLE_F32 : VOPC_F32 <vopc<0x4c>, "v_cmps_nle_f32">; +defm V_CMPS_NEQ_F32 : VOPC_F32 <vopc<0x4d>, "v_cmps_neq_f32">; +defm V_CMPS_NLT_F32 : VOPC_F32 <vopc<0x4e>, "v_cmps_nlt_f32">; +defm V_CMPS_TRU_F32 : VOPC_F32 <vopc<0x4f>, "v_cmps_tru_f32">; let hasSideEffects = 1 in { -defm V_CMPSX_F_F32 : VOPCX_32 <0x00000050, "V_CMPSX_F_F32">; -defm V_CMPSX_LT_F32 : VOPCX_32 <0x00000051, "V_CMPSX_LT_F32">; -defm V_CMPSX_EQ_F32 : VOPCX_32 <0x00000052, "V_CMPSX_EQ_F32">; -defm V_CMPSX_LE_F32 : VOPCX_32 <0x00000053, "V_CMPSX_LE_F32">; -defm V_CMPSX_GT_F32 : VOPCX_32 <0x00000054, "V_CMPSX_GT_F32">; -defm V_CMPSX_LG_F32 : VOPCX_32 <0x00000055, "V_CMPSX_LG_F32">; -defm V_CMPSX_GE_F32 : VOPCX_32 <0x00000056, "V_CMPSX_GE_F32">; -defm V_CMPSX_O_F32 : VOPCX_32 <0x00000057, "V_CMPSX_O_F32">; -defm V_CMPSX_U_F32 : VOPCX_32 <0x00000058, "V_CMPSX_U_F32">; -defm V_CMPSX_NGE_F32 : VOPCX_32 <0x00000059, "V_CMPSX_NGE_F32">; -defm V_CMPSX_NLG_F32 : VOPCX_32 <0x0000005a, "V_CMPSX_NLG_F32">; -defm V_CMPSX_NGT_F32 : VOPCX_32 <0x0000005b, "V_CMPSX_NGT_F32">; -defm V_CMPSX_NLE_F32 : VOPCX_32 <0x0000005c, "V_CMPSX_NLE_F32">; -defm V_CMPSX_NEQ_F32 : VOPCX_32 <0x0000005d, "V_CMPSX_NEQ_F32">; -defm V_CMPSX_NLT_F32 : VOPCX_32 <0x0000005e, "V_CMPSX_NLT_F32">; -defm V_CMPSX_TRU_F32 : VOPCX_32 <0x0000005f, "V_CMPSX_TRU_F32">; +defm V_CMPSX_F_F32 : VOPCX_F32 <vopc<0x50>, "v_cmpsx_f_f32">; +defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32">; +defm V_CMPSX_EQ_F32 : VOPCX_F32 <vopc<0x52>, "v_cmpsx_eq_f32">; +defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32">; +defm V_CMPSX_GT_F32 : VOPCX_F32 <vopc<0x54>, "v_cmpsx_gt_f32">; +defm V_CMPSX_LG_F32 : VOPCX_F32 <vopc<0x55>, "v_cmpsx_lg_f32">; +defm V_CMPSX_GE_F32 : VOPCX_F32 <vopc<0x56>, "v_cmpsx_ge_f32">; +defm V_CMPSX_O_F32 : VOPCX_F32 <vopc<0x57>, "v_cmpsx_o_f32">; +defm V_CMPSX_U_F32 : VOPCX_F32 <vopc<0x58>, "v_cmpsx_u_f32">; +defm V_CMPSX_NGE_F32 : VOPCX_F32 <vopc<0x59>, "v_cmpsx_nge_f32">; +defm V_CMPSX_NLG_F32 : VOPCX_F32 <vopc<0x5a>, "v_cmpsx_nlg_f32">; +defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32">; +defm V_CMPSX_NLE_F32 : VOPCX_F32 <vopc<0x5c>, "v_cmpsx_nle_f32">; +defm V_CMPSX_NEQ_F32 : VOPCX_F32 <vopc<0x5d>, "v_cmpsx_neq_f32">; +defm V_CMPSX_NLT_F32 : VOPCX_F32 <vopc<0x5e>, "v_cmpsx_nlt_f32">; +defm V_CMPSX_TRU_F32 : VOPCX_F32 <vopc<0x5f>, "v_cmpsx_tru_f32">; } // End hasSideEffects = 1 -defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">; -defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">; -defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, 
"V_CMPS_EQ_F64">; -defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64">; -defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64">; -defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64">; -defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64">; -defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64">; -defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64">; -defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64">; -defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64">; -defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64">; -defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64">; -defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64">; -defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64">; -defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64">; +defm V_CMPS_F_F64 : VOPC_F64 <vopc<0x60>, "v_cmps_f_f64">; +defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64">; +defm V_CMPS_EQ_F64 : VOPC_F64 <vopc<0x62>, "v_cmps_eq_f64">; +defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64">; +defm V_CMPS_GT_F64 : VOPC_F64 <vopc<0x64>, "v_cmps_gt_f64">; +defm V_CMPS_LG_F64 : VOPC_F64 <vopc<0x65>, "v_cmps_lg_f64">; +defm V_CMPS_GE_F64 : VOPC_F64 <vopc<0x66>, "v_cmps_ge_f64">; +defm V_CMPS_O_F64 : VOPC_F64 <vopc<0x67>, "v_cmps_o_f64">; +defm V_CMPS_U_F64 : VOPC_F64 <vopc<0x68>, "v_cmps_u_f64">; +defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64">; +defm V_CMPS_NLG_F64 : VOPC_F64 <vopc<0x6a>, "v_cmps_nlg_f64">; +defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64">; +defm V_CMPS_NLE_F64 : VOPC_F64 <vopc<0x6c>, "v_cmps_nle_f64">; +defm V_CMPS_NEQ_F64 : VOPC_F64 <vopc<0x6d>, "v_cmps_neq_f64">; +defm V_CMPS_NLT_F64 : VOPC_F64 <vopc<0x6e>, "v_cmps_nlt_f64">; +defm V_CMPS_TRU_F64 : VOPC_F64 <vopc<0x6f>, "v_cmps_tru_f64">; let hasSideEffects = 1, Defs = [EXEC] in { -defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64">; -defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64">; -defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64">; -defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64">; -defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64">; -defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64">; -defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64">; -defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64">; -defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64">; -defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64">; -defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64">; -defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64">; -defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64">; -defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64">; -defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64">; -defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64">; +defm V_CMPSX_F_F64 : VOPC_F64 <vopc<0x70>, "v_cmpsx_f_f64">; +defm V_CMPSX_LT_F64 : VOPC_F64 <vopc<0x71>, "v_cmpsx_lt_f64">; +defm V_CMPSX_EQ_F64 : VOPC_F64 <vopc<0x72>, "v_cmpsx_eq_f64">; +defm V_CMPSX_LE_F64 : VOPC_F64 <vopc<0x73>, "v_cmpsx_le_f64">; +defm V_CMPSX_GT_F64 : VOPC_F64 <vopc<0x74>, "v_cmpsx_gt_f64">; +defm V_CMPSX_LG_F64 : VOPC_F64 <vopc<0x75>, "v_cmpsx_lg_f64">; +defm V_CMPSX_GE_F64 : VOPC_F64 <vopc<0x76>, "v_cmpsx_ge_f64">; +defm V_CMPSX_O_F64 : VOPC_F64 <vopc<0x77>, "v_cmpsx_o_f64">; +defm V_CMPSX_U_F64 : VOPC_F64 <vopc<0x78>, "v_cmpsx_u_f64">; +defm V_CMPSX_NGE_F64 : VOPC_F64 <vopc<0x79>, "v_cmpsx_nge_f64">; +defm V_CMPSX_NLG_F64 : VOPC_F64 
<vopc<0x7a>, "v_cmpsx_nlg_f64">; +defm V_CMPSX_NGT_F64 : VOPC_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64">; +defm V_CMPSX_NLE_F64 : VOPC_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">; +defm V_CMPSX_NEQ_F64 : VOPC_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">; +defm V_CMPSX_NLT_F64 : VOPC_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">; +defm V_CMPSX_TRU_F64 : VOPC_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">; } // End hasSideEffects = 1, Defs = [EXEC] -defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32">; -defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_SLT>; -defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", i32, COND_EQ>; -defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_SLE>; -defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_SGT>; -defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>; -defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>; -defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">; +defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80>, "v_cmp_f_i32">; +defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81>, "v_cmp_lt_i32", COND_SLT>; +defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82>, "v_cmp_eq_i32", COND_EQ>; +defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83>, "v_cmp_le_i32", COND_SLE>; +defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84>, "v_cmp_gt_i32", COND_SGT>; +defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85>, "v_cmp_ne_i32", COND_NE>; +defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86>, "v_cmp_ge_i32", COND_SGE>; +defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87>, "v_cmp_t_i32">; let hasSideEffects = 1 in { -defm V_CMPX_F_I32 : VOPCX_32 <0x00000090, "V_CMPX_F_I32">; -defm V_CMPX_LT_I32 : VOPCX_32 <0x00000091, "V_CMPX_LT_I32">; -defm V_CMPX_EQ_I32 : VOPCX_32 <0x00000092, "V_CMPX_EQ_I32">; -defm V_CMPX_LE_I32 : VOPCX_32 <0x00000093, "V_CMPX_LE_I32">; -defm V_CMPX_GT_I32 : VOPCX_32 <0x00000094, "V_CMPX_GT_I32">; -defm V_CMPX_NE_I32 : VOPCX_32 <0x00000095, "V_CMPX_NE_I32">; -defm V_CMPX_GE_I32 : VOPCX_32 <0x00000096, "V_CMPX_GE_I32">; -defm V_CMPX_T_I32 : VOPCX_32 <0x00000097, "V_CMPX_T_I32">; +defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90>, "v_cmpx_f_i32">; +defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91>, "v_cmpx_lt_i32">; +defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92>, "v_cmpx_eq_i32">; +defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93>, "v_cmpx_le_i32">; +defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94>, "v_cmpx_gt_i32">; +defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95>, "v_cmpx_ne_i32">; +defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96>, "v_cmpx_ge_i32">; +defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97>, "v_cmpx_t_i32">; } // End hasSideEffects = 1 -defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">; -defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>; -defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", i64, COND_EQ>; -defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", i64, COND_SLE>; -defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", i64, COND_SGT>; -defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>; -defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>; -defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">; +defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0>, "v_cmp_f_i64">; +defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1>, "v_cmp_lt_i64", COND_SLT>; +defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2>, "v_cmp_eq_i64", COND_EQ>; +defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3>, "v_cmp_le_i64", COND_SLE>; +defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4>, "v_cmp_gt_i64", COND_SGT>; +defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5>, "v_cmp_ne_i64", COND_NE>; +defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6>, 
"v_cmp_ge_i64", COND_SGE>; +defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7>, "v_cmp_t_i64">; let hasSideEffects = 1 in { -defm V_CMPX_F_I64 : VOPCX_64 <0x000000b0, "V_CMPX_F_I64">; -defm V_CMPX_LT_I64 : VOPCX_64 <0x000000b1, "V_CMPX_LT_I64">; -defm V_CMPX_EQ_I64 : VOPCX_64 <0x000000b2, "V_CMPX_EQ_I64">; -defm V_CMPX_LE_I64 : VOPCX_64 <0x000000b3, "V_CMPX_LE_I64">; -defm V_CMPX_GT_I64 : VOPCX_64 <0x000000b4, "V_CMPX_GT_I64">; -defm V_CMPX_NE_I64 : VOPCX_64 <0x000000b5, "V_CMPX_NE_I64">; -defm V_CMPX_GE_I64 : VOPCX_64 <0x000000b6, "V_CMPX_GE_I64">; -defm V_CMPX_T_I64 : VOPCX_64 <0x000000b7, "V_CMPX_T_I64">; +defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0>, "v_cmpx_f_i64">; +defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1>, "v_cmpx_lt_i64">; +defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2>, "v_cmpx_eq_i64">; +defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3>, "v_cmpx_le_i64">; +defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4>, "v_cmpx_gt_i64">; +defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5>, "v_cmpx_ne_i64">; +defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6>, "v_cmpx_ge_i64">; +defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7>, "v_cmpx_t_i64">; } // End hasSideEffects = 1 -defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">; -defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>; -defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", i32, COND_EQ>; -defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", i32, COND_ULE>; -defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", i32, COND_UGT>; -defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>; -defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>; -defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">; +defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0>, "v_cmp_f_u32">; +defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1>, "v_cmp_lt_u32", COND_ULT>; +defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2>, "v_cmp_eq_u32", COND_EQ>; +defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3>, "v_cmp_le_u32", COND_ULE>; +defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4>, "v_cmp_gt_u32", COND_UGT>; +defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5>, "v_cmp_ne_u32", COND_NE>; +defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6>, "v_cmp_ge_u32", COND_UGE>; +defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7>, "v_cmp_t_u32">; let hasSideEffects = 1 in { -defm V_CMPX_F_U32 : VOPCX_32 <0x000000d0, "V_CMPX_F_U32">; -defm V_CMPX_LT_U32 : VOPCX_32 <0x000000d1, "V_CMPX_LT_U32">; -defm V_CMPX_EQ_U32 : VOPCX_32 <0x000000d2, "V_CMPX_EQ_U32">; -defm V_CMPX_LE_U32 : VOPCX_32 <0x000000d3, "V_CMPX_LE_U32">; -defm V_CMPX_GT_U32 : VOPCX_32 <0x000000d4, "V_CMPX_GT_U32">; -defm V_CMPX_NE_U32 : VOPCX_32 <0x000000d5, "V_CMPX_NE_U32">; -defm V_CMPX_GE_U32 : VOPCX_32 <0x000000d6, "V_CMPX_GE_U32">; -defm V_CMPX_T_U32 : VOPCX_32 <0x000000d7, "V_CMPX_T_U32">; +defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0>, "v_cmpx_f_u32">; +defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1>, "v_cmpx_lt_u32">; +defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2>, "v_cmpx_eq_u32">; +defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3>, "v_cmpx_le_u32">; +defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4>, "v_cmpx_gt_u32">; +defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5>, "v_cmpx_ne_u32">; +defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6>, "v_cmpx_ge_u32">; +defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7>, "v_cmpx_t_u32">; } // End hasSideEffects = 1 -defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">; -defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>; -defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", i64, COND_EQ>; -defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", i64, COND_ULE>; -defm 
V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", i64, COND_UGT>; -defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>; -defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>; -defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">; +defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0>, "v_cmp_f_u64">; +defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1>, "v_cmp_lt_u64", COND_ULT>; +defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2>, "v_cmp_eq_u64", COND_EQ>; +defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3>, "v_cmp_le_u64", COND_ULE>; +defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4>, "v_cmp_gt_u64", COND_UGT>; +defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5>, "v_cmp_ne_u64", COND_NE>; +defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6>, "v_cmp_ge_u64", COND_UGE>; +defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7>, "v_cmp_t_u64">; let hasSideEffects = 1 in { -defm V_CMPX_F_U64 : VOPCX_64 <0x000000f0, "V_CMPX_F_U64">; -defm V_CMPX_LT_U64 : VOPCX_64 <0x000000f1, "V_CMPX_LT_U64">; -defm V_CMPX_EQ_U64 : VOPCX_64 <0x000000f2, "V_CMPX_EQ_U64">; -defm V_CMPX_LE_U64 : VOPCX_64 <0x000000f3, "V_CMPX_LE_U64">; -defm V_CMPX_GT_U64 : VOPCX_64 <0x000000f4, "V_CMPX_GT_U64">; -defm V_CMPX_NE_U64 : VOPCX_64 <0x000000f5, "V_CMPX_NE_U64">; -defm V_CMPX_GE_U64 : VOPCX_64 <0x000000f6, "V_CMPX_GE_U64">; -defm V_CMPX_T_U64 : VOPCX_64 <0x000000f7, "V_CMPX_T_U64">; +defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0>, "v_cmpx_f_u64">; +defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1>, "v_cmpx_lt_u64">; +defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2>, "v_cmpx_eq_u64">; +defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3>, "v_cmpx_le_u64">; +defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4>, "v_cmpx_gt_u64">; +defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5>, "v_cmpx_ne_u64">; +defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6>, "v_cmpx_ge_u64">; +defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7>, "v_cmpx_t_u64">; } // End hasSideEffects = 1 -defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">; +defm V_CMP_CLASS_F32 : VOPC_F32 <vopc<0x88>, "v_cmp_class_f32">; let hasSideEffects = 1 in { -defm V_CMPX_CLASS_F32 : VOPCX_32 <0x00000098, "V_CMPX_CLASS_F32">; +defm V_CMPX_CLASS_F32 : VOPCX_F32 <vopc<0x98>, "v_cmpx_class_f32">; } // End hasSideEffects = 1 -defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">; +defm V_CMP_CLASS_F64 : VOPC_F64 <vopc<0xa8>, "v_cmp_class_f64">; let hasSideEffects = 1 in { -defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">; +defm V_CMPX_CLASS_F64 : VOPCX_F64 <vopc<0xb8>, "v_cmpx_class_f64">; } // End hasSideEffects = 1 } // End isCompare = 1 @@ -722,88 +735,88 @@ defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">; //===----------------------------------------------------------------------===// -def DS_ADD_U32 : DS_1A1D_NORET <0x0, "DS_ADD_U32", VReg_32>; -def DS_SUB_U32 : DS_1A1D_NORET <0x1, "DS_SUB_U32", VReg_32>; -def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "DS_RSUB_U32", VReg_32>; -def DS_INC_U32 : DS_1A1D_NORET <0x3, "DS_INC_U32", VReg_32>; -def DS_DEC_U32 : DS_1A1D_NORET <0x4, "DS_DEC_U32", VReg_32>; -def DS_MIN_I32 : DS_1A1D_NORET <0x5, "DS_MIN_I32", VReg_32>; -def DS_MAX_I32 : DS_1A1D_NORET <0x6, "DS_MAX_I32", VReg_32>; -def DS_MIN_U32 : DS_1A1D_NORET <0x7, "DS_MIN_U32", VReg_32>; -def DS_MAX_U32 : DS_1A1D_NORET <0x8, "DS_MAX_U32", VReg_32>; -def DS_AND_B32 : DS_1A1D_NORET <0x9, "DS_AND_B32", VReg_32>; -def DS_OR_B32 : DS_1A1D_NORET <0xa, "DS_OR_B32", VReg_32>; -def DS_XOR_B32 : DS_1A1D_NORET <0xb, "DS_XOR_B32", VReg_32>; -def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "DS_MSKOR_B32", VReg_32>; -def DS_CMPST_B32 : DS_1A2D_NORET <0x10, 
"DS_CMPST_B32", VReg_32>; -def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "DS_CMPST_F32", VReg_32>; -def DS_MIN_F32 : DS_1A1D_NORET <0x12, "DS_MIN_F32", VReg_32>; -def DS_MAX_F32 : DS_1A1D_NORET <0x13, "DS_MAX_F32", VReg_32>; - -def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "DS_ADD_RTN_U32", VReg_32>; -def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "DS_SUB_RTN_U32", VReg_32>; -def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "DS_RSUB_RTN_U32", VReg_32>; -def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "DS_INC_RTN_U32", VReg_32>; -def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "DS_DEC_RTN_U32", VReg_32>; -def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "DS_MIN_RTN_I32", VReg_32>; -def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "DS_MAX_RTN_I32", VReg_32>; -def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "DS_MIN_RTN_U32", VReg_32>; -def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "DS_MAX_RTN_U32", VReg_32>; -def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "DS_AND_RTN_B32", VReg_32>; -def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "DS_OR_RTN_B32", VReg_32>; -def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "DS_XOR_RTN_B32", VReg_32>; -def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "DS_MSKOR_RTN_B32", VReg_32>; -def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "DS_WRXCHG_RTN_B32", VReg_32>; -//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "DS_WRXCHG2_RTN_B32", VReg_32>; -//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "DS_WRXCHG2_RTN_B32", VReg_32>; -def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "DS_CMPST_RTN_B32", VReg_32>; -def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "DS_CMPST_RTN_F32", VReg_32>; -def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "DS_MIN_RTN_F32", VReg_32>; -def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "DS_MAX_RTN_F32", VReg_32>; +def DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VReg_32>; +def DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VReg_32>; +def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VReg_32>; +def DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VReg_32>; +def DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VReg_32>; +def DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VReg_32>; +def DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VReg_32>; +def DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VReg_32>; +def DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VReg_32>; +def DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VReg_32>; +def DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VReg_32>; +def DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VReg_32>; +def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VReg_32>; +def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VReg_32>; +def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VReg_32>; +def DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VReg_32>; +def DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VReg_32>; + +def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VReg_32, "ds_add_u32">; +def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VReg_32, "ds_sub_u32">; +def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VReg_32, "ds_rsub_u32">; +def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VReg_32, "ds_inc_u32">; +def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VReg_32, "ds_dec_u32">; +def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VReg_32, "ds_min_i32">; +def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VReg_32, "ds_max_i32">; +def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VReg_32, "ds_min_u32">; +def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VReg_32, "ds_max_u32">; +def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VReg_32, "ds_and_b32">; +def 
DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VReg_32, "ds_or_b32">; +def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VReg_32, "ds_xor_b32">; +def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VReg_32, "ds_mskor_b32">; +def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VReg_32>; +//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VReg_32, "ds_wrxchg2_b32">; +//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VReg_32, "ds_wrxchg2st64_b32">; +def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VReg_32, "ds_cmpst_b32">; +def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VReg_32, "ds_cmpst_f32">; +def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VReg_32, "ds_min_f32">; +def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VReg_32, "ds_max_f32">; let SubtargetPredicate = isCI in { -def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "DS_WRAP_RTN_F32", VReg_32>; +def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VReg_32, "ds_wrap_f32">; } // End isCI -def DS_ADD_U64 : DS_1A1D_NORET <0x40, "DS_ADD_U64", VReg_32>; -def DS_SUB_U64 : DS_1A1D_NORET <0x41, "DS_SUB_U64", VReg_32>; -def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "DS_RSUB_U64", VReg_32>; -def DS_INC_U64 : DS_1A1D_NORET <0x43, "DS_INC_U64", VReg_32>; -def DS_DEC_U64 : DS_1A1D_NORET <0x44, "DS_DEC_U64", VReg_32>; -def DS_MIN_I64 : DS_1A1D_NORET <0x45, "DS_MIN_I64", VReg_64>; -def DS_MAX_I64 : DS_1A1D_NORET <0x46, "DS_MAX_I64", VReg_64>; -def DS_MIN_U64 : DS_1A1D_NORET <0x47, "DS_MIN_U64", VReg_64>; -def DS_MAX_U64 : DS_1A1D_NORET <0x48, "DS_MAX_U64", VReg_64>; -def DS_AND_B64 : DS_1A1D_NORET <0x49, "DS_AND_B64", VReg_64>; -def DS_OR_B64 : DS_1A1D_NORET <0x4a, "DS_OR_B64", VReg_64>; -def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "DS_XOR_B64", VReg_64>; -def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "DS_MSKOR_B64", VReg_64>; -def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "DS_CMPST_B64", VReg_64>; -def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "DS_CMPST_F64", VReg_64>; -def DS_MIN_F64 : DS_1A1D_NORET <0x52, "DS_MIN_F64", VReg_64>; -def DS_MAX_F64 : DS_1A1D_NORET <0x53, "DS_MAX_F64", VReg_64>; - -def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "DS_ADD_RTN_U64", VReg_64>; -def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "DS_SUB_RTN_U64", VReg_64>; -def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "DS_RSUB_RTN_U64", VReg_64>; -def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "DS_INC_RTN_U64", VReg_64>; -def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "DS_DEC_RTN_U64", VReg_64>; -def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "DS_MIN_RTN_I64", VReg_64>; -def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "DS_MAX_RTN_I64", VReg_64>; -def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "DS_MIN_RTN_U64", VReg_64>; -def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "DS_MAX_RTN_U64", VReg_64>; -def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "DS_AND_RTN_B64", VReg_64>; -def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "DS_OR_RTN_B64", VReg_64>; -def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "DS_XOR_RTN_B64", VReg_64>; -def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "DS_MSKOR_RTN_B64", VReg_64>; -def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "DS_WRXCHG_RTN_B64", VReg_64>; -//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "DS_WRXCHG2_RTN_B64", VReg_64>; -//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "DS_WRXCHG2_RTN_B64", VReg_64>; -def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "DS_CMPST_RTN_B64", VReg_64>; -def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "DS_CMPST_RTN_F64", VReg_64>; -def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "DS_MIN_F64", VReg_64>; -def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, 
"DS_MAX_F64", VReg_64>; +def DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; +def DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; +def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; +def DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; +def DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; +def DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; +def DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; +def DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; +def DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; +def DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; +def DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; +def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; +def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>; +def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; +def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; +def DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; +def DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; + +def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; +def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; +def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; +def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; +def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; +def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; +def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; +def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; +def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; +def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; +def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; +def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; +def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; +def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; +//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2_b64">; +//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2st64_b64">; +def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; +def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; +def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_f64", VReg_64, "ds_min_f64">; +def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_f64", VReg_64, "ds_max_f64">; //let SubtargetPredicate = isCI in { // DS_CONDXCHG32_RTN_B64 @@ -812,240 +825,336 @@ def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_F64", VReg_64>; // TODO: _SRC2_* forms -def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>; -def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>; -def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>; -def DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "DS_WRITE_B64", VReg_64>; +def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VReg_32>; +def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "ds_write_b8", VReg_32>; +def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "ds_write_b16", VReg_32>; +def DS_WRITE_B64 : 
DS_Store_Helper <0x00000004d, "ds_write_b64", VReg_64>; -def DS_READ_B32 : DS_Load_Helper <0x00000036, "DS_READ_B32", VReg_32>; -def DS_READ_I8 : DS_Load_Helper <0x00000039, "DS_READ_I8", VReg_32>; -def DS_READ_U8 : DS_Load_Helper <0x0000003a, "DS_READ_U8", VReg_32>; -def DS_READ_I16 : DS_Load_Helper <0x0000003b, "DS_READ_I16", VReg_32>; -def DS_READ_U16 : DS_Load_Helper <0x0000003c, "DS_READ_U16", VReg_32>; -def DS_READ_B64 : DS_Load_Helper <0x00000076, "DS_READ_B64", VReg_64>; +def DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VReg_32>; +def DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VReg_32>; +def DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VReg_32>; +def DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VReg_32>; +def DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VReg_32>; +def DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>; // 2 forms. -def DS_WRITE2_B32 : DS_Load2_Helper <0x0000000E, "DS_WRITE2_B32", VReg_64>; -def DS_WRITE2_B64 : DS_Load2_Helper <0x0000004E, "DS_WRITE2_B64", VReg_128>; +def DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VReg_32>; +def DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VReg_32>; +def DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>; +def DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>; -def DS_READ2_B32 : DS_Load2_Helper <0x00000037, "DS_READ2_B32", VReg_64>; -def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "DS_READ2_B64", VReg_128>; - -// TODO: DS_READ2ST64_B32, DS_READ2ST64_B64, -// DS_WRITE2ST64_B32, DS_WRITE2ST64_B64 +def DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>; +def DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>; +def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "ds_read2_b64", VReg_128>; +def DS_READ2ST64_B64 : DS_Load2_Helper <0x00000076, "ds_read2st64_b64", VReg_128>; //===----------------------------------------------------------------------===// // MUBUF Instructions //===----------------------------------------------------------------------===// -//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>; -//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>; -//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>; -defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>; -//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>; -//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>; -//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>; -//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>; +//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "buffer_load_format_x", []>; +//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "buffer_load_format_xy", []>; +//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "buffer_load_format_xyz", []>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "buffer_load_format_xyzw", VReg_128>; +//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "buffer_store_format_x", []>; +//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "buffer_store_format_xy", []>; +//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "buffer_store_format_xyz", []>; +//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "buffer_store_format_xyzw", []>; defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < 
- 0x00000008, "BUFFER_LOAD_UBYTE", VReg_32, i32, az_extloadi8_global + 0x00000008, "buffer_load_ubyte", VReg_32, i32, az_extloadi8_global >; defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < - 0x00000009, "BUFFER_LOAD_SBYTE", VReg_32, i32, sextloadi8_global + 0x00000009, "buffer_load_sbyte", VReg_32, i32, sextloadi8_global >; defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < - 0x0000000a, "BUFFER_LOAD_USHORT", VReg_32, i32, az_extloadi16_global + 0x0000000a, "buffer_load_ushort", VReg_32, i32, az_extloadi16_global >; defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < - 0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32, i32, sextloadi16_global + 0x0000000b, "buffer_load_sshort", VReg_32, i32, sextloadi16_global >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - 0x0000000c, "BUFFER_LOAD_DWORD", VReg_32, i32, global_load + 0x0000000c, "buffer_load_dword", VReg_32, i32, global_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - 0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64, v2i32, global_load + 0x0000000d, "buffer_load_dwordx2", VReg_64, v2i32, global_load >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - 0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load ->; - -def BUFFER_STORE_BYTE : MUBUF_Store_Helper < - 0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global ->; - -def BUFFER_STORE_SHORT : MUBUF_Store_Helper < - 0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global ->; - -def BUFFER_STORE_DWORD : MUBUF_Store_Helper < - 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store ->; - -def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < - 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store ->; - -def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < - 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store ->; -//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; -//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>; -//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>; -//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>; -//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>; -//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>; -//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>; -//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>; -//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>; -//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>; -//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>; -//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>; -//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>; -//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>; -//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>; -//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>; -//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>; -//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>; -//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>; -//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>; -//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>; -//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>; -//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>; -//def 
BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>; -//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>; -//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>; -//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>; -//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>; -//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>; -//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>; -//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>; -//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>; -//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>; -//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>; -//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>; -//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>; + 0x0000000e, "buffer_load_dwordx4", VReg_128, v4i32, global_load +>; + +defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < + 0x00000018, "buffer_store_byte", VReg_32, i32, truncstorei8_global +>; + +defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < + 0x0000001a, "buffer_store_short", VReg_32, i32, truncstorei16_global +>; + +defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < + 0x0000001c, "buffer_store_dword", VReg_32, i32, global_store +>; + +defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < + 0x0000001d, "buffer_store_dwordx2", VReg_64, v2i32, global_store +>; + +defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < + 0x0000001e, "buffer_store_dwordx4", VReg_128, v4i32, global_store +>; +//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "buffer_atomic_swap", []>; +defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < + 0x00000030, "buffer_atomic_swap", VReg_32, i32, atomic_swap_global +>; +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "buffer_atomic_cmpswap", []>; +defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < + 0x00000032, "buffer_atomic_add", VReg_32, i32, atomic_add_global +>; +defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < + 0x00000033, "buffer_atomic_sub", VReg_32, i32, atomic_sub_global +>; +//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "buffer_atomic_rsub", []>; +defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < + 0x00000035, "buffer_atomic_smin", VReg_32, i32, atomic_min_global +>; +defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < + 0x00000036, "buffer_atomic_umin", VReg_32, i32, atomic_umin_global +>; +defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < + 0x00000037, "buffer_atomic_smax", VReg_32, i32, atomic_max_global +>; +defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < + 0x00000038, "buffer_atomic_umax", VReg_32, i32, atomic_umax_global +>; +defm BUFFER_ATOMIC_AND : MUBUF_Atomic < + 0x00000039, "buffer_atomic_and", VReg_32, i32, atomic_and_global +>; +defm BUFFER_ATOMIC_OR : MUBUF_Atomic < + 0x0000003a, "buffer_atomic_or", VReg_32, i32, atomic_or_global +>; +defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < + 0x0000003b, "buffer_atomic_xor", VReg_32, i32, atomic_xor_global +>; +//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "buffer_atomic_inc", []>; +//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "buffer_atomic_dec", []>; +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "buffer_atomic_fcmpswap", []>; +//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "buffer_atomic_fmin", []>; +//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "buffer_atomic_fmax", []>; +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "buffer_atomic_swap_x2", 
[]>; +//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "buffer_atomic_cmpswap_x2", []>; +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "buffer_atomic_add_x2", []>; +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "buffer_atomic_sub_x2", []>; +//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "buffer_atomic_rsub_x2", []>; +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "buffer_atomic_smin_x2", []>; +//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "buffer_atomic_umin_x2", []>; +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "buffer_atomic_smax_x2", []>; +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "buffer_atomic_umax_x2", []>; +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "buffer_atomic_and_x2", []>; +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "buffer_atomic_or_x2", []>; +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "buffer_atomic_xor_x2", []>; +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "buffer_atomic_inc_x2", []>; +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "buffer_atomic_dec_x2", []>; +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "buffer_atomic_fcmpswap_x2", []>; +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "buffer_atomic_fmin_x2", []>; +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "buffer_atomic_fmax_x2", []>; +//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "buffer_wbinvl1_sc", []>; +//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "buffer_wbinvl1", []>; //===----------------------------------------------------------------------===// // MTBUF Instructions //===----------------------------------------------------------------------===// -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>; -def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>; -def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "TBUFFER_STORE_FORMAT_X", VReg_32>; -def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FORMAT_XY", VReg_64>; -def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>; -def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>; +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>; +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>; +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VReg_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>; //===----------------------------------------------------------------------===// // MIMG Instructions //===----------------------------------------------------------------------===// -defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">; -defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">; -//def IMAGE_LOAD_PCK : MIMG_NoPattern_ 
<"IMAGE_LOAD_PCK", 0x00000002>; -//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>; -//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>; -//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>; -//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>; -//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>; -//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>; -//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>; -defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "IMAGE_GET_RESINFO">; -//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>; -//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>; -//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>; -//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>; -//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>; -//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>; -//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>; -//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>; -//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>; -//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>; -//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>; -//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>; -//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>; -//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>; -//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>; -//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>; -//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>; -defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "IMAGE_SAMPLE">; -//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>; -defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "IMAGE_SAMPLE_D">; -//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "IMAGE_SAMPLE_L">; -defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "IMAGE_SAMPLE_B">; -//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>; -//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>; -defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "IMAGE_SAMPLE_C">; -//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "IMAGE_SAMPLE_C_D">; -//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "IMAGE_SAMPLE_C_L">; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">; -//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>; -//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>; -//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>; -//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>; -//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>; -//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>; -//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>; -//def 
IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>; -//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>; -//def IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>; -//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>; -//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>; -//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>; -//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>; -//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>; -//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>; -//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>; -//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>; -defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "IMAGE_GATHER4">; -defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "IMAGE_GATHER4_CL">; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "IMAGE_GATHER4_L">; -defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "IMAGE_GATHER4_B">; -defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "IMAGE_GATHER4_B_CL">; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "IMAGE_GATHER4_LZ">; -defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "IMAGE_GATHER4_C">; -defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "IMAGE_GATHER4_C_CL">; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "IMAGE_GATHER4_C_L">; -defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "IMAGE_GATHER4_C_B">; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "IMAGE_GATHER4_C_B_CL">; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "IMAGE_GATHER4_C_LZ">; -defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "IMAGE_GATHER4_O">; -defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "IMAGE_GATHER4_CL_O">; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "IMAGE_GATHER4_L_O">; -defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "IMAGE_GATHER4_B_O">; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "IMAGE_GATHER4_B_CL_O">; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "IMAGE_GATHER4_LZ_O">; -defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "IMAGE_GATHER4_C_O">; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "IMAGE_GATHER4_C_CL_O">; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "IMAGE_GATHER4_C_L_O">; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "IMAGE_GATHER4_C_B_O">; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "IMAGE_GATHER4_C_B_CL_O">; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "IMAGE_GATHER4_C_LZ_O">; -defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "IMAGE_GET_LOD">; -//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>; -//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>; -//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>; -//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>; -//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>; -//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>; -//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>; -//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>; -//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>; -//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>; +defm IMAGE_LOAD : MIMG_NoSampler 
<0x00000000, "image_load">; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; +//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; +//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; +//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; +defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; +//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>; +//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>; +//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>; +//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; +//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>; +//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>; +//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>; +//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>; +//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>; +//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>; +//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>; +//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>; +//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>; +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; +defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "image_sample">; +defm IMAGE_SAMPLE_CL : MIMG_Sampler <0x00000021, "image_sample_cl">; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; +defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "image_sample_b">; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler <0x00000026, "image_sample_b_cl">; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; +defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "image_sample_c">; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler <0x00000029, "image_sample_c_cl">; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "image_sample_c_b">; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler <0x0000002e, "image_sample_c_b_cl">; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; +defm IMAGE_SAMPLE_O : MIMG_Sampler <0x00000030, "image_sample_o">; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler <0x00000031, "image_sample_cl_o">; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; 
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler <0x00000035, "image_sample_b_o">;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler <0x00000036, "image_sample_b_cl_o">;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler <0x00000038, "image_sample_c_o">;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler <0x00000039, "image_sample_c_cl_o">;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler <0x0000003d, "image_sample_c_b_o">;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler <0x0000003e, "image_sample_c_b_cl_o">;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
+defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "image_gather4">;
+defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "image_gather4_cl">;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">;
+defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "image_gather4_b">;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "image_gather4_b_cl">;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">;
+defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "image_gather4_c">;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "image_gather4_c_cl">;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
+defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "image_gather4_c_b">;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "image_gather4_c_b_cl">;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
+defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "image_gather4_o">;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "image_gather4_cl_o">;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">;
+defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "image_gather4_b_o">;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
+defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "image_gather4_c_o">;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "image_gather4_c_cl_o">;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "image_gather4_c_b_o">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "image_gather4_c_b_cl_o">;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
+defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "image_get_lod">;
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">;
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
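The hunk above finishes converting the MIMG sample and gather opcodes from commented-out placeholders into real MIMG_Sampler/MIMG_Gather multiclass instantiations, with every mnemonic lowercased to match the rest of this patch. As a rough mental model of how a single defm line fans out into several records, here is a minimal, self-contained TableGen sketch; the class and field names are invented for illustration, and the real multiclasses (defined elsewhere in the backend, not in this diff) carry full operand lists and encodings and also parameterize the result width, which is why the patterns later in this file refer to names such as IMAGE_SAMPLE_V4_V1.

// Illustrative only: a multiclass that emits one record per address width,
// similar in spirit to MIMG_Sampler above.
class ExampleMIMG <bits<7> op, string asm, int addrDwords> {
  bits<7> Opcode = op;          // MIMG opcode field
  string AsmName = asm;         // lowercase mnemonic, e.g. "image_sample"
  int AddrDwords = addrDwords;  // how many VGPRs hold the address
}

multiclass ExampleSampler <bits<7> op, string asm> {
  def _V1  : ExampleMIMG <op, asm, 1>;
  def _V2  : ExampleMIMG <op, asm, 2>;
  def _V4  : ExampleMIMG <op, asm, 4>;
  def _V8  : ExampleMIMG <op, asm, 8>;
  def _V16 : ExampleMIMG <op, asm, 16>;
}

// One defm line yields five concrete records, EXAMPLE_SAMPLE_V1 .. _V16.
defm EXAMPLE_SAMPLE : ExampleSampler <0x20, "image_sample">;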
//===----------------------------------------------------------------------===//
+// Flat Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasFlatAddressSpace] in {
+def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "flat_load_ubyte", VReg_32>;
+def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "flat_load_sbyte", VReg_32>;
+def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "flat_load_ushort", VReg_32>;
+def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "flat_load_sshort", VReg_32>;
+def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "flat_load_dword", VReg_32>;
+def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "flat_load_dwordx2", VReg_64>;
+def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "flat_load_dwordx4", VReg_128>;
+def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "flat_load_dwordx3", VReg_96>;
+
+def FLAT_STORE_BYTE : FLAT_Store_Helper <
+  0x00000018, "flat_store_byte", VReg_32
+>;
+
+def FLAT_STORE_SHORT : FLAT_Store_Helper <
+  0x0000001a, "flat_store_short", VReg_32
+>;
+
+def FLAT_STORE_DWORD : FLAT_Store_Helper <
+  0x0000001c, "flat_store_dword", VReg_32
+>;
+
+def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
+  0x0000001d, "flat_store_dwordx2", VReg_64
+>;
+
+def FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
+  0x0000001e, "flat_store_dwordx4", VReg_128
+>;
+
+def FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
+  0x0000001e, "flat_store_dwordx3", VReg_96
+>;
+
+//def FLAT_ATOMIC_SWAP : FLAT_ <0x00000030, "flat_atomic_swap", []>;
+//def FLAT_ATOMIC_CMPSWAP : FLAT_ <0x00000031, "flat_atomic_cmpswap", []>;
+//def FLAT_ATOMIC_ADD : FLAT_ <0x00000032, "flat_atomic_add", []>;
+//def FLAT_ATOMIC_SUB : FLAT_ <0x00000033, "flat_atomic_sub", []>;
+//def FLAT_ATOMIC_RSUB : FLAT_ <0x00000034, "flat_atomic_rsub", []>;
+//def FLAT_ATOMIC_SMIN : FLAT_ <0x00000035, "flat_atomic_smin", []>;
+//def FLAT_ATOMIC_UMIN : FLAT_ <0x00000036, "flat_atomic_umin", []>;
+//def FLAT_ATOMIC_SMAX : FLAT_ <0x00000037, "flat_atomic_smax", []>;
+//def FLAT_ATOMIC_UMAX : FLAT_ <0x00000038, "flat_atomic_umax", []>;
+//def FLAT_ATOMIC_AND : FLAT_ <0x00000039, "flat_atomic_and", []>;
+//def FLAT_ATOMIC_OR : FLAT_ <0x0000003a, "flat_atomic_or", []>;
+//def FLAT_ATOMIC_XOR : FLAT_ <0x0000003b, "flat_atomic_xor", []>;
+//def FLAT_ATOMIC_INC : FLAT_ <0x0000003c, "flat_atomic_inc", []>;
+//def FLAT_ATOMIC_DEC : FLAT_ <0x0000003d, "flat_atomic_dec", []>;
+//def FLAT_ATOMIC_FCMPSWAP : FLAT_ <0x0000003e, "flat_atomic_fcmpswap", []>;
+//def FLAT_ATOMIC_FMIN : FLAT_ <0x0000003f, "flat_atomic_fmin", []>;
+//def FLAT_ATOMIC_FMAX : FLAT_ <0x00000040, "flat_atomic_fmax", []>;
+//def FLAT_ATOMIC_SWAP_X2 : FLAT_X2 <0x00000050, "flat_atomic_swap_x2", []>;
+//def FLAT_ATOMIC_CMPSWAP_X2 : FLAT_X2 <0x00000051, "flat_atomic_cmpswap_x2", []>;
+//def FLAT_ATOMIC_ADD_X2 : FLAT_X2 <0x00000052, "flat_atomic_add_x2", []>;
+//def FLAT_ATOMIC_SUB_X2 : FLAT_X2 <0x00000053, "flat_atomic_sub_x2", []>;
+//def FLAT_ATOMIC_RSUB_X2 : FLAT_X2 <0x00000054, "flat_atomic_rsub_x2", []>;
+//def FLAT_ATOMIC_SMIN_X2 : FLAT_X2 <0x00000055, "flat_atomic_smin_x2", []>;
+//def FLAT_ATOMIC_UMIN_X2 : FLAT_X2 <0x00000056, "flat_atomic_umin_x2", []>;
+//def FLAT_ATOMIC_SMAX_X2 : FLAT_X2 <0x00000057, "flat_atomic_smax_x2", []>;
+//def FLAT_ATOMIC_UMAX_X2 : FLAT_X2 <0x00000058, "flat_atomic_umax_x2", []>;
+//def FLAT_ATOMIC_AND_X2 : FLAT_X2 <0x00000059, "flat_atomic_and_x2", []>;
+//def FLAT_ATOMIC_OR_X2 : FLAT_X2 <0x0000005a, "flat_atomic_or_x2", []>;
+//def FLAT_ATOMIC_XOR_X2 : FLAT_X2 
<0x0000005b, "flat_atomic_xor_x2", []>; +//def FLAT_ATOMIC_INC_X2 : FLAT_X2 <0x0000005c, "flat_atomic_inc_x2", []>; +//def FLAT_ATOMIC_DEC_X2 : FLAT_X2 <0x0000005d, "flat_atomic_dec_x2", []>; +//def FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_X2 <0x0000005e, "flat_atomic_fcmpswap_x2", []>; +//def FLAT_ATOMIC_FMIN_X2 : FLAT_X2 <0x0000005f, "flat_atomic_fmin_x2", []>; +//def FLAT_ATOMIC_FMAX_X2 : FLAT_X2 <0x00000060, "flat_atomic_fmax_x2", []>; + +} // End HasFlatAddressSpace predicate +//===----------------------------------------------------------------------===// // VOP1 Instructions //===----------------------------------------------------------------------===// -//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; +//def V_NOP : VOP1_ <0x00000000, "v_nop", []>; -let neverHasSideEffects = 1, isMoveImm = 1 in { -defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>; -} // End neverHasSideEffects = 1, isMoveImm = 1 +let isMoveImm = 1 in { +defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>; +} // End isMoveImm = 1 let Uses = [EXEC] in { @@ -1053,136 +1162,139 @@ def V_READFIRSTLANE_B32 : VOP1 < 0x00000002, (outs SReg_32:$vdst), (ins VReg_32:$src0), - "V_READFIRSTLANE_B32 $vdst, $src0", + "v_readfirstlane_b32 $vdst, $src0", [] >; } -defm V_CVT_I32_F64 : VOP1_32_64 <0x00000003, "V_CVT_I32_F64", - [(set i32:$dst, (fp_to_sint f64:$src0))] +defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64", + VOP_I32_F64, fp_to_sint >; -defm V_CVT_F64_I32 : VOP1_64_32 <0x00000004, "V_CVT_F64_I32", - [(set f64:$dst, (sint_to_fp i32:$src0))] +defm V_CVT_F64_I32 : VOP1Inst <vop1<0x4>, "v_cvt_f64_i32", + VOP_F64_I32, sint_to_fp >; -defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", - [(set f32:$dst, (sint_to_fp i32:$src0))] +defm V_CVT_F32_I32 : VOP1Inst <vop1<0x5>, "v_cvt_f32_i32", + VOP_F32_I32, sint_to_fp >; -defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", - [(set f32:$dst, (uint_to_fp i32:$src0))] +defm V_CVT_F32_U32 : VOP1Inst <vop1<0x6>, "v_cvt_f32_u32", + VOP_F32_I32, uint_to_fp >; -defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", - [(set i32:$dst, (fp_to_uint f32:$src0))] +defm V_CVT_U32_F32 : VOP1Inst <vop1<0x7>, "v_cvt_u32_f32", + VOP_I32_F32, fp_to_uint >; -defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", - [(set i32:$dst, (fp_to_sint f32:$src0))] +defm V_CVT_I32_F32 : VOP1Inst <vop1<0x8>, "v_cvt_i32_f32", + VOP_I32_F32, fp_to_sint >; -defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; -defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32", - [(set i32:$dst, (f32_to_f16 f32:$src0))] +defm V_MOV_FED_B32 : VOP1Inst <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>; +defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32", + VOP_I32_F32, fp_to_f16 >; -defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", - [(set f32:$dst, (f16_to_f32 i32:$src0))] +defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16", + VOP_F32_I32, f16_to_fp >; -//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>; -//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>; -//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>; -defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64", - [(set f32:$dst, (fround f64:$src0))] +//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "v_cvt_rpi_i32_f32", []>; +//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "v_cvt_flr_i32_f32", []>; +//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "v_cvt_off_f32_i4", []>; +defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64", + VOP_F32_F64, fround 
>; -defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32", - [(set f64:$dst, (fextend f32:$src0))] +defm V_CVT_F64_F32 : VOP1Inst <vop1<0x10>, "v_cvt_f64_f32", + VOP_F64_F32, fextend >; -defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", - [(set f32:$dst, (AMDGPUcvt_f32_ubyte0 i32:$src0))] +defm V_CVT_F32_UBYTE0 : VOP1Inst <vop1<0x11>, "v_cvt_f32_ubyte0", + VOP_F32_I32, AMDGPUcvt_f32_ubyte0 >; -defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", - [(set f32:$dst, (AMDGPUcvt_f32_ubyte1 i32:$src0))] +defm V_CVT_F32_UBYTE1 : VOP1Inst <vop1<0x12>, "v_cvt_f32_ubyte1", + VOP_F32_I32, AMDGPUcvt_f32_ubyte1 >; -defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", - [(set f32:$dst, (AMDGPUcvt_f32_ubyte2 i32:$src0))] +defm V_CVT_F32_UBYTE2 : VOP1Inst <vop1<0x13>, "v_cvt_f32_ubyte2", + VOP_F32_I32, AMDGPUcvt_f32_ubyte2 >; -defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", - [(set f32:$dst, (AMDGPUcvt_f32_ubyte3 i32:$src0))] +defm V_CVT_F32_UBYTE3 : VOP1Inst <vop1<0x14>, "v_cvt_f32_ubyte3", + VOP_F32_I32, AMDGPUcvt_f32_ubyte3 >; -defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64", - [(set i32:$dst, (fp_to_uint f64:$src0))] +defm V_CVT_U32_F64 : VOP1Inst <vop1<0x15>, "v_cvt_u32_f64", + VOP_I32_F64, fp_to_uint >; -defm V_CVT_F64_U32 : VOP1_64_32 <0x00000016, "V_CVT_F64_U32", - [(set f64:$dst, (uint_to_fp i32:$src0))] +defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32", + VOP_F64_I32, uint_to_fp >; -defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", - [(set f32:$dst, (AMDGPUfract f32:$src0))] +defm V_FRACT_F32 : VOP1Inst <vop1<0x20>, "v_fract_f32", + VOP_F32_F32, AMDGPUfract >; -defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", - [(set f32:$dst, (ftrunc f32:$src0))] +defm V_TRUNC_F32 : VOP1Inst <vop1<0x21>, "v_trunc_f32", + VOP_F32_F32, ftrunc >; -defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", - [(set f32:$dst, (fceil f32:$src0))] +defm V_CEIL_F32 : VOP1Inst <vop1<0x22>, "v_ceil_f32", + VOP_F32_F32, fceil >; -defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", - [(set f32:$dst, (frint f32:$src0))] +defm V_RNDNE_F32 : VOP1Inst <vop1<0x23>, "v_rndne_f32", + VOP_F32_F32, frint >; -defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", - [(set f32:$dst, (ffloor f32:$src0))] +defm V_FLOOR_F32 : VOP1Inst <vop1<0x24>, "v_floor_f32", + VOP_F32_F32, ffloor >; -defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", - [(set f32:$dst, (fexp2 f32:$src0))] +defm V_EXP_F32 : VOP1Inst <vop1<0x25>, "v_exp_f32", + VOP_F32_F32, fexp2 >; -defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; -defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", - [(set f32:$dst, (flog2 f32:$src0))] +defm V_LOG_CLAMP_F32 : VOP1Inst <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>; +defm V_LOG_F32 : VOP1Inst <vop1<0x27>, "v_log_f32", + VOP_F32_F32, flog2 >; -defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; -defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; -defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", - [(set f32:$dst, (AMDGPUrcp f32:$src0))] +defm V_RCP_CLAMP_F32 : VOP1Inst <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>; +defm V_RCP_LEGACY_F32 : VOP1Inst <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>; +defm V_RCP_F32 : VOP1Inst <vop1<0x2a>, "v_rcp_f32", + VOP_F32_F32, AMDGPUrcp +>; +defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b>, "v_rcp_iflag_f32", VOP_F32_F32>; +defm V_RSQ_CLAMP_F32 : VOP1Inst <vop1<0x2c>, "v_rsq_clamp_f32", + VOP_F32_F32, AMDGPUrsq_clamped +>; +defm V_RSQ_LEGACY_F32 : VOP1Inst 
<vop1<0x2d>, "v_rsq_legacy_f32", + VOP_F32_F32, AMDGPUrsq_legacy >; -defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; -defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", - [(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))] +defm V_RSQ_F32 : VOP1Inst <vop1<0x2e>, "v_rsq_f32", + VOP_F32_F32, AMDGPUrsq >; -defm V_RSQ_LEGACY_F32 : VOP1_32 < - 0x0000002d, "V_RSQ_LEGACY_F32", - [(set f32:$dst, (AMDGPUrsq_legacy f32:$src0))] +defm V_RCP_F64 : VOP1Inst <vop1<0x2f>, "v_rcp_f64", + VOP_F64_F64, AMDGPUrcp >; -defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", - [(set f32:$dst, (AMDGPUrsq f32:$src0))] +defm V_RCP_CLAMP_F64 : VOP1Inst <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>; +defm V_RSQ_F64 : VOP1Inst <vop1<0x31>, "v_rsq_f64", + VOP_F64_F64, AMDGPUrsq >; -defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", - [(set f64:$dst, (AMDGPUrcp f64:$src0))] +defm V_RSQ_CLAMP_F64 : VOP1Inst <vop1<0x32>, "v_rsq_clamp_f64", + VOP_F64_F64, AMDGPUrsq_clamped >; -defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; -defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", - [(set f64:$dst, (AMDGPUrsq f64:$src0))] +defm V_SQRT_F32 : VOP1Inst <vop1<0x33>, "v_sqrt_f32", + VOP_F32_F32, fsqrt >; -defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", - [(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))] +defm V_SQRT_F64 : VOP1Inst <vop1<0x34>, "v_sqrt_f64", + VOP_F64_F64, fsqrt >; -defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", - [(set f32:$dst, (fsqrt f32:$src0))] +defm V_SIN_F32 : VOP1Inst <vop1<0x35>, "v_sin_f32", + VOP_F32_F32, AMDGPUsin >; -defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", - [(set f64:$dst, (fsqrt f64:$src0))] +defm V_COS_F32 : VOP1Inst <vop1<0x36>, "v_cos_f32", + VOP_F32_F32, AMDGPUcos >; -defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>; -defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>; -defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>; -defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>; -defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>; -defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>; -defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>; -//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>; -defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>; -defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>; -//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>; -defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>; -//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>; -defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>; -defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>; -defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>; +defm V_NOT_B32 : VOP1Inst <vop1<0x37>, "v_not_b32", VOP_I32_I32>; +defm V_BFREV_B32 : VOP1Inst <vop1<0x38>, "v_bfrev_b32", VOP_I32_I32>; +defm V_FFBH_U32 : VOP1Inst <vop1<0x39>, "v_ffbh_u32", VOP_I32_I32>; +defm V_FFBL_B32 : VOP1Inst <vop1<0x3a>, "v_ffbl_b32", VOP_I32_I32>; +defm V_FFBH_I32 : VOP1Inst <vop1<0x3b>, "v_ffbh_i32", VOP_I32_I32>; +//defm V_FREXP_EXP_I32_F64 : VOPInst <0x0000003c, "v_frexp_exp_i32_f64", VOP_I32_F32>; +defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d>, "v_frexp_mant_f64", VOP_F64_F64>; +defm V_FRACT_F64 : VOP1Inst <vop1<0x3e>, "v_fract_f64", VOP_F64_F64>; +//defm V_FREXP_EXP_I32_F32 : VOPInst <0x0000003f, "v_frexp_exp_i32_f32", VOP_I32_F32>; +defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40>, "v_frexp_mant_f32", VOP_F32_F32>; +//def V_CLREXCP 
: VOP1_ <0x00000041, "v_clrexcp", []>; +defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42>, "v_movreld_b32", VOP_I32_I32>; +defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43>, "v_movrels_b32", VOP_I32_I32>; +defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44>, "v_movrelsd_b32", VOP_I32_I32>; //===----------------------------------------------------------------------===// @@ -1193,7 +1305,7 @@ def V_INTERP_P1_F32 : VINTRP < 0x00000000, (outs VReg_32:$dst), (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_P1_F32 $dst, $i, $attr_chan, $attr, [$m0]", + "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [$m0]", []> { let DisableEncoding = "$m0"; } @@ -1202,7 +1314,7 @@ def V_INTERP_P2_F32 : VINTRP < 0x00000001, (outs VReg_32:$dst), (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_P2_F32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]", + "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]", []> { let Constraints = "$src0 = $dst"; @@ -1214,7 +1326,7 @@ def V_INTERP_MOV_F32 : VINTRP < 0x00000002, (outs VReg_32:$dst), (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr, [$m0]", + "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [$m0]", []> { let DisableEncoding = "$m0"; } @@ -1225,16 +1337,15 @@ def V_INTERP_MOV_F32 : VINTRP < def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), - "V_CNDMASK_B32_e32 $dst, $src0, $src1, [$vcc]", + "v_cndmask_b32_e32 $dst, $src0, $src1, [$vcc]", [] >{ let DisableEncoding = "$vcc"; } def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), - (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2, - InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), - "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", + (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2), + "v_cndmask_b32_e64 $dst, $src0, $src1, $src2", [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))] > { let src0_modifiers = 0; @@ -1246,7 +1357,7 @@ def V_READLANE_B32 : VOP2 < 0x00000001, (outs SReg_32:$vdst), (ins VReg_32:$src0, SSrc_32:$vsrc1), - "V_READLANE_B32 $vdst, $src0, $vsrc1", + "v_readlane_b32 $vdst, $src0, $vsrc1", [] >; @@ -1254,245 +1365,320 @@ def V_WRITELANE_B32 : VOP2 < 0x00000002, (outs VReg_32:$vdst), (ins SReg_32:$src0, SSrc_32:$vsrc1), - "V_WRITELANE_B32 $vdst, $src0, $vsrc1", + "v_writelane_b32 $vdst, $src0, $vsrc1", [] >; let isCommutable = 1 in { -defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", - [(set f32:$dst, (fadd f32:$src0, f32:$src1))] +defm V_ADD_F32 : VOP2Inst <vop2<0x3>, "v_add_f32", + VOP_F32_F32_F32, fadd >; -defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", - [(set f32:$dst, (fsub f32:$src0, f32:$src1))] +defm V_SUB_F32 : VOP2Inst <vop2<0x4>, "v_sub_f32", VOP_F32_F32_F32, fsub>; +defm V_SUBREV_F32 : VOP2Inst <vop2<0x5>, "v_subrev_f32", + VOP_F32_F32_F32, null_frag, "v_sub_f32" >; -defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", [], "V_SUB_F32">; } // End isCommutable = 1 -defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>; - let isCommutable = 1 in { -defm V_MUL_LEGACY_F32 : VOP2_32 < - 0x00000007, "V_MUL_LEGACY_F32", - [(set f32:$dst, (int_AMDGPU_mul f32:$src0, f32:$src1))] +defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32", + VOP_F32_F32_F32 >; -defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", - [(set f32:$dst, (fmul f32:$src0, f32:$src1))] +defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7>, "v_mul_legacy_f32", + VOP_F32_F32_F32, 
int_AMDGPU_mul >; +defm V_MUL_F32 : VOP2Inst <vop2<0x8>, "v_mul_f32", + VOP_F32_F32_F32, fmul +>; -defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", - [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))] +defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9>, "v_mul_i32_i24", + VOP_I32_I32_I32, AMDGPUmul_i24 >; -//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; -defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", - [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))] +//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "v_mul_hi_i32_i24", []>; +defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb>, "v_mul_u32_u24", + VOP_I32_I32_I32, AMDGPUmul_u24 >; -//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>; +//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "v_mul_hi_u32_u24", []>; -defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32", - [(set f32:$dst, (AMDGPUfmin f32:$src0, f32:$src1))] +defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmin_legacy >; -defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", - [(set f32:$dst, (AMDGPUfmax f32:$src0, f32:$src1))] +defm V_MAX_LEGACY_F32 : VOP2Inst <vop2<0xe>, "v_max_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmax_legacy >; -defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; -defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; -defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", - [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]>; -defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", - [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]>; -defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", - [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]>; -defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", - [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]>; +defm V_MIN_F32 : VOP2Inst <vop2<0xf>, "v_min_f32", VOP_F32_F32_F32, fminnum>; +defm V_MAX_F32 : VOP2Inst <vop2<0x10>, "v_max_f32", VOP_F32_F32_F32, fmaxnum>; +defm V_MIN_I32 : VOP2Inst <vop2<0x11>, "v_min_i32", VOP_I32_I32_I32, AMDGPUsmin>; +defm V_MAX_I32 : VOP2Inst <vop2<0x12>, "v_max_i32", VOP_I32_I32_I32, AMDGPUsmax>; +defm V_MIN_U32 : VOP2Inst <vop2<0x13>, "v_min_u32", VOP_I32_I32_I32, AMDGPUumin>; +defm V_MAX_U32 : VOP2Inst <vop2<0x14>, "v_max_u32", VOP_I32_I32_I32, AMDGPUumax>; -defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", - [(set i32:$dst, (srl i32:$src0, i32:$src1))] ->; +defm V_LSHR_B32 : VOP2Inst <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32, srl>; -defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">; +defm V_LSHRREV_B32 : VOP2Inst < + vop2<0x16>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32" +>; -defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", - [(set i32:$dst, (sra i32:$src0, i32:$src1))] +defm V_ASHR_I32 : VOP2Inst <vop2<0x17>, "v_ashr_i32", + VOP_I32_I32_I32, sra +>; +defm V_ASHRREV_I32 : VOP2Inst < + vop2<0x18>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32" >; -defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">; let hasPostISelHook = 1 in { -defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", - [(set i32:$dst, (shl i32:$src0, i32:$src1))] ->; +defm V_LSHL_B32 : VOP2Inst <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32, shl>; } -defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">; +defm V_LSHLREV_B32 : VOP2Inst < + vop2<0x1a>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32" +>; -defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", - [(set i32:$dst, (and i32:$src0, i32:$src1))]>; -defm 
V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", - [(set i32:$dst, (or i32:$src0, i32:$src1))] +defm V_AND_B32 : VOP2Inst <vop2<0x1b>, "v_and_b32", + VOP_I32_I32_I32, and>; +defm V_OR_B32 : VOP2Inst <vop2<0x1c>, "v_or_b32", + VOP_I32_I32_I32, or >; -defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", - [(set i32:$dst, (xor i32:$src0, i32:$src1))] +defm V_XOR_B32 : VOP2Inst <vop2<0x1d>, "v_xor_b32", + VOP_I32_I32_I32, xor >; } // End isCommutable = 1 -defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", - [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; -defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>; -defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>; -defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>; -defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>; -defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>; -defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>; +defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32", + VOP_I32_I32_I32, AMDGPUbfm>; + +let isCommutable = 1 in { +defm V_MAC_F32 : VOP2Inst <vop2<0x1f>, "v_mac_f32", VOP_F32_F32_F32>; +} // End isCommutable = 1 + +defm V_MADMK_F32 : VOP2Inst <vop2<0x20>, "v_madmk_f32", VOP_F32_F32_F32>; + +let isCommutable = 1 in { +defm V_MADAK_F32 : VOP2Inst <vop2<0x21>, "v_madak_f32", VOP_F32_F32_F32>; +} // End isCommutable = 1 + + +defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>; +defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32", + + VOP_I32_I32_I32 +>; +defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32", + VOP_I32_I32_I32 +>; let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. 
-defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", - [(set i32:$dst, (add i32:$src0, i32:$src1))], VSrc_32>; -defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", - [(set i32:$dst, (sub i32:$src0, i32:$src1))], VSrc_32>; -defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], VSrc_32, - "V_SUB_I32">; +defm V_ADD_I32 : VOP2bInst <vop2<0x25>, "v_add_i32", + VOP_I32_I32_I32, add +>; +defm V_SUB_I32 : VOP2bInst <vop2<0x26>, "v_sub_i32", + VOP_I32_I32_I32, sub +>; +defm V_SUBREV_I32 : VOP2bInst <vop2<0x27>, "v_subrev_i32", + VOP_I32_I32_I32, null_frag, "v_sub_i32" +>; let Uses = [VCC] in { // Carry-in comes from VCC -defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", - [(set i32:$dst, (adde i32:$src0, i32:$src1))], VReg_32>; -defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", - [(set i32:$dst, (sube i32:$src0, i32:$src1))], VReg_32>; -defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], VReg_32, - "V_SUBB_U32">; +defm V_ADDC_U32 : VOP2bInst <vop2<0x28>, "v_addc_u32", + VOP_I32_I32_I32_VCC, adde +>; +defm V_SUBB_U32 : VOP2bInst <vop2<0x29>, "v_subb_u32", + VOP_I32_I32_I32_VCC, sube +>; +defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a>, "v_subbrev_u32", + VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" +>; + } // End Uses = [VCC] } // End isCommutable = 1, Defs = [VCC] -defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>; -////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>; -////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>; -////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>; -defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", - [(set i32:$dst, (int_SI_packf16 f32:$src0, f32:$src1))] +defm V_LDEXP_F32 : VOP2Inst <vop2<0x2b>, "v_ldexp_f32", + VOP_F32_F32_I32, AMDGPUldexp +>; +////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "v_cvt_pkaccum_u8_f32", []>; +////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "v_cvt_pknorm_i16_f32", []>; +////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "v_cvt_pknorm_u16_f32", []>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <vop2<0x2f>, "v_cvt_pkrtz_f16_f32", + VOP_I32_F32_F32, int_SI_packf16 >; -////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; -////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; +////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "v_cvt_pk_u16_u32", []>; +////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "v_cvt_pk_i16_i32", []>; //===----------------------------------------------------------------------===// // VOP3 Instructions //===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in { +let isCommutable = 1 in { +defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140>, "v_mad_legacy_f32", + VOP_F32_F32_F32_F32 +>; -defm V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; -defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", - [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] +defm V_MAD_F32 : VOP3Inst <vop3<0x141>, "v_mad_f32", + VOP_F32_F32_F32_F32, fmad >; -defm V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", - [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))] + +defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142>, "v_mad_i32_i24", + VOP_I32_I32_I32_I32, AMDGPUmad_i24 >; -defm V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", - [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))] +defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143>, "v_mad_u32_u24", + VOP_I32_I32_I32_I32, 
AMDGPUmad_u24 >; +} // End isCommutable = 1 -} // End neverHasSideEffects +defm V_CUBEID_F32 : VOP3Inst <vop3<0x144>, "v_cubeid_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBESC_F32 : VOP3Inst <vop3<0x145>, "v_cubesc_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBETC_F32 : VOP3Inst <vop3<0x146>, "v_cubetc_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147>, "v_cubema_f32", + VOP_F32_F32_F32_F32 +>; +defm V_BFE_U32 : VOP3Inst <vop3<0x148>, "v_bfe_u32", + VOP_I32_I32_I32_I32, AMDGPUbfe_u32 +>; +defm V_BFE_I32 : VOP3Inst <vop3<0x149>, "v_bfe_i32", + VOP_I32_I32_I32_I32, AMDGPUbfe_i32 +>; +defm V_BFI_B32 : VOP3Inst <vop3<0x14a>, "v_bfi_b32", + VOP_I32_I32_I32_I32, AMDGPUbfi +>; -defm V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; -defm V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; -defm V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>; -defm V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; +let isCommutable = 1 in { +defm V_FMA_F32 : VOP3Inst <vop3<0x14b>, "v_fma_f32", + VOP_F32_F32_F32_F32, fma +>; +defm V_FMA_F64 : VOP3Inst <vop3<0x14c>, "v_fma_f64", + VOP_F64_F64_F64_F64, fma +>; +} // End isCommutable = 1 -let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in { -defm V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", - [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>; -defm V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", - [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>; -} +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; +defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "v_alignbit_b32", + VOP_I32_I32_I32_I32 +>; +defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f>, "v_alignbyte_b32", + VOP_I32_I32_I32_I32 +>; +defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32", + VOP_F32_F32_F32_F32>; +defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "v_min3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmin3>; -defm V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", - [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))]>; -defm V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", - [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))] ->; -def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", - [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] ->; -//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; -defm V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; - -defm V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; -defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; -////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>; -////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>; -////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>; -////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>; -////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>; -////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>; -////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>; -////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>; -////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>; -//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>; -//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>; -//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; -defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; -////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; -defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", - [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))] ->; -def V_DIV_FIXUP_F64 : VOP3_64 
<0x00000160, "V_DIV_FIXUP_F64", - [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))] ->; - -def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64", - [(set i64:$dst, (shl i64:$src0, i32:$src1))] +defm V_MIN3_I32 : VOP3Inst <vop3<0x152>, "v_min3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmin3 >; -def V_LSHR_B64 : VOP3_64_32 <0x00000162, "V_LSHR_B64", - [(set i64:$dst, (srl i64:$src0, i32:$src1))] +defm V_MIN3_U32 : VOP3Inst <vop3<0x153>, "v_min3_u32", + VOP_I32_I32_I32_I32, AMDGPUumin3 >; -def V_ASHR_I64 : VOP3_64_32 <0x00000163, "V_ASHR_I64", - [(set i64:$dst, (sra i64:$src0, i32:$src1))] +defm V_MAX3_F32 : VOP3Inst <vop3<0x154>, "v_max3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmax3 +>; +defm V_MAX3_I32 : VOP3Inst <vop3<0x155>, "v_max3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmax3 +>; +defm V_MAX3_U32 : VOP3Inst <vop3<0x156>, "v_max3_u32", + VOP_I32_I32_I32_I32, AMDGPUumax3 +>; +//def V_MED3_F32 : VOP3_MED3 <0x00000157, "v_med3_f32", []>; +//def V_MED3_I32 : VOP3_MED3 <0x00000158, "v_med3_i32", []>; +//def V_MED3_U32 : VOP3_MED3 <0x00000159, "v_med3_u32", []>; +//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; +defm V_SAD_U32 : VOP3Inst <vop3<0x15d>, "v_sad_u32", + VOP_I32_I32_I32_I32 +>; +////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; +defm V_DIV_FIXUP_F32 : VOP3Inst < + vop3<0x15f>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup +>; +defm V_DIV_FIXUP_F64 : VOP3Inst < + vop3<0x160>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup +>; + +defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", + VOP_I64_I64_I32, shl +>; +defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", + VOP_I64_I64_I32, srl +>; +defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", + VOP_I64_I64_I32, sra >; let isCommutable = 1 in { -def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>; -def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>; -def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>; -def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>; +defm V_ADD_F64 : VOP3Inst <vop3<0x164>, "v_add_f64", + VOP_F64_F64_F64, fadd +>; +defm V_MUL_F64 : VOP3Inst <vop3<0x165>, "v_mul_f64", + VOP_F64_F64_F64, fmul +>; + +defm V_MIN_F64 : VOP3Inst <vop3<0x166>, "v_min_f64", + VOP_F64_F64_F64, fminnum +>; +defm V_MAX_F64 : VOP3Inst <vop3<0x167>, "v_max_f64", + VOP_F64_F64_F64, fmaxnum +>; } // isCommutable = 1 -def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; +defm V_LDEXP_F64 : VOP3Inst <vop3<0x168>, "v_ldexp_f64", + VOP_F64_F64_I32, AMDGPUldexp +>; let isCommutable = 1 in { -defm V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; -defm V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; -defm V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; -defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; +defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169>, "v_mul_lo_u32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a>, "v_mul_hi_u32", + VOP_I32_I32_I32 +>; +defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b>, "v_mul_lo_i32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c>, "v_mul_hi_i32", + VOP_I32_I32_I32 +>; } // isCommutable = 1 -def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>; +defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d>, "v_div_scale_f32", []>; // Double precision division pre-scale. 
-def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>; +defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "v_div_scale_f64", []>; -defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", - [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))] +let isCommutable = 1 in { +defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "v_div_fmas_f32", + VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; -def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", - [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))] +defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "v_div_fmas_f64", + VOP_F64_F64_F64_F64, AMDGPUdiv_fmas >; -//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; -//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; -//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; -def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64", - [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))] +} // End isCommutable = 1 + +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; +//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; + +defm V_TRIG_PREOP_F64 : VOP3Inst < + vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop >; //===----------------------------------------------------------------------===// @@ -1517,6 +1703,15 @@ def V_OR_I1 : InstSI < [(set i1:$dst, (or i1:$src0, i1:$src1))] >; +def V_XOR_I1 : InstSI < + (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", + [(set i1:$dst, (xor i1:$src0, i1:$src1))] +>; + +let hasSideEffects = 1 in { +def SGPR_USE : InstSI <(outs),(ins), "", []>; +} + // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. @@ -1544,7 +1739,7 @@ def SI_ELSE : InstSI < def SI_LOOP : InstSI < (outs), (ins SReg_64:$saved, brtarget:$target), - "SI_LOOP $saved, $target", + "si_loop $saved, $target", [(int_SI_loop i64:$saved, bb:$target)] >; @@ -1553,35 +1748,35 @@ def SI_LOOP : InstSI < def SI_BREAK : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src), - "SI_ELSE $dst, $src", + "si_else $dst, $src", [(set i64:$dst, (int_SI_break i64:$src))] >; def SI_IF_BREAK : InstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), - "SI_IF_BREAK $dst, $vcc, $src", + "si_if_break $dst, $vcc, $src", [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))] >; def SI_ELSE_BREAK : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), - "SI_ELSE_BREAK $dst, $src0, $src1", + "si_else_break $dst, $src0, $src1", [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))] >; def SI_END_CF : InstSI < (outs), (ins SReg_64:$saved), - "SI_END_CF $saved", + "si_end_cf $saved", [(int_SI_end_cf i64:$saved)] >; def SI_KILL : InstSI < (outs), (ins VSrc_32:$src), - "SI_KILL $src", + "si_kill $src", [(int_AMDGPU_kill f32:$src)] >; @@ -1623,14 +1818,14 @@ def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; def SI_INDIRECT_SRC : InstSI < (outs VReg_32:$dst, SReg_64:$temp), (ins unknown:$src, VSrc_32:$idx, i32imm:$off), - "SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off", + "si_indirect_src $dst, $temp, $src, $idx, $off", [] >; class SI_INDIRECT_DST<RegisterClass rc> : InstSI < (outs rc:$dst, SReg_64:$temp), (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val), - "SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val", + "si_indirect_dst $dst, $temp, $src, $idx, $off, $val", [] > { let Constraints = "$src = $dst"; @@ -1646,18 +1841,10 @@ def SI_INDIRECT_DST_V16 : 
SI_INDIRECT_DST<VReg_512>; let usesCustomInserter = 1 in { -// This pseudo instruction takes a pointer as input and outputs a resource -// constant that can be used with the ADDR64 MUBUF instructions. -def SI_ADDR64_RSRC : InstSI < - (outs SReg_128:$srsrc), - (ins SSrc_64:$ptr), - "", [] ->; - def V_SUB_F64 : InstSI < (outs VReg_64:$dst), (ins VReg_64:$src0, VReg_64:$src1), - "V_SUB_F64 $dst, $src0, $src1", + "v_sub_f64 $dst, $src0, $src1", [(set f64:$dst, (fsub f64:$src0, f64:$src1))] >; @@ -1666,14 +1853,14 @@ def V_SUB_F64 : InstSI < multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { def _SAVE : InstSI < - (outs VReg_32:$dst), + (outs), (ins sgpr_class:$src, i32imm:$frame_idx), "", [] >; def _RESTORE : InstSI < (outs sgpr_class:$dst), - (ins VReg_32:$src, i32imm:$frame_idx), + (ins i32imm:$frame_idx), "", [] >; @@ -1685,6 +1872,37 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; +multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { + def _SAVE : InstSI < + (outs), + (ins vgpr_class:$src, i32imm:$frame_idx), + "", [] + >; + + def _RESTORE : InstSI < + (outs vgpr_class:$dst), + (ins i32imm:$frame_idx), + "", [] + >; +} + +defm SI_SPILL_V32 : SI_SPILL_VGPR <VReg_32>; +defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; +defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; +defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; +defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; +defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; + +let Defs = [SCC] in { + +def SI_CONSTDATA_PTR : InstSI < + (outs SReg_64:$dst), + (ins), + "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] +>; + +} // End Defs = [SCC] + } // end IsCodeGenOnly, isPseudo } // end SubtargetPredicate = SI @@ -1693,7 +1911,9 @@ let Predicates = [isSI] in { def : Pat< (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), - (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0)) + (V_CNDMASK_B32_e64 $src2, $src1, + (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0, + DSTCLAMP.NONE, DSTOMOD.NONE)) >; def : Pat < @@ -1766,27 +1986,26 @@ def : Pat < // SOP1 Patterns //===----------------------------------------------------------------------===// -let Predicates = [isSI, isCFDepth0] in { - def : Pat < (i64 (ctpop i64:$src)), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_BCNT1_I32_B64 $src), sub0), - (S_MOV_B32 0), sub1) + (i64 (REG_SEQUENCE SReg_64, + (S_BCNT1_I32_B64 $src), sub0, + (S_MOV_B32 0), sub1)) >; -} // Predicates = [isSI, isCFDepth0] - -let Predicates = [isSI] in { //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// +// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector +// case, the sgpr-copies pass will fix this to use the vector version. 
def : Pat < - (i1 (xor i1:$src0, i1:$src1)), - (S_XOR_B64 $src0, $src1) + (i32 (addc i32:$src0, i32:$src1)), + (S_ADD_U32 $src0, $src1) >; +let Predicates = [isSI] in { + //===----------------------------------------------------------------------===// // SOPP Patterns //===----------------------------------------------------------------------===// @@ -1800,67 +2019,106 @@ def : Pat < // VOP1 Patterns //===----------------------------------------------------------------------===// -def : RcpPat<V_RCP_F32_e32, f32>; +let Predicates = [UnsafeFPMath] in { def : RcpPat<V_RCP_F64_e32, f64>; -defm : RsqPat<V_RSQ_F32_e32, f32>; defm : RsqPat<V_RSQ_F64_e32, f64>; +defm : RsqPat<V_RSQ_F32_e32, f32>; +} //===----------------------------------------------------------------------===// // VOP2 Patterns //===----------------------------------------------------------------------===// -class BinOp64Pat <SDNode node, Instruction inst> : Pat < - (node i64:$src0, i64:$src1), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (inst (EXTRACT_SUBREG i64:$src0, sub0), - (EXTRACT_SUBREG i64:$src1, sub0)), sub0), - (inst (EXTRACT_SUBREG i64:$src0, sub1), - (EXTRACT_SUBREG i64:$src1, sub1)), sub1) ->; - -def : BinOp64Pat <or, V_OR_B32_e32>; -def : BinOp64Pat <xor, V_XOR_B32_e32>; - -class SextInReg <ValueType vt, int ShiftAmt> : Pat < - (sext_inreg i32:$src0, vt), - (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0)) ->; - -def : SextInReg <i8, 24>; -def : SextInReg <i16, 16>; - def : Pat < (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), - (V_BCNT_U32_B32_e32 $popcnt, $val) ->; - -def : Pat < - (i32 (ctpop i32:$popcnt)), - (V_BCNT_U32_B32_e64 $popcnt, 0, 0, 0) ->; - -def : Pat < - (i64 (ctpop i64:$src)), - (INSERT_SUBREG - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (V_BCNT_U32_B32_e32 (EXTRACT_SUBREG $src, sub1), - (V_BCNT_U32_B32_e64 (EXTRACT_SUBREG $src, sub0), 0, 0, 0)), - sub0), - (V_MOV_B32_e32 0), sub1) + (V_BCNT_U32_B32_e64 $popcnt, $val) >; /********** ======================= **********/ /********** Image sampling patterns **********/ /********** ======================= **********/ +// Image + sampler class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, i32:$dmask, i32:$unorm, + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), $addr, $rsrc, $sampler) >; +multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>; + def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>; +} + +// Image only +class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc) +>; + +multiclass ImagePatterns<SDPatternOperator name, string opcode> { + def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; + def : 
ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; + def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; +} + +// Basic sample +defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">; +defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">; +defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">; +defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">; +defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">; +defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">; +defm : SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">; +defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">; +defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">; +defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">; + +// Sample with comparison +defm : SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">; +defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">; +defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">; +defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">; + +// Sample with offsets +defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">; +defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">; +defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">; +defm : SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">; +defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">; +defm : SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">; +defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">; + +// Sample with comparison and offsets +defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">; +defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">; + +// Gather opcodes // Only the variants which make sense are defined. 
def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>; def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>; @@ -1905,6 +2163,10 @@ def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>; def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>; def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>; +def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>; +defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">; +defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">; + /* SIsample for simple 1D texture lookup */ def : Pat < (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), @@ -2143,62 +2405,63 @@ def : BitConvert <v16f32, v16i32, VReg_512>; /********** Src & Dst modifiers **********/ /********** =================== **********/ -def FCLAMP_SI : AMDGPUShaderInst < - (outs VReg_32:$dst), - (ins VSrc_32:$src0), - "FCLAMP_SI $dst, $src0", - [] -> { - let usesCustomInserter = 1; -} - def : Pat < - (AMDGPUclamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)), - (FCLAMP_SI f32:$src) + (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), + (f32 FP_ZERO), (f32 FP_ONE)), + (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) >; /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ -// Manipulate the sign bit directly, as e.g. using the source negation modifier -// in V_ADD_F32_e64 $src, 0, [...] does not result in -0.0 for $src == +0.0, -// breaking the piglit *s-floatBitsToInt-neg* tests - -// TODO: Look into not implementing isFNegFree/isFAbsFree for SI, and possibly -// removing these patterns +// Prevent expanding both fneg and fabs. +// FIXME: Should use S_OR_B32 def : Pat < (fneg (fabs f32:$src)), (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ >; -def FABS_SI : AMDGPUShaderInst < - (outs VReg_32:$dst), - (ins VSrc_32:$src0), - "FABS_SI $dst, $src0", - [] -> { - let usesCustomInserter = 1; -} +// FIXME: Should use S_OR_B32 +def : Pat < + (fneg (fabs f64:$src)), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), // Set sign bit. + sub1) +>; def : Pat < (fabs f32:$src), - (FABS_SI f32:$src) + (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) >; -def FNEG_SI : AMDGPUShaderInst < - (outs VReg_32:$dst), - (ins VSrc_32:$src0), - "FNEG_SI $dst, $src0", - [] -> { - let usesCustomInserter = 1; -} - def : Pat < (fneg f32:$src), - (FNEG_SI f32:$src) + (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) +>; + +def : Pat < + (fabs f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. 
+ sub1) +>; + +def : Pat < + (fneg f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), + sub1) >; /********** ================== **********/ @@ -2260,44 +2523,31 @@ def : Pat < >; def : Pat< - (fdiv f32:$src0, f32:$src1), - (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1)) ->; - -def : Pat< (fdiv f64:$src0, f64:$src1), - (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0)) ->; - -def : Pat < - (fcos f32:$src0), - (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) ->; - -def : Pat < - (fsin f32:$src0), - (V_SIN_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) + (V_MUL_F64 0 /* src0_modifiers */, $src0, + 0 /* src1_modifiers */, (V_RCP_F64_e32 $src1), + 0 /* clamp */, 0 /* omod */) >; def : Pat < (int_AMDGPU_cube v4f32:$src), - (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), - (V_CUBETC_F32 (EXTRACT_SUBREG $src, sub0), - (EXTRACT_SUBREG $src, sub1), - (EXTRACT_SUBREG $src, sub2)), - sub0), - (V_CUBESC_F32 (EXTRACT_SUBREG $src, sub0), - (EXTRACT_SUBREG $src, sub1), - (EXTRACT_SUBREG $src, sub2)), - sub1), - (V_CUBEMA_F32 (EXTRACT_SUBREG $src, sub0), - (EXTRACT_SUBREG $src, sub1), - (EXTRACT_SUBREG $src, sub2)), - sub2), - (V_CUBEID_F32 (EXTRACT_SUBREG $src, sub0), - (EXTRACT_SUBREG $src, sub1), - (EXTRACT_SUBREG $src, sub2)), - sub3) + (REG_SEQUENCE VReg_128, + (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub0, + (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub1, + (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub2, + (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub3) >; def : Pat < @@ -2316,7 +2566,7 @@ def : Ext32Pat <anyext>; // Offset in an 32Bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0) + (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0) >; // The multiplication scales from [0,1] to the unsigned integer range @@ -2330,7 +2580,7 @@ def : Pat < def : Pat < (int_SI_tid), (V_MBCNT_HI_U32_B32_e32 0xffffffff, - (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0)) + (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) >; //===----------------------------------------------------------------------===// @@ -2341,84 +2591,73 @@ def : IMad24Pat<V_MAD_I32_I24>; def : UMad24Pat<V_MAD_U32_U24>; def : Pat < - (fadd f64:$src0, f64:$src1), - (V_ADD_F64 $src0, $src1, (i64 0)) ->; - -def : Pat < - (fmul f64:$src0, f64:$src1), - (V_MUL_F64 $src0, $src1, (i64 0)) ->; - -def : Pat < - (mul i32:$src0, i32:$src1), - (V_MUL_LO_I32 $src0, $src1, (i32 0)) ->; - -def : Pat < (mulhu i32:$src0, i32:$src1), - (V_MUL_HI_U32 $src0, $src1, (i32 0)) + (V_MUL_HI_U32 $src0, $src1) >; def : Pat < (mulhs i32:$src0, i32:$src1), - (V_MUL_HI_I32 $src0, $src1, (i32 0)) + (V_MUL_HI_I32 $src0, $src1) >; -defm : BFIPatterns <V_BFI_B32, S_MOV_B32>; +def : 
Vop3ModPat<V_MAD_F32, VOP_F32_F32_F32_F32, AMDGPUmad>; + + +defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; /********** ======================= **********/ /********** Load/Store Patterns **********/ /********** ======================= **********/ -multiclass DSReadPat <DS inst, ValueType vt, PatFrag frag> { - def : Pat < - (vt (frag (add i32:$ptr, (i32 IMM16bit:$offset)))), - (inst (i1 0), $ptr, (as_i16imm $offset)) - >; +class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat < + (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), + (inst (i1 0), $ptr, (as_i16imm $offset)) +>; - def : Pat < - (frag i32:$src0), - (vt (inst 0, $src0, 0)) - >; -} +def : DSReadPat <DS_READ_I8, i32, sextloadi8_local>; +def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>; +def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>; +def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>; +def : DSReadPat <DS_READ_B32, i32, local_load>; -defm : DSReadPat <DS_READ_I8, i32, sextloadi8_local>; -defm : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>; -defm : DSReadPat <DS_READ_I16, i32, sextloadi16_local>; -defm : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>; -defm : DSReadPat <DS_READ_B32, i32, local_load>; -defm : DSReadPat <DS_READ_B64, v2i32, local_load>; +let AddedComplexity = 100 in { -multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> { - def : Pat < - (frag vt:$value, (add i32:$ptr, (i32 IMM16bit:$offset))), - (inst (i1 0), $ptr, $value, (as_i16imm $offset)) - >; +def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>; - def : Pat < - (frag vt:$val, i32:$ptr), - (inst 0, $ptr, $val, 0) - >; -} +} // End AddedComplexity = 100 -defm : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>; -defm : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>; -defm : DSWritePat <DS_WRITE_B32, i32, local_store>; -defm : DSWritePat <DS_WRITE_B64, v2i32, local_store>; +def : Pat < + (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1))), + (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1) +>; -multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> { - def : Pat < - (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$value), - (inst (i1 0), $ptr, $value, (as_i16imm $offset)) - >; +class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat < + (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), + (inst (i1 0), $ptr, $value, (as_i16imm $offset)) +>; - def : Pat < - (frag i32:$ptr, vt:$val), - (inst 0, $ptr, $val, 0) - >; -} +def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>; +def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>; +def : DSWritePat <DS_WRITE_B32, i32, local_store>; + +let AddedComplexity = 100 in { + +def : DSWritePat <DS_WRITE_B64, v2i32, local_store_aligned8bytes>; +} // End AddedComplexity = 100 + +def : Pat < + (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1)), + (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0), + (EXTRACT_SUBREG $value, sub1), $offset0, $offset1) +>; + +class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst (i1 0), $ptr, $value, (as_i16imm $offset)) +>; // Special case of DSAtomicRetPat for add / sub 1 -> inc / dec // @@ -2430,69 +2669,56 @@ multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> { // We also load this -1 with s_mov_b32 / s_mov_b64 even though this // needs to be a VGPR. 
The SGPR copy pass will fix this, and it's // easier since there is no v_mov_b64. -multiclass DSAtomicIncRetPat<DS inst, ValueType vt, - Instruction LoadImm, PatFrag frag> { - def : Pat < - (frag (add i32:$ptr, (i32 IMM16bit:$offset)), (vt 1)), - (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset)) - >; - - def : Pat < - (frag i32:$ptr, (vt 1)), - (inst 0, $ptr, (LoadImm (vt -1)), 0) - >; -} +class DSAtomicIncRetPat<DS inst, ValueType vt, + Instruction LoadImm, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), + (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset)) +>; -multiclass DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> { - def : Pat < - (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$cmp, vt:$swap), - (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset)) - >; - def : Pat < - (frag i32:$ptr, vt:$cmp, vt:$swap), - (inst 0, $ptr, $cmp, $swap, 0) - >; -} +class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), + (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset)) +>; // 32-bit atomics. -defm : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, - S_MOV_B32, atomic_load_add_local>; -defm : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, - S_MOV_B32, atomic_load_sub_local>; - -defm : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>; -defm : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>; -defm : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>; -defm : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>; -defm : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>; -defm : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>; -defm : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>; -defm : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>; -defm : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>; -defm : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>; - -defm : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>; +def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, + S_MOV_B32, atomic_load_add_local>; +def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, + S_MOV_B32, atomic_load_sub_local>; + +def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>; +def : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>; + +def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>; // 64-bit atomics. 
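The inc/dec special case described in the comment above relies on the wrapping behaviour of ds_inc / ds_dec: with the bound operand loaded as -1, the wrap point coincides with ordinary unsigned overflow, so the instruction acts exactly like an atomic add of 1. A small C++ model of that reasoning (my understanding of the hardware semantics, not code from this patch); the 64-bit atomic patterns continue below:

    #include <cassert>
    #include <cstdint>

    // Assumed ds_inc_rtn_u32 behaviour (wrap-at-bound increment). With the
    // bound at UINT32_MAX the wrap only happens where an add of 1 would
    // overflow anyway, so this degenerates to a plain atomic "add 1".
    static uint32_t dsIncModel(uint32_t &Mem, uint32_t Bound) {
      uint32_t Old = Mem;
      Mem = (Old >= Bound) ? 0 : Old + 1;
      return Old;
    }

    int main() {
      uint32_t LDS = 41;
      uint32_t Old = dsIncModel(LDS, UINT32_MAX); // atomic_load_add_local(ptr, 1)
      assert(Old == 41 && LDS == 42);
      return 0;
    }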
-defm : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, - S_MOV_B64, atomic_load_add_local>; -defm : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, - S_MOV_B64, atomic_load_sub_local>; +def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, + S_MOV_B64, atomic_load_add_local>; +def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, + S_MOV_B64, atomic_load_sub_local>; -defm : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>; -defm : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>; -defm : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>; -defm : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>; -defm : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>; -defm : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>; -defm : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>; -defm : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>; -defm : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>; -defm : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>; +def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>; +def : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>; -defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>; +def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>; //===----------------------------------------------------------------------===// @@ -2502,43 +2728,50 @@ defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>; multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, PatFrag constant_ld> { def : Pat < - (vt (constant_ld (add i64:$ptr, i64:$offset))), - (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0) + (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))), + (Instr_ADDR64 $srsrc, $vaddr, $offset) >; } -defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, - sextloadi8_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, - az_extloadi8_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, - sextloadi16_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, - az_extloadi16_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, - constant_load>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, - constant_load>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, - constant_load>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>; + +class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < + (vt (ld 
(MUBUFScratch v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>; // BUFFER_LOAD_DWORD*, addr64=0 multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen, MUBUF bothen> { def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, imm:$offset, 0, 0, imm:$glc, imm:$slc, imm:$tfe)), - (offset $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc), + (offset $rsrc, (as_i16imm $offset), $soffset, (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; def : Pat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm, 1, 0, imm:$glc, imm:$slc, + imm:$offset, 1, 0, imm:$glc, imm:$slc, imm:$tfe)), - (offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc), + (offen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; @@ -2566,6 +2799,32 @@ defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_ defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>; +class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, + u16imm:$offset)), + (Instr $value, $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; + +/* +class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), + (Instr $value, $srsrc, $vaddr, $offset) +>; + +def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>; +def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>; + +*/ + //===----------------------------------------------------------------------===// // MTBUF Patterns //===----------------------------------------------------------------------===// @@ -2590,28 +2849,39 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; let SubtargetPredicate = isCI in { // Sea island new arithmetic instructinos -let neverHasSideEffects = 1 in { -defm V_TRUNC_F64 : VOP1_64 <0x00000017, "V_TRUNC_F64", - [(set f64:$dst, (ftrunc f64:$src0))] +defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, 
"v_trunc_f64", + VOP_F64_F64, ftrunc >; -defm V_CEIL_F64 : VOP1_64 <0x00000018, "V_CEIL_F64", - [(set f64:$dst, (fceil f64:$src0))] +defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64", + VOP_F64_F64, fceil >; -defm V_FLOOR_F64 : VOP1_64 <0x0000001A, "V_FLOOR_F64", - [(set f64:$dst, (ffloor f64:$src0))] +defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64", + VOP_F64_F64, ffloor >; -defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64", - [(set f64:$dst, (frint f64:$src0))] +defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64", + VOP_F64_F64, frint >; -defm V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>; -defm V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>; -defm V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>; -def V_MAD_U64_U32 : VOP3_64 <0x00000176, "V_MAD_U64_U32", []>; +defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", + VOP_I32_I32_I32 +>; + +let isCommutable = 1 in { +defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", + VOP_I64_I32_I32_I64 +>; // XXX - Does this set VCC? -def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>; -} // End neverHasSideEffects = 1 +defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", + VOP_I64_I32_I32_I64 +>; +} // End isCommutable = 1 // Remaining instructions: // FLAT_* @@ -2636,6 +2906,37 @@ def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>; } // End iSCI +//===----------------------------------------------------------------------===// +// Flat Patterns +//===----------------------------------------------------------------------===// + +class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt, + PatFrag flat_ld> : + Pat <(vt (flat_ld i64:$ptr)), + (Instr_ADDR64 $ptr) +>; + +def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>; + +class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> : + Pat <(st vt:$value, i64:$ptr), + (Instr $value, $ptr) + >; + +def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>; +def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>; +def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>; +def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>; +def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>; +def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>; /********** ====================== **********/ /********** Indirect adressing **********/ @@ -2685,44 +2986,37 @@ defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>; def : Pat<(i32 (sext_inreg i32:$src, i1)), (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 -// TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it -// might not be worth the effort, and will need to expand to shifts when -// fixing SGPR copies. 
- // Handle sext_inreg in i64 def : Pat < (i64 (sext_inreg i64:$src, i1)), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_BFE_I32 (EXTRACT_SUBREG i64:$src, sub0), 65536), sub0), // 0 | 1 << 16 - (S_MOV_B32 -1), sub1) + (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i8)), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_SEXT_I32_I8 (EXTRACT_SUBREG i64:$src, sub0)), sub0), - (S_MOV_B32 -1), sub1) + (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i16)), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_SEXT_I32_I16 (EXTRACT_SUBREG i64:$src, sub0)), sub0), - (S_MOV_B32 -1), sub1) + (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16 +>; + +def : Pat < + (i64 (sext_inreg i64:$src, i32)), + (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16 >; class ZExt_i64_i32_Pat <SDNode ext> : Pat < (i64 (ext i32:$src)), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0), - (S_MOV_B32 0), sub1) + (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1) >; class ZExt_i64_i1_Pat <SDNode ext> : Pat < (i64 (ext i1:$src)), - (INSERT_SUBREG - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0), - (S_MOV_B32 0), sub1) + (REG_SEQUENCE VReg_64, + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, + (S_MOV_B32 0), sub1) >; @@ -2733,17 +3027,14 @@ def : ZExt_i64_i1_Pat<anyext>; def : Pat < (i64 (sext i32:$src)), - (INSERT_SUBREG - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0), - (S_ASHR_I32 $src, 31), sub1) + (REG_SEQUENCE SReg_64, $src, sub0, + (S_ASHR_I32 $src, 31), sub1) >; def : Pat < (i64 (sext i1:$src)), - (INSERT_SUBREG - (INSERT_SUBREG - (i64 (IMPLICIT_DEF)), - (V_CNDMASK_B32_e64 0, -1, $src), sub0), + (REG_SEQUENCE VReg_64, + (V_CNDMASK_B32_e64 0, -1, $src), sub0, (V_CNDMASK_B32_e64 0, -1, $src), sub1) >; @@ -2778,20 +3069,20 @@ def : Pat < def : Pat < (i1 (trunc i32:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e32 (i32 1), $a), 1) + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) >; -// V_ADD_I32_e32/S_ADD_I32 produces carry in VCC/SCC. For the vector -// case, the sgpr-copies pass will fix this to use the vector version. def : Pat < - (i32 (addc i32:$src0, i32:$src1)), - (S_ADD_I32 $src0, $src1) + (i32 (bswap i32:$a)), + (V_BFI_B32 (S_MOV_B32 0x00ff00ff), + (V_ALIGNBIT_B32 $a, $a, 24), + (V_ALIGNBIT_B32 $a, $a, 8)) >; //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// -def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>; +def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; } // End isSI predicate diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index df690a4..027a0a2 100644 --- a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -54,14 +54,12 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - // Fully-flexible SAMPLE instruction. 
class SampleRaw : Intrinsic < [llvm_v4f32_ty], // vdata(VGPR) [llvm_anyint_ty, // vaddr(VGPR) - llvm_v32i8_ty, // rsrc(SGPR) - llvm_v16i8_ty, // sampler(SGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_v4i32_ty, // sampler(SGPR) llvm_i32_ty, // dmask(imm) llvm_i32_ty, // unorm(imm) llvm_i32_ty, // r128(imm) @@ -72,10 +70,68 @@ let TargetPrefix = "SI", isTarget = 1 in { llvm_i32_ty], // lwe(imm) [IntrNoMem]>; - def int_SI_sample : Sample; - def int_SI_sampleb : Sample; - def int_SI_sampled : Sample; - def int_SI_samplel : Sample; + // Image instruction without a sampler. + class Image : Intrinsic < + [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyint_ty, // vaddr(VGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i32_ty, // unorm(imm) + llvm_i32_ty, // r128(imm) + llvm_i32_ty, // da(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty, // tfe(imm) + llvm_i32_ty], // lwe(imm) + [IntrNoMem]>; + + // Basic sample + def int_SI_image_sample : SampleRaw; + def int_SI_image_sample_cl : SampleRaw; + def int_SI_image_sample_d : SampleRaw; + def int_SI_image_sample_d_cl : SampleRaw; + def int_SI_image_sample_l : SampleRaw; + def int_SI_image_sample_b : SampleRaw; + def int_SI_image_sample_b_cl : SampleRaw; + def int_SI_image_sample_lz : SampleRaw; + def int_SI_image_sample_cd : SampleRaw; + def int_SI_image_sample_cd_cl : SampleRaw; + + // Sample with comparison + def int_SI_image_sample_c : SampleRaw; + def int_SI_image_sample_c_cl : SampleRaw; + def int_SI_image_sample_c_d : SampleRaw; + def int_SI_image_sample_c_d_cl : SampleRaw; + def int_SI_image_sample_c_l : SampleRaw; + def int_SI_image_sample_c_b : SampleRaw; + def int_SI_image_sample_c_b_cl : SampleRaw; + def int_SI_image_sample_c_lz : SampleRaw; + def int_SI_image_sample_c_cd : SampleRaw; + def int_SI_image_sample_c_cd_cl : SampleRaw; + + // Sample with offsets + def int_SI_image_sample_o : SampleRaw; + def int_SI_image_sample_cl_o : SampleRaw; + def int_SI_image_sample_d_o : SampleRaw; + def int_SI_image_sample_d_cl_o : SampleRaw; + def int_SI_image_sample_l_o : SampleRaw; + def int_SI_image_sample_b_o : SampleRaw; + def int_SI_image_sample_b_cl_o : SampleRaw; + def int_SI_image_sample_lz_o : SampleRaw; + def int_SI_image_sample_cd_o : SampleRaw; + def int_SI_image_sample_cd_cl_o : SampleRaw; + + // Sample with comparison and offsets + def int_SI_image_sample_c_o : SampleRaw; + def int_SI_image_sample_c_cl_o : SampleRaw; + def int_SI_image_sample_c_d_o : SampleRaw; + def int_SI_image_sample_c_d_cl_o : SampleRaw; + def int_SI_image_sample_c_l_o : SampleRaw; + def int_SI_image_sample_c_b_o : SampleRaw; + def int_SI_image_sample_c_b_cl_o : SampleRaw; + def int_SI_image_sample_c_lz_o : SampleRaw; + def int_SI_image_sample_c_cd_o : SampleRaw; + def int_SI_image_sample_c_cd_cl_o : SampleRaw; // Basic gather4 def int_SI_gather4 : SampleRaw; @@ -111,8 +167,19 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_getlod : SampleRaw; - def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; + // Image instrinsics. + def int_SI_image_load : Image; + def int_SI_image_load_mip : Image; + def int_SI_getresinfo : Image; + // Deprecated image and sample intrinsics. 
+ class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; + + def int_SI_sample : Sample; + def int_SI_sampleb : Sample; + def int_SI_sampled : Sample; + def int_SI_samplel : Sample; + def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; /* Interpolation Intrinsics */ diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp new file mode 100644 index 0000000..4140196 --- /dev/null +++ b/lib/Target/R600/SILoadStoreOptimizer.cpp @@ -0,0 +1,417 @@ +//===-- SILoadStoreOptimizer.cpp ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass tries to fuse DS instructions with close by immediate offsets. +// This will fuse operations such as +// ds_read_b32 v0, v2 offset:16 +// ds_read_b32 v1, v2 offset:32 +// ==> +// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 +// +// +// Future improvements: +// +// - This currently relies on the scheduler to place loads and stores next to +// each other, and then only merges adjacent pairs of instructions. It would +// be good to be more flexible with interleaved instructions, and possibly run +// before scheduling. It currently missing stores of constants because loading +// the constant into the data register is placed between the stores, although +// this is arguably a scheduling problem. +// +// - Live interval recomputing seems inefficient. This currently only matches +// one pair, and recomputes live intervals and moves on to the next pair. It +// would be better to compute a list of all merges that need to occur +// +// - With a list of instructions to process, we can also merge more. If a +// cluster of loads have offsets that are too large to fit in the 8-bit +// offsets, but are close enough to fit in the 8 bits, we can add to the base +// pointer and use the new reduced offsets. 
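The merge rule sketched in this header — convert byte offsets to element units, require both to fit the 8-bit offset0/offset1 fields, and otherwise fall back to the stride-64 (st64) forms — can be distilled into a few lines. A standalone sketch under those assumptions (the authoritative check is offsetsCanBeCombined further down):

    #include <cassert>

    // Distilled legality check mirroring offsetsCanBeCombined: two byte
    // offsets can feed one ds_read2/ds_write2 if, in element units, both fit
    // the 8-bit offset fields, either directly or via the st64 encodings.
    static bool offsetsCombinable(unsigned Byte0, unsigned Byte1,
                                  unsigned EltSize) {
      if (Byte0 == Byte1 || Byte0 % EltSize || Byte1 % EltSize)
        return false;
      unsigned Elt0 = Byte0 / EltSize, Elt1 = Byte1 / EltSize;
      if (Elt0 <= 0xff && Elt1 <= 0xff)
        return true;
      return Elt0 % 64 == 0 && Elt1 % 64 == 0 &&
             Elt0 / 64 <= 0xff && Elt1 / 64 <= 0xff;
    }

    int main() {
      // The example from the header: offsets 16 and 32 with 4-byte elements
      // become offset0:4 offset1:8, well inside the 8-bit range.
      assert(offsetsCombinable(16, 32, 4));
      assert(!offsetsCombinable(16, 17, 4));              // misaligned offset
      assert(!offsetsCombinable(0, 64 * 300 * 4, 4));     // beyond st64 reach
      return 0;
    }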
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-load-store-opt" + +namespace { + +class SILoadStoreOptimizer : public MachineFunctionPass { +private: + const TargetMachine *TM; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + LiveIntervals *LIS; + + + static bool offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned EltSize); + + MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize); + + void updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx); + + MachineBasicBlock::iterator mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + + MachineBasicBlock::iterator mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + +public: + static char ID; + + SILoadStoreOptimizer() : + MachineFunctionPass(ID), + TM(nullptr), + TII(nullptr), + TRI(nullptr), + MRI(nullptr), + LIS(nullptr) { + + } + + SILoadStoreOptimizer(const TargetMachine &TM_) : + MachineFunctionPass(ID), + TM(&TM_), + TII(static_cast<const SIInstrInfo*>(TM->getSubtargetImpl()->getInstrInfo())) { + initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + } + + bool optimizeBlock(MachineBasicBlock &MBB); + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Load / Store Optimizer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<SlotIndexes>(); + AU.addPreserved<LiveIntervals>(); + AU.addPreserved<LiveVariables>(); + AU.addRequired<LiveIntervals>(); + + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) + +char SILoadStoreOptimizer::ID = 0; + +char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; + +FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) { + return new SILoadStoreOptimizer(TM); +} + +bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned Size) { + // XXX - Would the same offset be OK? Is there any reason this would happen or + // be useful? + if (Offset0 == Offset1) + return false; + + // This won't be valid if the offset isn't aligned. + if ((Offset0 % Size != 0) || (Offset1 % Size != 0)) + return false; + + unsigned EltOffset0 = Offset0 / Size; + unsigned EltOffset1 = Offset1 / Size; + + // Check if the new offsets fit in the reduced 8-bit range. + if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) + return true; + + // If the offset in elements doesn't fit in 8-bits, we might be able to use + // the stride 64 versions. 
+ if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0) + return false; + + return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64); +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize){ + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator MBBI = I; + ++MBBI; + + if (MBBI->getOpcode() != I->getOpcode()) + return E; + + // Don't merge volatiles. + if (MBBI->hasOrderedMemoryRef()) + return E; + + int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); + const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); + const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); + + // Check same base pointer. Be careful of subregisters, which can occur with + // vectors of pointers. + if (AddrReg0.getReg() == AddrReg1.getReg() && + AddrReg0.getSubReg() == AddrReg1.getSubReg()) { + int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), + AMDGPU::OpName::offset); + unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; + unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + + // Check both offsets fit in the reduced range. + if (offsetsCanBeCombined(Offset0, Offset1, EltSize)) + return MBBI; + } + + return E; +} + +void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx) { + for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg), + E = MRI->reg_end(); I != E; ) { + MachineOperand &O = *I; + ++I; + O.substVirtReg(DstReg, SubIdx, *TRI); + } +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be careful, since the addresses could be subregisters themselves in weird + // cases, like vectors of pointers. + const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + + unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); + unsigned DestReg1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg(); + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Read2Desc = TII->get(Opc); + + const TargetRegisterClass *SuperRC + = (EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + unsigned DestReg = MRI->createVirtualRegister(SuperRC); + + DebugLoc DL = I->getDebugLoc(); + MachineInstrBuilder Read2 + = BuildMI(*MBB, I, DL, Read2Desc, DestReg) + .addImm(0) // gds + .addOperand(*AddrReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + LIS->InsertMachineInstrInMaps(Read2); + + unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; + updateRegDefsUses(DestReg0, DestReg, SubRegIdx0); + updateRegDefsUses(DestReg1, DestReg, SubRegIdx1); + + LIS->RemoveMachineInstrFromMaps(I); + LIS->RemoveMachineInstrFromMaps(Paired); + I->eraseFromParent(); + Paired->eraseFromParent(); + + LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); + LIS->shrinkToUses(&AddrRegLI); + + LIS->getInterval(DestReg); // Create new LI + + DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); + return Read2.getInstr(); +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be sure to use .addOperand(), and not .addReg() with these. We want to be + // sure we preserve the subregister index and any register flags set on them. + const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); + const MachineOperand *Data1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); + + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Write2Desc = TII->get(Opc); + DebugLoc DL = I->getDebugLoc(); + + MachineInstrBuilder Write2 + = BuildMI(*MBB, I, DL, Write2Desc) + .addImm(0) // gds + .addOperand(*Addr) // addr + .addOperand(*Data0) // data0 + .addOperand(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + // XXX - How do we express subregisters here? + unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; + + LIS->RemoveMachineInstrFromMaps(I); + LIS->RemoveMachineInstrFromMaps(Paired); + I->eraseFromParent(); + Paired->eraseFromParent(); + + LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); + + DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); + return Write2.getInstr(); +} + +// Scan through looking for adjacent LDS operations with constant offsets from +// the same base register. 
We rely on the scheduler to do the hard work of +// clustering nearby loads, and assume these are all adjacent. +bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { + bool Modified = false; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { + MachineInstr &MI = *I; + + // Don't combine if volatile. + if (MI.hasOrderedMemoryRef()) { + ++I; + continue; + } + + unsigned Opc = MI.getOpcode(); + if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { + unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + if (Match != E) { + Modified = true; + I = mergeRead2Pair(I, Match, Size); + } else { + ++I; + } + + continue; + } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { + unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + if (Match != E) { + Modified = true; + I = mergeWrite2Pair(I, Match, Size); + } else { + ++I; + } + + continue; + } + + ++I; + } + + return Modified; +} + +bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { + const TargetSubtargetInfo *STM = MF.getTarget().getSubtargetImpl(); + TRI = static_cast<const SIRegisterInfo*>(STM->getRegisterInfo()); + TII = static_cast<const SIInstrInfo*>(STM->getInstrInfo()); + MRI = &MF.getRegInfo(); + + LIS = &getAnalysis<LiveIntervals>(); + + DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); + + assert(!MRI->isSSA()); + + bool Modified = false; + + for (MachineBasicBlock &MBB : MF) + Modified |= optimizeBlock(MBB); + + return Modified; +} diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index 9f5ff29..9702565 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -49,8 +49,10 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -147,7 +149,7 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); - if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType != + if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() != ShaderType::PIXEL || !shouldSkip(&MBB, &MBB.getParent()->back())) return; @@ -298,11 +300,13 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Op = MI.getOperand(0); - // Kill is only allowed in pixel / geometry shaders - assert(MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType == - ShaderType::PIXEL || - MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType == - ShaderType::GEOMETRY); +#ifndef NDEBUG + const SIMachineFunctionInfo *MFI + = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); + // Kill is only allowed in pixel / geometry shaders. 
+ assert(MFI->getShaderType() == ShaderType::PIXEL || + MFI->getShaderType() == ShaderType::GEOMETRY); +#endif // Clear this thread from the exec mask if the operand is negative if ((Op.isImm() || Op.isFPImm())) { @@ -440,13 +444,15 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { } bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo()); - TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo()); + TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); bool HaveKill = false; bool NeedM0 = false; bool NeedWQM = false; + bool NeedFlat = false; unsigned Depth = 0; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -463,6 +469,12 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { NeedWQM = true; } + // Flat uses m0 in case it needs to access LDS. + if (TII->isFLAT(MI.getOpcode())) { + NeedM0 = true; + NeedFlat = true; + } + switch (MI.getOpcode()) { default: break; case AMDGPU::SI_IF: @@ -528,7 +540,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::V_INTERP_MOV_F32: NeedWQM = true; break; - } } } @@ -540,11 +551,50 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { InitM0ForLDS(MBB.getFirstNonPHI()); } - if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) { + if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { MachineBasicBlock &MBB = MF.front(); BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC).addReg(AMDGPU::EXEC); } + // FIXME: This seems inappropriate to do here. + if (NeedFlat && MFI->IsKernel) { + // Insert the prologue initializing the SGPRs pointing to the scratch space + // for flat accesses. + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + // TODO: What to use with function calls? + + // FIXME: This is reporting stack size that is used in a scratch buffer + // rather than registers as well. + uint64_t StackSizeBytes = FrameInfo->getStackSize(); + + int IndirectBegin + = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF); + // Convert register index to 256-byte unit. + uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); + + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && + "Stack limits should be smaller than 16-bits"); + + // Initialize the flat scratch register pair. + // TODO: Can we use one s_mov_b64 here? + + // Offset is in units of 256-bytes. 
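The prologue built just below programs the flat scratch aperture with two s_movk_i32s: FLAT_SCR_LO takes the offset of the scratch window in 256-byte units (computed from the first indirect register index, at 4 bytes per slot), and FLAT_SCR_HI takes the per-thread scratch size in bytes. A small sketch of the unit conversion, as I read the code (illustrative, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // My reading of the conversion below: indirect "register" slots are
    // 4 bytes each, while FLAT_SCR_LO is programmed in 256-byte units, so the
    // flat scratch window starts just past the indirectly addressed slots.
    static uint64_t flatScratchOffsetUnits(int IndirectBegin) {
      return IndirectBegin < 0 ? 0 : uint64_t(4 * IndirectBegin) / 256;
    }

    int main() {
      assert(flatScratchOffsetUnits(-1) == 0);  // no indirect addressing used
      assert(flatScratchOffsetUnits(64) == 1);  // 64 slots * 4 bytes = 256 bytes
      assert(flatScratchOffsetUnits(256) == 4); // 1 KiB of indirect slots
      return 0;
    }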
+ MachineBasicBlock &MBB = MF.front(); + DebugLoc NoDL; + MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); + const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); + + assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); + + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) + .addImm(StackOffset); + + // Documentation says size is "per-thread scratch size in bytes" + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) + .addImm(StackSizeBytes); + } + return true; } diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp index 738c90b..65b892c 100644 --- a/lib/Target/R600/SILowerI1Copies.cpp +++ b/lib/Target/R600/SILowerI1Copies.cpp @@ -15,6 +15,7 @@ #define DEBUG_TYPE "si-i1-copies" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineDominators.h" @@ -39,14 +40,14 @@ public: initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnMachineFunction(MachineFunction &MF) override; + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const override { - return "SI Lower il Copies"; + const char *getPassName() const override { + return "SI Lower i1 Copies"; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -55,10 +56,10 @@ public: } // End anonymous namespace. INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, - "SI Lower il Copies", false, false) + "SI Lower i1 Copies", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, - "SI Lower il Copies", false, false) + "SI Lower i1 Copies", false, false) char SILowerI1Copies::ID = 0; @@ -70,9 +71,9 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - MF.getTarget().getInstrInfo()); - const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); std::vector<unsigned> I1Defs; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -102,6 +103,20 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { continue; } + if (MI.getOpcode() == AMDGPU::V_XOR_I1) { + I1Defs.push_back(MI.getOperand(0).getReg()); + MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e32)); + continue; + } + + if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) { + unsigned Reg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + if (RC == &AMDGPU::VReg_1RegClass) + MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass); + continue; + } + if (MI.getOpcode() != AMDGPU::COPY || !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) || !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg())) @@ -120,21 +135,13 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { .addOperand(MI.getOperand(0)) .addImm(0) .addImm(-1) - .addOperand(MI.getOperand(1)) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0); + .addOperand(MI.getOperand(1)); MI.eraseFromParent(); } else if 
(TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && SrcRC == &AMDGPU::VReg_1RegClass) { BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) .addOperand(MI.getOperand(0)) - .addImm(0) .addOperand(MI.getOperand(1)) - .addImm(0) - .addImm(0) - .addImm(0) .addImm(0); MI.eraseFromParent(); } diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp index e2df950..d58f31d 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -10,8 +10,10 @@ #include "SIMachineFunctionInfo.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -26,71 +28,49 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), + TIDReg(AMDGPU::NoRegister), PSInputAddr(0), - SpillTracker() { } + NumUserSGPRs(0), + LDSWaveSpillSize(0) { } -static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) { - unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); +SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( + MachineFunction *MF, + unsigned FrameIndex, + unsigned SubIdx) { + const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>( + MF->getTarget().getSubtarget<AMDGPUSubtarget>().getRegisterInfo()); + MachineRegisterInfo &MRI = MF->getRegInfo(); + int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); + Offset += SubIdx * 4; - // We need to add this register as live out for the function, in order to - // have the live range calculated directly. - // - // When register spilling begins, we have already calculated the live - // live intervals for all the registers. Since we are spilling SGPRs to - // VGPRs, we need to update the Lane VGPR's live interval every time we - // spill or restore a register. - // - // Unfortunately, there is no good way to update the live interval as - // the TargetInstrInfo callbacks for spilling and restoring don't give - // us access to the live interval information. - // - // We are lucky, though, because the InlineSpiller calls - // LiveRangeEdit::calculateRegClassAndHint() which iterates through - // all the new register that have been created when restoring a register - // and calls LiveIntervals::getInterval(), which creates and computes - // the live interval for the newly created register. However, once this - // live intervals is created, it doesn't change and since we usually reuse - // the Lane VGPR multiple times, this means any uses after the first aren't - // added to the live interval. - // - // To work around this, we add Lane VGPRs to the functions live out list, - // so that we can guarantee its live range will cover all of its uses. 
+ unsigned LaneVGPRIdx = Offset / (64 * 4); + unsigned Lane = (Offset / 4) % 64; - for (MachineBasicBlock &MBB : *MF) { - if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) { - MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true)); - return VGPR; - } - } + struct SpilledReg Spill; - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Could not find S_ENDPGM instruction."); + if (!LaneVGPRs.count(LaneVGPRIdx)) { + unsigned LaneVGPR = TRI->findUnusedVGPR(MRI); + LaneVGPRs[LaneVGPRIdx] = LaneVGPR; + MRI.setPhysRegUsed(LaneVGPR); - return VGPR; -} - -unsigned SIMachineFunctionInfo::RegSpillTracker::reserveLanes( - MachineRegisterInfo &MRI, MachineFunction *MF, unsigned NumRegs) { - unsigned StartLane = CurrentLane; - CurrentLane += NumRegs; - if (!LaneVGPR) { - LaneVGPR = createLaneVGPR(MRI, MF); - } else { - if (CurrentLane >= MAX_LANES) { - StartLane = CurrentLane = 0; - LaneVGPR = createLaneVGPR(MRI, MF); + // Add this register as live-in to all blocks to avoid machine verifer + // complaining about use of an undefined physical register. + for (MachineFunction::iterator BI = MF->begin(), BE = MF->end(); + BI != BE; ++BI) { + BI->addLiveIn(LaneVGPR); } } - return StartLane; -} -void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex, - unsigned Reg, - int Lane) { - SpilledRegisters[FrameIndex] = SpilledReg(Reg, Lane); + Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; + Spill.Lane = Lane; + return Spill; } -const SIMachineFunctionInfo::SpilledReg& -SIMachineFunctionInfo::RegSpillTracker::getSpilledReg(unsigned FrameIndex) { - return SpilledRegisters[FrameIndex]; +unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getTarget().getSubtarget<AMDGPUSubtarget>(); + // FIXME: We should get this information from kernel attributes if it + // is available. + return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize(); } diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h index 96e619b..6bb8f9d 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -12,10 +12,11 @@ //===----------------------------------------------------------------------===// -#ifndef SIMACHINEFUNCTIONINFO_H_ -#define SIMACHINEFUNCTIONINFO_H_ +#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H #include "AMDGPUMachineFunction.h" +#include "SIRegisterInfo.h" #include <map> namespace llvm { @@ -26,6 +27,9 @@ class MachineRegisterInfo; /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { void anchor() override; + + unsigned TIDReg; + public: struct SpilledReg { @@ -36,32 +40,23 @@ public: bool hasLane() { return Lane != -1;} }; - struct RegSpillTracker { - private: - unsigned CurrentLane; - std::map<unsigned, SpilledReg> SpilledRegisters; - public: - unsigned LaneVGPR; - RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { } - /// \p NumRegs The number of consecutive registers what need to be spilled. - /// This function will ensure that all registers are stored in - /// the same VGPR. - /// \returns The lane to be used for storing the first register. 
- unsigned reserveLanes(MachineRegisterInfo &MRI, MachineFunction *MF, - unsigned NumRegs = 1); - void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1); - const SpilledReg& getSpilledReg(unsigned FrameIndex); - bool programSpillsRegisters() { return !SpilledRegisters.empty(); } - }; - // SIMachineFunctionInfo definition SIMachineFunctionInfo(const MachineFunction &MF); + SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, + unsigned SubIdx); unsigned PSInputAddr; - struct RegSpillTracker SpillTracker; + unsigned NumUserSGPRs; + std::map<unsigned, unsigned> LaneVGPRs; + unsigned LDSWaveSpillSize; + bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; + unsigned getTIDReg() const { return TIDReg; }; + void setTIDReg(unsigned Reg) { TIDReg = Reg; } + + unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; } // End namespace llvm -#endif //_SIMACHINEFUNCTIONINFO_H_ +#endif diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index d0b677a..cffea12 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -16,6 +16,12 @@ #include "SIRegisterInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" using namespace llvm; @@ -26,9 +32,19 @@ SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st) BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::EXEC); + + // EXEC_LO and EXEC_HI could be allocated and used as regular register, + // but this seems likely to result in bugs, so I'm marking them as reserved. 
+ Reserved.set(AMDGPU::EXEC_LO); + Reserved.set(AMDGPU::EXEC_HI); + Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); - TII->reserveIndirectRegisters(Reserved, MF); + Reserved.set(AMDGPU::FLAT_SCR); + + // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs + Reserved.set(AMDGPU::VGPR255); + Reserved.set(AMDGPU::VGPR254); + return Reserved; } @@ -37,6 +53,213 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, return RC->getNumRegs(); } +bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { + return Fn.getFrameInfo()->hasStackObjects(); +} + +static unsigned getNumSubRegsForSpillOp(unsigned Op) { + + switch (Op) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V512_RESTORE: + return 16; + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V256_RESTORE: + return 8; + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V128_RESTORE: + return 4; + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V96_RESTORE: + return 3; + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V64_RESTORE: + return 2; + case AMDGPU::SI_SPILL_S32_SAVE: + case AMDGPU::SI_SPILL_S32_RESTORE: + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_V32_RESTORE: + return 1; + default: llvm_unreachable("Invalid spill opcode"); + } +} + +void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + MachineFunction *MF = MI->getParent()->getParent(); + MachineBasicBlock *MBB = MI->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); + DebugLoc DL = MI->getDebugLoc(); + + MachineOperand &FIOp = MI->getOperand(FIOperandNum); + int Index = MI->getOperand(FIOperandNum).getIndex(); + + switch (MI->getOpcode()) { + // SGPR register spill + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S32_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + if (Spill.VGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + } + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) + .addReg(SubReg) + .addImm(Spill.Lane); + + } + MI->eraseFromParent(); + break; + } + + // SGPR register restore + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_S32_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + 
&AMDGPU::SGPR_32RegClass, i); + bool isM0 = SubReg == AMDGPU::M0; + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + if (Spill.VGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + } + + if (isM0) { + SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); + } + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) + .addReg(Spill.VGPR) + .addImm(Spill.Lane); + if (isM0) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(SubReg); + } + } + TII->insertNOPs(MI, 3); + MI->eraseFromParent(); + break; + } + + // VGPR register spill + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V32_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned SrcReg = MI->getOperand(0).getReg(); + int64_t Offset = FrameInfo->getObjectOffset(Index); + unsigned Size = NumSubRegs * 4; + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + + for (unsigned i = 0, e = NumSubRegs; i != e; ++i) { + unsigned SubReg = NumSubRegs > 1 ? + getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) : + SrcReg; + Offset += (i * 4); + MFI->LDSWaveSpillSize = std::max((unsigned)Offset + 4, (unsigned)MFI->LDSWaveSpillSize); + + unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg, + Offset, Size); + + if (AddrReg == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling VGPRS"); + AddrReg = AMDGPU::VGPR0; + } + + // Store the value in LDS + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_WRITE_B32)) + .addImm(0) // gds + .addReg(AddrReg, RegState::Kill) // addr + .addReg(SubReg) // data0 + .addImm(0); // offset + } + + MI->eraseFromParent(); + break; + } + case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V256_RESTORE: + case AMDGPU::SI_SPILL_V512_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned DstReg = MI->getOperand(0).getReg(); + int64_t Offset = FrameInfo->getObjectOffset(Index); + unsigned Size = NumSubRegs * 4; + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + + // FIXME: We could use DS_READ_B64 here to optimize for larger registers. + for (unsigned i = 0, e = NumSubRegs; i != e; ++i) { + unsigned SubReg = NumSubRegs > 1 ? 
+ getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) : + DstReg; + + Offset += (i * 4); + unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg, + Offset, Size); + if (AddrReg == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling VGPRs"); + AddrReg = AMDGPU::VGPR0; + } + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_READ_B32), SubReg) + .addImm(0) // gds + .addReg(AddrReg, RegState::Kill) // addr + .addImm(0); //offset + } + MI->eraseFromParent(); + break; + } + + default: { + int64_t Offset = FrameInfo->getObjectOffset(Index); + FIOp.ChangeToImmediate(Offset); + if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj); + BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addImm(Offset); + FIOp.ChangeToRegister(TmpReg, false); + } + } + } +} + const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( MVT VT) const { switch(VT.SimpleTy) { @@ -52,13 +275,17 @@ unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - const TargetRegisterClass *BaseClasses[] = { + static const TargetRegisterClass *BaseClasses[] = { &AMDGPU::VReg_32RegClass, &AMDGPU::SReg_32RegClass, &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, + &AMDGPU::VReg_96RegClass, + &AMDGPU::VReg_128RegClass, &AMDGPU::SReg_128RegClass, - &AMDGPU::SReg_256RegClass + &AMDGPU::VReg_256RegClass, + &AMDGPU::SReg_256RegClass, + &AMDGPU::VReg_512RegClass }; for (const TargetRegisterClass *BaseClass : BaseClasses) { @@ -69,13 +296,6 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { return nullptr; } -bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const { - if (!RC) { - return false; - } - return !hasVGPRs(RC); -} - bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { return getCommonSubClass(&AMDGPU::VReg_32RegClass, RC) || getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || @@ -122,11 +342,53 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, unsigned Channel) const { + + switch (Reg) { + case AMDGPU::VCC: + switch(Channel) { + case 0: return AMDGPU::VCC_LO; + case 1: return AMDGPU::VCC_HI; + default: llvm_unreachable("Invalid SubIdx for VCC"); + } + + case AMDGPU::FLAT_SCR: + switch (Channel) { + case 0: + return AMDGPU::FLAT_SCR_LO; + case 1: + return AMDGPU::FLAT_SCR_HI; + default: + llvm_unreachable("Invalid SubIdx for FLAT_SCR"); + } + break; + + case AMDGPU::EXEC: + switch (Channel) { + case 0: + return AMDGPU::EXEC_LO; + case 1: + return AMDGPU::EXEC_HI; + default: + llvm_unreachable("Invalid SubIdx for EXEC"); + } + break; + } + + const TargetRegisterClass *RC = getPhysRegClass(Reg); + // 32-bit registers don't have sub-registers, so we can just return the + // Reg. We need to have this check here, because the calculation below + // using getHWRegIndex() will fail with special 32-bit registers like + // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0. 
+ if (RC->getSize() == 4) { + assert(Channel == 0); + return Reg; + } + unsigned Index = getHWRegIndex(Reg); return SubRC->getRegister(Index + Channel); } -bool SIRegisterInfo::regClassCanUseImmediate(int RCID) const { +bool SIRegisterInfo::regClassCanUseLiteralConstant(int RCID) const { switch (RCID) { default: return false; case AMDGPU::SSrc_32RegClassID: @@ -137,7 +399,68 @@ bool SIRegisterInfo::regClassCanUseImmediate(int RCID) const { } } -bool SIRegisterInfo::regClassCanUseImmediate( +bool SIRegisterInfo::regClassCanUseLiteralConstant( const TargetRegisterClass *RC) const { - return regClassCanUseImmediate(RC->getID()); + return regClassCanUseLiteralConstant(RC->getID()); +} + +bool SIRegisterInfo::regClassCanUseInlineConstant(int RCID) const { + if (regClassCanUseLiteralConstant(RCID)) + return true; + + switch (RCID) { + default: return false; + case AMDGPU::VCSrc_32RegClassID: + case AMDGPU::VCSrc_64RegClassID: + return true; + } +} + +bool SIRegisterInfo::regClassCanUseInlineConstant( + const TargetRegisterClass *RC) const { + return regClassCanUseInlineConstant(RC->getID()); } + + +unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const { + + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + switch (Value) { + case SIRegisterInfo::TGID_X: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); + case SIRegisterInfo::TGID_Y: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); + case SIRegisterInfo::TGID_Z: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); + case SIRegisterInfo::SCRATCH_WAVE_OFFSET: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); + case SIRegisterInfo::SCRATCH_PTR: + return AMDGPU::SGPR2_SGPR3; + case SIRegisterInfo::INPUT_PTR: + return AMDGPU::SGPR0_SGPR1; + case SIRegisterInfo::TIDIG_X: + return AMDGPU::VGPR0; + case SIRegisterInfo::TIDIG_Y: + return AMDGPU::VGPR1; + case SIRegisterInfo::TIDIG_Z: + return AMDGPU::VGPR2; + } + llvm_unreachable("unexpected preloaded value type"); +} + +/// \brief Returns a register that is not used at any point in the function. +/// If all registers are used, then this function will return +// AMDGPU::NoRegister. 
+unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const { + + const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass; + + for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); + I != E; ++I) { + if (!MRI.isPhysRegUsed(*I)) + return *I; + } + return AMDGPU::NoRegister; +} + diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index c9305fb..c7e54db 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// -#ifndef SIREGISTERINFO_H_ -#define SIREGISTERINFO_H_ +#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" @@ -29,6 +29,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const override; + /// \brief get the register class of the specified type to use in the /// CFGStructurizer const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; @@ -40,7 +46,20 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; /// \returns true if this class contains only SGPR registers - bool isSGPRClass(const TargetRegisterClass *RC) const; + bool isSGPRClass(const TargetRegisterClass *RC) const { + if (!RC) + return false; + + return !hasVGPRs(RC); + } + + /// \returns true if this class ID contains only SGPR registers + bool isSGPRClassID(unsigned RCID) const { + if (static_cast<int>(RCID) == -1) + return false; + + return isSGPRClass(getRegClass(RCID)); + } /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; @@ -62,14 +81,41 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { unsigned Channel) const; /// \returns True if operands defined with this register class can accept - /// inline immediates. - bool regClassCanUseImmediate(int RCID) const; + /// a literal constant (i.e. any 32-bit immediate). + bool regClassCanUseLiteralConstant(int RCID) const; + + /// \returns True if operands defined with this register class can accept + /// a literal constant (i.e. any 32-bit immediate). + bool regClassCanUseLiteralConstant(const TargetRegisterClass *RC) const; + + /// \returns True if operands defined with this register class can accept + /// an inline constant. i.e. An integer value in the range (-16, 64) or + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + bool regClassCanUseInlineConstant(int RCID) const; /// \returns True if operands defined with this register class can accept - /// inline immediates. - bool regClassCanUseImmediate(const TargetRegisterClass *RC) const; + /// a literal constant. i.e. A value in the range (-16, 64). + bool regClassCanUseInlineConstant(const TargetRegisterClass *RC) const; + + enum PreloadedValue { + TGID_X, + TGID_Y, + TGID_Z, + SCRATCH_WAVE_OFFSET, + SCRATCH_PTR, + INPUT_PTR, + TIDIG_X, + TIDIG_Y, + TIDIG_Z + }; + + /// \brief Returns the physical register that \p Value is stored in. 
+ unsigned getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const; + + unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) const; }; } // End namespace llvm -#endif // SIREGISTERINFO_H_ +#endif diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index 8974b63..45c2b41 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -27,10 +27,28 @@ def VCC : RegisterWithSubRegs<"VCC", [VCC_LO, VCC_HI]> { let HWEncoding = 106; } -def EXEC : SIReg<"EXEC", 126>; +def EXEC_LO : SIReg<"exec_lo", 126>; +def EXEC_HI : SIReg<"exec_hi", 127>; + +def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 126; +} + def SCC : SIReg<"SCC", 253>; def M0 : SIReg <"M0", 124>; +def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. +def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. + +// Pair to indicate location of scratch space for flat accesses. +def FLAT_SCR : RegisterWithSubRegs <"FLAT_SCR", [FLAT_SCR_LO, FLAT_SCR_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 104; +} + // SGPR registers foreach Index = 0-101 in { def SGPR#Index : SIReg <"SGPR"#Index, Index>; @@ -152,20 +170,24 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, //===----------------------------------------------------------------------===// // Special register classes for predicates and the M0 register -def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)>; +def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { + let CopyCost = -1; // Theoretically it is possible to read from SCC, + // but it should never be necessary. 
+} + def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>; // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, - (add SGPR_32, M0Reg, VCC_LO) + (add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) >; def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>; def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64, - (add SGPR_64Regs, VCCReg, EXECReg) + (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) >; def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; @@ -192,18 +214,30 @@ def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>; //===----------------------------------------------------------------------===// -// [SV]Src_(32|64) register classes, can have either an immediate or an register +// SSrc_* Operands with an SGPR or a 32-bit immediate //===----------------------------------------------------------------------===// def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>; def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>; +//===----------------------------------------------------------------------===// +// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// + def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>; def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; //===----------------------------------------------------------------------===// +// VCSrc_* Operands with an SGPR, VGPR or an inline constant +//===----------------------------------------------------------------------===// + +def VCSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>; + +def VCSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; + +//===----------------------------------------------------------------------===// // SGPR and VGPR register classes //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp new file mode 100644 index 0000000..45e83f5 --- /dev/null +++ b/lib/Target/R600/SIShrinkInstructions.cpp @@ -0,0 +1,271 @@ +//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// The pass tries to use the 32-bit encoding for instructions when possible. 
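+/// For example, a VOP3-encoded v_add_f32_e64 whose operands use no source or
+/// output modifiers and whose second source operand is a VGPR can normally be
+/// re-emitted as the half-size v_add_f32_e32.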
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-shrink-instructions"
+
+STATISTIC(NumInstructionsShrunk,
+          "Number of 64-bit instructions reduced to 32-bit.");
+STATISTIC(NumLiteralConstantsFolded,
+          "Number of literal constants folded into 32-bit instructions.");
+
+namespace llvm {
+  void initializeSIShrinkInstructionsPass(PassRegistry&);
+}
+
+using namespace llvm;
+
+namespace {
+
+class SIShrinkInstructions : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIShrinkInstructions() : MachineFunctionPass(ID) {
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI Shrink Instructions";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
+                      "SI Shrink Instructions", false, false)
+INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
+                    "SI Shrink Instructions", false, false)
+
+char SIShrinkInstructions::ID = 0;
+
+FunctionPass *llvm::createSIShrinkInstructionsPass() {
+  return new SIShrinkInstructions();
+}
+
+static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
+                   const MachineRegisterInfo &MRI) {
+  if (!MO->isReg())
+    return false;
+
+  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
+    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));
+
+  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
+}
+
+static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
+                      const SIRegisterInfo &TRI,
+                      const MachineRegisterInfo &MRI) {
+
+  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+  // Can't shrink instructions with three operands.
+  if (Src2)
+    return false;
+
+  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  const MachineOperand *Src1Mod =
+      TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+
+  if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
+    return false;
+
+  // We don't need to check src0, all input types are legal, so just make sure
+  // src0 isn't using any modifiers.
+  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
+    return false;
+
+  // Check output modifiers
+  if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+    return false;
+
+  if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+    return false;
+
+  return true;
+}
+
+/// \brief This function checks \p MI for operands defined by a move immediate
+/// instruction and then folds the literal constant into the instruction if it
+/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
+/// and will only fold literal constants if we are still in SSA.
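+/// For example, if src0 is a virtual register whose only definition is
+/// "V_MOV_B32_e32 0x1234", the use is rewritten to the literal 0x1234 and the
+/// move is erased once it has no remaining users.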
+static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
+                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
+
+  if (!MRI.isSSA())
+    return;
+
+  assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) ||
+         TII->isVOPC(MI.getOpcode()));
+
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+
+  // Only one literal constant is allowed per instruction, so if src0 is a
+  // literal constant then we can't do any folding.
+  if ((Src0->isImm() || Src0->isFPImm()) && TII->isLiteralConstant(*Src0))
+    return;
+
+
+  // Literal constants and SGPRs can only be used in Src0, so if Src0 is an
+  // SGPR, we cannot commute the instruction, so we can't fold any literal
+  // constants.
+  if (Src0->isReg() && !isVGPR(Src0, TRI, MRI))
+    return;
+
+  // Try to fold Src0
+  if (Src0->isReg()) {
+    unsigned Reg = Src0->getReg();
+    MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+    if (Def && Def->isMoveImmediate()) {
+      MachineOperand &MovSrc = Def->getOperand(1);
+      bool ConstantFolded = false;
+
+      if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
+        Src0->ChangeToImmediate(MovSrc.getImm());
+        ConstantFolded = true;
+      } else if (MovSrc.isFPImm()) {
+        const ConstantFP *CFP = MovSrc.getFPImm();
+        if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle) {
+          Src0->ChangeToFPImmediate(CFP);
+          ConstantFolded = true;
+        }
+      }
+      if (ConstantFolded) {
+        if (MRI.use_empty(Reg))
+          Def->eraseFromParent();
+        ++NumLiteralConstantsFolded;
+        return;
+      }
+    }
+  }
+
+  // We have failed to fold src0, so commute the instruction and try again.
+  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
+    foldImmediates(MI, TII, MRI, false);
+
+}
+
+bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  std::vector<unsigned> I1Defs;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
+      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
+        const MachineOperand &Src = MI.getOperand(1);
+
+        // TODO: Handle FPImm?
+        if (Src.isImm()) {
+          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src)) {
+            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
+            continue;
+          }
+        }
+      }
+
+      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
+        continue;
+
+      if (!canShrink(MI, TII, TRI, MRI)) {
+        // Try commuting the instruction and see if that enables us to shrink
+        // it.
+        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
+            !canShrink(MI, TII, TRI, MRI))
+          continue;
+      }
+
+      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
+
+      // Op32 could be -1 here if we started with an instruction that had a
+      // 32-bit encoding and then commuted it to an instruction that did not.
+      if (Op32 == -1)
+        continue;
+
+      if (TII->isVOPC(Op32)) {
+        unsigned DstReg = MI.getOperand(0).getReg();
+        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+          // VOPC instructions can only write to the VCC register. We can't
+          // force them to use VCC here, because the register allocator has
+          // trouble with sequences like this, which cause the allocator to run
+          // out of registers if vreg0 and vreg1 belong to the VCCReg register
+          // class:
+          // vreg0 = VOPC;
+          // vreg1 = VOPC;
+          // S_AND_B64 vreg0, vreg1
+          //
+          // So, instead of forcing the instruction to write to VCC, we provide
+          // a hint to the register allocator to use VCC and then we will run
+          // this pass again after RA and shrink it if it outputs to VCC.
+          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
+          continue;
+        }
+        if (DstReg != AMDGPU::VCC)
+          continue;
+      }
+
+      // We can shrink this instruction
+      DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);
+
+      MachineInstrBuilder Inst32 =
+          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
+
+      // dst
+      Inst32.addOperand(MI.getOperand(0));
+
+      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+
+      const MachineOperand *Src1 =
+          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      if (Src1)
+        Inst32.addOperand(*Src1);
+
+      ++NumInstructionsShrunk;
+      MI.eraseFromParent();
+
+      foldImmediates(*Inst32, TII, MRI);
+      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
+
+
+    }
+  }
+  return false;
+}
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp index 367963a..9318dc1 100644 --- a/lib/Target/R600/SITypeRewriter.cpp +++ b/lib/Target/R600/SITypeRewriter.cpp @@ -87,7 +87,7 @@ void SITypeRewriter::visitLoadInst(LoadInst &I) { Value *BitCast = Builder.CreateBitCast(Ptr, PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); LoadInst *Load = Builder.CreateLoad(BitCast); - SmallVector <std::pair<unsigned, MDNode*>, 8> MD; + SmallVector<std::pair<unsigned, MDNode *>, 8> MD; I.getAllMetadataOtherThanDebugLoc(MD); for (unsigned i = 0, e = MD.size(); i != e; ++i) { Load->setMetadata(MD[i].first, MD[i].second);
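
A note on the SGPR spilling scheme added in SIMachineFunctionInfo.cpp above: spilled SGPRs are packed into the lanes of dedicated VGPRs, 64 four-byte lanes per VGPR. The snippet below is a minimal standalone sketch of just that index arithmetic; the function name spillSlotForOffset and the sample offsets are illustrative and not part of the patch.

// Standalone sketch of the offset-to-lane mapping used by
// SIMachineFunctionInfo::getSpilledReg: each lane VGPR holds 64 lanes of
// 4 bytes, so a frame-object byte offset selects one lane VGPR and one lane.
#include <cassert>
#include <cstdint>
#include <utility>

static std::pair<unsigned, unsigned> spillSlotForOffset(int64_t Offset) {
  unsigned LaneVGPRIdx = Offset / (64 * 4); // which lane VGPR backs this slot
  unsigned Lane = (Offset / 4) % 64;        // which lane inside that VGPR
  return {LaneVGPRIdx, Lane};
}

int main() {
  assert(spillSlotForOffset(0) == std::make_pair(0u, 0u));
  assert(spillSlotForOffset(4) == std::make_pair(0u, 1u));    // next 4-byte slot
  assert(spillSlotForOffset(252) == std::make_pair(0u, 63u)); // last lane of VGPR 0
  assert(spillSlotForOffset(256) == std::make_pair(1u, 0u));  // rolls over to VGPR 1
  return 0;
}

Because the mapping is pure arithmetic, one lane VGPR covers up to 64 spill slots (256 bytes of frame objects) before getSpilledReg has to reserve another unused VGPR via findUnusedVGPR.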