aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/R600
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/R600')
-rw-r--r--lib/Target/R600/AMDGPU.h16
-rw-r--r--lib/Target/R600/AMDGPU.td17
-rw-r--r--lib/Target/R600/AMDGPUAsmPrinter.cpp288
-rw-r--r--lib/Target/R600/AMDGPUAsmPrinter.h31
-rw-r--r--lib/Target/R600/AMDGPUISelDAGToDAG.cpp231
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.cpp480
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.h40
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.cpp40
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.h11
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.td18
-rw-r--r--lib/Target/R600/AMDGPUInstructions.td73
-rw-r--r--lib/Target/R600/AMDGPUMCInstLower.cpp51
-rw-r--r--lib/Target/R600/AMDGPUMCInstLower.h13
-rw-r--r--lib/Target/R600/AMDGPUMachineFunction.cpp4
-rw-r--r--lib/Target/R600/AMDGPURegisterInfo.cpp3
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.cpp75
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.h42
-rw-r--r--lib/Target/R600/AMDGPUTargetMachine.cpp217
-rw-r--r--lib/Target/R600/AMDGPUTargetMachine.h45
-rw-r--r--lib/Target/R600/AMDGPUTargetTransformInfo.cpp94
-rw-r--r--lib/Target/R600/AMDGPUTargetTransformInfo.h78
-rw-r--r--lib/Target/R600/AMDKernelCodeT.h704
-rw-r--r--lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp32
-rw-r--r--lib/Target/R600/CIInstructions.td42
-rw-r--r--lib/Target/R600/CMakeLists.txt2
-rw-r--r--lib/Target/R600/CaymanInstructions.td4
-rw-r--r--lib/Target/R600/EvergreenInstructions.td14
-rw-r--r--lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp71
-rw-r--r--lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h3
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp2
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp1
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp10
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h1
-rw-r--r--lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp4
-rw-r--r--lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp149
-rw-r--r--lib/Target/R600/Processors.td38
-rw-r--r--lib/Target/R600/R600ControlFlowFinalizer.cpp46
-rw-r--r--lib/Target/R600/R600ISelLowering.cpp62
-rw-r--r--lib/Target/R600/R600ISelLowering.h2
-rw-r--r--lib/Target/R600/R600Instructions.td36
-rw-r--r--lib/Target/R600/R600MachineScheduler.cpp7
-rw-r--r--lib/Target/R600/R600Packetizer.cpp2
-rw-r--r--lib/Target/R600/R700Instructions.td2
-rw-r--r--lib/Target/R600/SIAnnotateControlFlow.cpp27
-rw-r--r--lib/Target/R600/SIDefines.h72
-rw-r--r--lib/Target/R600/SIFixSGPRCopies.cpp36
-rw-r--r--lib/Target/R600/SIFoldOperands.cpp287
-rw-r--r--lib/Target/R600/SIISelLowering.cpp867
-rw-r--r--lib/Target/R600/SIISelLowering.h20
-rw-r--r--lib/Target/R600/SIInsertWaits.cpp117
-rw-r--r--lib/Target/R600/SIInstrFormats.td429
-rw-r--r--lib/Target/R600/SIInstrInfo.cpp621
-rw-r--r--lib/Target/R600/SIInstrInfo.h128
-rw-r--r--lib/Target/R600/SIInstrInfo.td1442
-rw-r--r--lib/Target/R600/SIInstructions.td1846
-rw-r--r--lib/Target/R600/SILoadStoreOptimizer.cpp43
-rw-r--r--lib/Target/R600/SILowerControlFlow.cpp44
-rw-r--r--lib/Target/R600/SILowerI1Copies.cpp78
-rw-r--r--lib/Target/R600/SIMachineFunctionInfo.cpp9
-rw-r--r--lib/Target/R600/SIMachineFunctionInfo.h4
-rw-r--r--lib/Target/R600/SIPrepareScratchRegs.cpp208
-rw-r--r--lib/Target/R600/SIRegisterInfo.cpp247
-rw-r--r--lib/Target/R600/SIRegisterInfo.h39
-rw-r--r--lib/Target/R600/SIRegisterInfo.td50
-rw-r--r--lib/Target/R600/SISchedule.td80
-rw-r--r--lib/Target/R600/SIShrinkInstructions.cpp37
-rw-r--r--lib/Target/R600/SITypeRewriter.cpp3
-rw-r--r--lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp6
-rw-r--r--lib/Target/R600/VIInstrFormats.td166
-rw-r--r--lib/Target/R600/VIInstructions.td25
70 files changed, 6821 insertions, 3211 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 261075e..fb87cc5 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -38,6 +38,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
// SI Passes
FunctionPass *createSITypeRewriter();
FunctionPass *createSIAnnotateControlFlowPass();
+FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
@@ -46,6 +47,10 @@ FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
FunctionPass *createSIFixSGPRLiveRangesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIInsertWaits(TargetMachine &tm);
+FunctionPass *createSIPrepareScratchRegs();
+
+void initializeSIFoldOperandsPass(PassRegistry &);
+extern char &SIFoldOperandsID;
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
@@ -59,19 +64,20 @@ Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
ModulePass *createAMDGPUAlwaysInlinePass();
-/// \brief Creates an AMDGPU-specific Target Transformation Info pass.
-ImmutablePass *
-createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM);
-
void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
extern char &SIFixSGPRLiveRangesID;
extern Target TheAMDGPUTarget;
+extern Target TheGCNTarget;
namespace AMDGPU {
enum TargetIndex {
- TI_CONSTDATA_START
+ TI_CONSTDATA_START,
+ TI_SCRATCH_RSRC_DWORD0,
+ TI_SCRATCH_RSRC_DWORD1,
+ TI_SCRATCH_RSRC_DWORD2,
+ TI_SCRATCH_RSRC_DWORD3
};
}
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 4cf1243..a7d48b3 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -48,6 +48,12 @@ def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
"Enable double precision denormal handling",
[FeatureFP64]>;
+def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
+ "FastFMAF32",
+ "true",
+ "Assuming f32 fma is at least as fast as mul + add",
+ []>;
+
// Some instructions do not support denormals despite this flag. Using
// fp32 denormals also causes instructions to run at the double
// precision rate for the device.
@@ -92,6 +98,11 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"true",
"Support flat address space">;
+def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
+ "EnableVGPRSpilling",
+ "true",
+ "Enable spilling of VGPRs to scratch memory">;
+
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
"TexVTXClauseSize",
@@ -147,10 +158,16 @@ def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
[Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureFlatAddressSpace]>;
+
+def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
+ [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace]>;
+
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
let guessInstructionProperties = 1;
+ let noNamedPositionallyEncodedOperands = 1;
}
def AMDGPUAsmParser : AsmParser {
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 5511d7c..92bc314 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -18,6 +18,7 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
+#include "AMDKernelCodeT.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
@@ -57,7 +58,7 @@ using namespace llvm;
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(const MachineFunction &F) {
- const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>();
+ const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>();
// TODO: Is there any real use for the flush in only / flush out only modes?
uint32_t FP32Denormals =
@@ -72,19 +73,20 @@ static uint32_t getFPMode(const MachineFunction &F) {
FP_DENORM_MODE_DP(FP64Denormals);
}
-static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
- MCStreamer &Streamer) {
- return new AMDGPUAsmPrinter(tm, Streamer);
+static AsmPrinter *
+createAMDGPUAsmPrinterPass(TargetMachine &tm,
+ std::unique_ptr<MCStreamer> &&Streamer) {
+ return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}
extern "C" void LLVMInitializeR600AsmPrinter() {
TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
+ TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
}
-AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
- : AsmPrinter(TM, Streamer) {
- DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode();
-}
+AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
@@ -106,14 +108,17 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
EmitFunctionHeader();
MCContext &Context = getObjFileLowering().getContext();
- const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
- ELF::SHT_PROGBITS, 0,
- SectionKind::getReadOnly());
+ const MCSectionELF *ConfigSection =
+ Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
OutStreamer.SwitchSection(ConfigSection);
- const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
- if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ if (STM.isAmdHsaOS()) {
+ getSIProgramInfo(KernelInfo, MF);
+ EmitAmdKernelCodeT(MF, KernelInfo);
+ // Align the code to 2^alignment bytes. Shifting 1 by the alignment keeps
+ // this well-defined when the function alignment is 0, where the previous
+ // "2 << (alignment - 1)" form shifted by a wrapped-around unsigned value.
+ OutStreamer.EmitCodeAlignment(1u << MF.getAlignment());
+ } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
getSIProgramInfo(KernelInfo, MF);
EmitProgramInfoSI(MF, KernelInfo);
} else {
@@ -128,10 +133,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
EmitFunctionBody();
if (isVerbose()) {
- const MCSectionELF *CommentSection
- = Context.getELFSection(".AMDGPU.csdata",
- ELF::SHT_PROGBITS, 0,
- SectionKind::getReadOnly());
+ const MCSectionELF *CommentSection =
+ Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
OutStreamer.SwitchSection(CommentSection);
if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
@@ -156,22 +159,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
if (STM.dumpCode()) {
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- MF.dump();
-#endif
- if (DisasmEnabled) {
- OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
- ELF::SHT_NOTE, 0,
- SectionKind::getReadOnly()));
+ OutStreamer.SwitchSection(
+ Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
- for (size_t i = 0; i < DisasmLines.size(); ++i) {
- std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
- Comment += " ; " + HexLines[i] + "\n";
+ for (size_t i = 0; i < DisasmLines.size(); ++i) {
+ std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
+ Comment += " ; " + HexLines[i] + "\n";
- OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
- OutStreamer.EmitBytes(StringRef(Comment));
- }
+ OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
+ OutStreamer.EmitBytes(StringRef(Comment));
}
}
@@ -181,10 +178,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
unsigned MaxGPR = 0;
bool killPixel = false;
- const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>(
- TM.getSubtargetImpl()->getRegisterInfo());
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const R600RegisterInfo *RI =
+ static_cast<const R600RegisterInfo *>(STM.getRegisterInfo());
const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
- const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
@@ -240,13 +237,15 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) const {
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
uint64_t CodeSize = 0;
unsigned MaxSGPR = 0;
unsigned MaxVGPR = 0;
bool VCCUsed = false;
bool FlatUsed = false;
- const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>(
- TM.getSubtargetImpl()->getRegisterInfo());
+ const SIRegisterInfo *RI =
+ static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
@@ -285,7 +284,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (AMDGPU::SReg_32RegClass.contains(reg)) {
isSGPR = true;
width = 1;
- } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
+ } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
isSGPR = false;
width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(reg)) {
@@ -340,6 +339,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.NumVGPR = MaxVGPR + 1;
ProgInfo.NumSGPR = MaxSGPR + 1;
+ ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
+ ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
// register.
ProgInfo.FloatMode = getFPMode(MF);
@@ -356,21 +357,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.FlatUsed = FlatUsed;
ProgInfo.VCCUsed = VCCUsed;
ProgInfo.CodeLen = CodeSize;
-}
-
-void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
- const SIProgramInfo &KernelInfo) {
- const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- unsigned RsrcReg;
- switch (MFI->getShaderType()) {
- default: // Fall through
- case ShaderType::COMPUTE: RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break;
- case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break;
- case ShaderType::PIXEL: RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break;
- case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
- }
unsigned LDSAlignShift;
if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
@@ -384,59 +370,203 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
MFI->getMaximumWorkGroupSize(MF);
- unsigned LDSBlocks =
- RoundUpToAlignment(MFI->LDSSize + LDSSpillSize,
- 1 << LDSAlignShift) >> LDSAlignShift;
+ ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
+ ProgInfo.LDSBlocks =
+ RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
// Scratch is allocated in 256 dword blocks.
unsigned ScratchAlignShift = 10;
// We need to program the hardware with the amount of scratch memory that
- // is used by the entire wave. KernelInfo.ScratchSize is the amount of
+ // is used by the entire wave. ProgInfo.ScratchSize is the amount of
// scratch memory used per thread.
- unsigned ScratchBlocks =
- RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
+ ProgInfo.ScratchBlocks =
+ RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(),
1 << ScratchAlignShift) >> ScratchAlignShift;
- unsigned VGPRBlocks = (KernelInfo.NumVGPR - 1) / 4;
- unsigned SGPRBlocks = (KernelInfo.NumSGPR - 1) / 8;
+ // Pack the COMPUTE_PGM_RSRC1 value from the individual program-info fields.
+ // DebugMode is packed with its own field macro; it was previously passed
+ // through S_00B848_IEEE_MODE, which OR-ed it into the IEEE mode bit.
+ ProgInfo.ComputePGMRSrc1 =
+ S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
+ S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
+ S_00B848_PRIORITY(ProgInfo.Priority) |
+ S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+ S_00B848_PRIV(ProgInfo.Priv) |
+ S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
+ S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
+ S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
+
+ ProgInfo.ComputePGMRSrc2 =
+ S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
+ S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
+ S_00B84C_TGID_X_EN(1) |
+ S_00B84C_TGID_Y_EN(1) |
+ S_00B84C_TGID_Z_EN(1) |
+ S_00B84C_TG_SIZE_EN(1) |
+ S_00B84C_TIDIG_COMP_CNT(2) |
+ S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
+}
+
+/// Returns the PGM_RSRC1 register constant that corresponds to \p ShaderType.
+/// Geometry, pixel and vertex shaders each map to their own constant; compute
+/// and any other value map to the compute one.
+static unsigned getRsrcReg(unsigned ShaderType) {
+  if (ShaderType == ShaderType::GEOMETRY)
+    return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
+  if (ShaderType == ShaderType::PIXEL)
+    return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
+  if (ShaderType == ShaderType::VERTEX)
+    return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
+  // ShaderType::COMPUTE and every unrecognized shader type.
+  return R_00B848_COMPUTE_PGM_RSRC1;
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
+ const SIProgramInfo &KernelInfo) {
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
if (MFI->getShaderType() == ShaderType::COMPUTE) {
OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
- const uint32_t ComputePGMRSrc1 =
- S_00B848_VGPRS(VGPRBlocks) |
- S_00B848_SGPRS(SGPRBlocks) |
- S_00B848_PRIORITY(KernelInfo.Priority) |
- S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
- S_00B848_PRIV(KernelInfo.Priv) |
- S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) |
- S_00B848_IEEE_MODE(KernelInfo.DebugMode) |
- S_00B848_IEEE_MODE(KernelInfo.IEEEMode);
-
- OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
+ OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
- const uint32_t ComputePGMRSrc2 =
- S_00B84C_LDS_SIZE(LDSBlocks) |
- S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
-
- OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
+ OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
- OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
+ OutStreamer.EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
} else {
OutStreamer.EmitIntValue(RsrcReg, 4);
- OutStreamer.EmitIntValue(S_00B028_VGPRS(VGPRBlocks) |
- S_00B028_SGPRS(SGPRBlocks), 4);
+ OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
+ S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
+ if (STM.isVGPRSpillingEnabled(MFI)) {
+ OutStreamer.EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
+ OutStreamer.EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+ }
}
if (MFI->getShaderType() == ShaderType::PIXEL) {
OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
- OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
+ OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
}
}
+
+void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
+ const SIProgramInfo &KernelInfo) const {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ amd_kernel_code_t header;
+
+ memset(&header, 0, sizeof(header));
+
+ header.amd_code_version_major = AMD_CODE_VERSION_MAJOR;
+ header.amd_code_version_minor = AMD_CODE_VERSION_MINOR;
+
+ header.struct_byte_size = sizeof(amd_kernel_code_t);
+
+ header.target_chip = STM.getAmdKernelCodeChipID();
+
+ header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment());
+
+ header.compute_pgm_resource_registers =
+ KernelInfo.ComputePGMRSrc1 |
+ (KernelInfo.ComputePGMRSrc2 << 32);
+
+ // Code Properties:
+ header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
+ AMD_CODE_PROPERTY_IS_PTR64;
+
+ if (KernelInfo.FlatUsed)
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+ if (KernelInfo.ScratchBlocks)
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
+
+ header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+ header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
+
+ // MFI->ABIArgOffset is the number of bytes for the kernel arguments
+ // plus 36. 36 is the number of bytes reserved at the beginning of the
+ // input buffer to store work-group size information.
+ // FIXME: We should be adding the size of the implicit arguments
+ // to this value.
+ header.kernarg_segment_byte_size = MFI->ABIArgOffset;
+
+ header.wavefront_sgpr_count = KernelInfo.NumSGPR;
+ header.workitem_vgpr_count = KernelInfo.NumVGPR;
+
+ // FIXME: What values do I put for these alignments
+ header.kernarg_segment_alignment = 0;
+ header.group_segment_alignment = 0;
+ header.private_segment_alignment = 0;
+
+ header.code_type = 1; // HSA_EXT_CODE_KERNEL
+
+ header.wavefront_size = STM.getWavefrontSize();
+
+ const MCSectionELF *VersionSection =
+ OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0);
+ OutStreamer.SwitchSection(VersionSection);
+ OutStreamer.EmitBytes(Twine("HSA Code Unit:" +
+ Twine(header.hsail_version_major) + "." +
+ Twine(header.hsail_version_minor) + ":" +
+ "AMD:" +
+ Twine(header.amd_code_version_major) + "." +
+ Twine(header.amd_code_version_minor) + ":" +
+ "GFX8.1:0").str());
+
+ OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+
+ if (isVerbose()) {
+ OutStreamer.emitRawComment("amd_code_version_major = " +
+ Twine(header.amd_code_version_major), false);
+ OutStreamer.emitRawComment("amd_code_version_minor = " +
+ Twine(header.amd_code_version_minor), false);
+ OutStreamer.emitRawComment("struct_byte_size = " +
+ Twine(header.struct_byte_size), false);
+ OutStreamer.emitRawComment("target_chip = " +
+ Twine(header.target_chip), false);
+ OutStreamer.emitRawComment(" compute_pgm_rsrc1: " +
+ Twine::utohexstr(KernelInfo.ComputePGMRSrc1), false);
+ OutStreamer.emitRawComment(" compute_pgm_rsrc2: " +
+ Twine::utohexstr(KernelInfo.ComputePGMRSrc2), false);
+ OutStreamer.emitRawComment("enable_sgpr_private_segment_buffer = " +
+ Twine((bool)(header.code_properties &
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false);
+ OutStreamer.emitRawComment("enable_sgpr_kernarg_segment_ptr = " +
+ Twine((bool)(header.code_properties &
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false);
+ OutStreamer.emitRawComment("private_element_size = 2 ", false);
+ OutStreamer.emitRawComment("is_ptr64 = " +
+ Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false);
+ OutStreamer.emitRawComment("workitem_private_segment_byte_size = " +
+ Twine(header.workitem_private_segment_byte_size),
+ false);
+ OutStreamer.emitRawComment("workgroup_group_segment_byte_size = " +
+ Twine(header.workgroup_group_segment_byte_size),
+ false);
+ OutStreamer.emitRawComment("gds_segment_byte_size = " +
+ Twine(header.gds_segment_byte_size), false);
+ OutStreamer.emitRawComment("kernarg_segment_byte_size = " +
+ Twine(header.kernarg_segment_byte_size), false);
+ OutStreamer.emitRawComment("wavefront_sgpr_count = " +
+ Twine(header.wavefront_sgpr_count), false);
+ OutStreamer.emitRawComment("workitem_vgpr_count = " +
+ Twine(header.workitem_vgpr_count), false);
+ OutStreamer.emitRawComment("code_type = " + Twine(header.code_type), false);
+ OutStreamer.emitRawComment("wavefront_size = " +
+ Twine((int)header.wavefront_size), false);
+ OutStreamer.emitRawComment("optimization_level = " +
+ Twine(header.optimization_level), false);
+ OutStreamer.emitRawComment("hsail_profile = " +
+ Twine(header.hsail_profile), false);
+ OutStreamer.emitRawComment("hsail_machine_model = " +
+ Twine(header.hsail_machine_model), false);
+ OutStreamer.emitRawComment("hsail_version_major = " +
+ Twine(header.hsail_version_major), false);
+ OutStreamer.emitRawComment("hsail_version_minor = " +
+ Twine(header.hsail_version_minor), false);
+ }
+
+ OutStreamer.EmitBytes(StringRef((char*)&header, sizeof(header)));
+}
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index b9a0767..58ffb1e 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -24,8 +24,8 @@ class AMDGPUAsmPrinter : public AsmPrinter {
private:
struct SIProgramInfo {
SIProgramInfo() :
- NumVGPR(0),
- NumSGPR(0),
+ VGPRBlocks(0),
+ SGPRBlocks(0),
Priority(0),
FloatMode(0),
Priv(0),
@@ -33,13 +33,19 @@ private:
DebugMode(0),
IEEEMode(0),
ScratchSize(0),
+ ComputePGMRSrc1(0),
+ LDSBlocks(0),
+ ScratchBlocks(0),
+ ComputePGMRSrc2(0),
+ NumVGPR(0),
+ NumSGPR(0),
FlatUsed(false),
VCCUsed(false),
CodeLen(0) {}
// Fields set in PGM_RSRC1 pm4 packet.
- uint32_t NumVGPR;
- uint32_t NumSGPR;
+ uint32_t VGPRBlocks;
+ uint32_t SGPRBlocks;
uint32_t Priority;
uint32_t FloatMode;
uint32_t Priv;
@@ -48,6 +54,17 @@ private:
uint32_t IEEEMode;
uint32_t ScratchSize;
+ uint64_t ComputePGMRSrc1;
+
+ // Fields set in PGM_RSRC2 pm4 packet.
+ uint32_t LDSBlocks;
+ uint32_t ScratchBlocks;
+
+ uint64_t ComputePGMRSrc2;
+
+ uint32_t NumVGPR;
+ uint32_t NumSGPR;
+ uint32_t LDSSize;
bool FlatUsed;
// Bonus information for debugging.
@@ -64,9 +81,12 @@ private:
/// can correctly setup the GPU state.
void EmitProgramInfoR600(const MachineFunction &MF);
void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
+ void EmitAmdKernelCodeT(const MachineFunction &MF,
+ const SIProgramInfo &KernelInfo) const;
public:
- explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
+ explicit AMDGPUAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer);
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -80,7 +100,6 @@ public:
void EmitEndOfAsmFile(Module &M) override;
protected:
- bool DisasmEnabled;
std::vector<std::string> DisasmLines, HexLines;
size_t DisasmLineMaxLen;
};
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index 90b6672..b5ab703 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -39,11 +39,11 @@ namespace {
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
// make the right decision when generating code for different targets.
- const AMDGPUSubtarget &Subtarget;
+ const AMDGPUSubtarget *Subtarget;
public:
AMDGPUDAGToDAGISel(TargetMachine &TM);
virtual ~AMDGPUDAGToDAGISel();
-
+ bool runOnMachineFunction(MachineFunction &MF) override;
SDNode *Select(SDNode *N) override;
const char *getPassName() const override;
void PostprocessISelDAG() override;
@@ -95,9 +95,9 @@ private:
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
- SDValue &Offset) const;
+ SDValue &SOffset, SDValue &Offset) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
- SDValue &VAddr, SDValue &Offset,
+ SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
SDValue &SLC) const;
bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
SDValue &SOffset, SDValue &ImmOffset) const;
@@ -113,6 +113,9 @@ private:
bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Omod) const;
+ bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp,
+ SDValue &Omod) const;
SDNode *SelectADD_SUB_I64(SDNode *N);
SDNode *SelectDIV_SCALE(SDNode *N);
@@ -129,7 +132,11 @@ FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) {
}
AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
- : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) {
+ : SelectionDAGISel(TM) {}
+
+bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget());
+ return SelectionDAGISel::runOnMachineFunction(MF);
}
AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
@@ -153,7 +160,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
switch (N->getMachineOpcode()) {
default: {
const MCInstrDesc &Desc =
- TM.getSubtargetImpl()->getInstrInfo()->get(N->getMachineOpcode());
+ Subtarget->getInstrInfo()->get(N->getMachineOpcode());
unsigned OpIdx = Desc.getNumDefs() + OpNo;
if (OpIdx >= Desc.getNumOperands())
return nullptr;
@@ -161,17 +168,17 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
if (RegClass == -1)
return nullptr;
- return TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RegClass);
+ return Subtarget->getRegisterInfo()->getRegClass(RegClass);
}
case AMDGPU::REG_SEQUENCE: {
unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
const TargetRegisterClass *SuperRC =
- TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RCID);
+ Subtarget->getRegisterInfo()->getRegClass(RCID);
SDValue SubRegOp = N->getOperand(OpNo + 1);
unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
- return TM.getSubtargetImpl()->getRegisterInfo()->getSubClassWithSubReg(
- SuperRC, SubRegIdx);
+ return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
+ SubRegIdx);
}
}
}
@@ -241,7 +248,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return nullptr; // Already selected.
}
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
switch (Opc) {
default: break;
// We are selecting i64 ADD here instead of custom lower it during
@@ -250,7 +256,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::ADD:
case ISD::SUB: {
if (N->getValueType(0) != MVT::i64 ||
- ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
return SelectADD_SUB_I64(N);
@@ -259,15 +265,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
case AMDGPUISD::BUILD_VERTICAL_VECTOR:
case ISD::BUILD_VECTOR: {
unsigned RegClassID;
- const AMDGPURegisterInfo *TRI = static_cast<const AMDGPURegisterInfo *>(
- TM.getSubtargetImpl()->getRegisterInfo());
- const SIRegisterInfo *SIRI = static_cast<const SIRegisterInfo *>(
- TM.getSubtargetImpl()->getRegisterInfo());
+ const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsEq(MVT::i32));
- if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
bool UseVReg = true;
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
U != E; ++U) {
@@ -278,12 +281,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
if (!RC) {
continue;
}
- if (SIRI->isSGPRClass(RC)) {
+ if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) {
UseVReg = false;
}
}
switch(NumVectorElts) {
- case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
+ case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID :
AMDGPU::SReg_32RegClassID;
break;
case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
@@ -365,7 +368,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
break;
}
if (N->getValueType(0) == MVT::i128) {
@@ -387,8 +390,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant:
case ISD::ConstantFP: {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
- if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
break;
@@ -414,8 +416,55 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
N->getValueType(0), Ops);
}
+ case ISD::LOAD: {
+ // To simplify the TableGen patterns, we replace all i64 loads with
+ // v2i32 loads. Alternatively, we could promote i64 loads to v2i32
+ // during DAG legalization; however, some places (ExpandUnalignedLoad)
+ // in the DAG legalizer make assumptions when i64 is legal, so doing
+ // this promotion early can cause problems.
+ EVT VT = N->getValueType(0);
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
+ break;
+
+ SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
+ LD->getBasePtr(), LD->getMemOperand());
+ SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+ MVT::i64, NewLoad);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
+ SelectCode(NewLoad.getNode());
+ N = BitCast.getNode();
+ break;
+ }
+
+ case ISD::STORE: {
+ // Handle i64 stores here for the same reason mentioned above for loads.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Value = ST->getValue();
+ if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
+ break;
+
+ SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+ MVT::v2i32, Value);
+ SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
+ ST->getBasePtr(), ST->getMemOperand());
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
+
+ if (NewValue.getOpcode() == ISD::BITCAST) {
+ Select(NewStore.getNode());
+ return SelectCode(NewValue.getNode());
+ }
+
+ // getNode() may fold the bitcast if its input was another bitcast. If that
+ // happens we should only select the new store.
+ N = NewStore.getNode();
+ break;
+ }
+
case AMDGPUISD::REGISTER_LOAD: {
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+ if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
@@ -431,7 +480,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
Ops);
}
case AMDGPUISD::REGISTER_STORE: {
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+ if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
SelectADDRIndirect(N->getOperand(2), Addr, Offset);
@@ -449,7 +498,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
- if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
// There is a scalar version available, but unlike the vector version which
@@ -554,13 +603,11 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const {
}
bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
- if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
- if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- N->getMemoryVT().bitsLT(MVT::i32)) {
+ if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+ N->getMemoryVT().bitsLT(MVT::i32))
return true;
- }
- }
+
return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
}
@@ -736,6 +783,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
}
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
SDLoc SL(N);
EVT VT = N->getValueType(0);
@@ -745,30 +794,22 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
unsigned Opc
= (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
- const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
- const SDValue False = CurDAG->getTargetConstant(0, MVT::i1);
- SDValue Ops[] = {
- Zero, // src0_modifiers
- N->getOperand(0), // src0
- Zero, // src1_modifiers
- N->getOperand(1), // src1
- Zero, // src2_modifiers
- N->getOperand(2), // src2
- False, // clamp
- Zero // omod
- };
+ // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+ SDValue Ops[8];
+ SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
+ SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
+ SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
}
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
unsigned OffsetBits) const {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
(OffsetBits == 8 && !isUInt<8>(Offset)))
return false;
- if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
return true;
// On Southern Islands instruction with a negative base value and an offset
@@ -879,26 +920,32 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue N1 = Addr.getOperand(1);
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- if (isLegalMUBUFImmOffset(C1)) {
-
- if (N0.getOpcode() == ISD::ADD) {
- // (add (add N2, N3), C1) -> addr64
- SDValue N2 = N0.getOperand(0);
- SDValue N3 = N0.getOperand(1);
- Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
- Ptr = N2;
- VAddr = N3;
- Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
- return;
- }
+ if (N0.getOpcode() == ISD::ADD) {
+ // (add (add N2, N3), C1) -> addr64
+ SDValue N2 = N0.getOperand(0);
+ SDValue N3 = N0.getOperand(1);
+ Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
+ Ptr = N2;
+ VAddr = N3;
+ } else {
// (add N0, C1) -> offset
VAddr = CurDAG->getTargetConstant(0, MVT::i32);
Ptr = N0;
- Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+ }
+
+ if (isLegalMUBUFImmOffset(C1)) {
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+ return;
+ } else if (isUInt<32>(C1->getZExtValue())) {
+ // Illegal offset, store it in soffset.
+ Offset = CurDAG->getTargetConstant(0, MVT::i16);
+ SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i32)), 0);
return;
}
}
+
if (Addr.getOpcode() == ISD::ADD) {
// (add N0, N1) -> addr64
SDValue N0 = Addr.getOperand(0);
@@ -918,9 +965,9 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
}
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
- SDValue &VAddr,
+ SDValue &VAddr, SDValue &SOffset,
SDValue &Offset) const {
- SDValue Ptr, SOffset, Offen, Idxen, Addr64, GLC, SLC, TFE;
+ SDValue Ptr, Offen, Idxen, Addr64, GLC, SLC, TFE;
SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
GLC, SLC, TFE);
@@ -940,11 +987,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
}
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
- SDValue &VAddr, SDValue &Offset,
- SDValue &SLC) const {
+ SDValue &VAddr, SDValue &SOffset,
+ SDValue &Offset,
+ SDValue &SLC) const {
SLC = CurDAG->getTargetConstant(0, MVT::i1);
- return SelectMUBUFAddr64(Addr, SRsrc, VAddr, Offset);
+ return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset);
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
@@ -954,21 +1002,32 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
SDLoc DL(Addr);
MachineFunction &MF = CurDAG->getMachineFunction();
const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
const SITargetLowering& Lowering =
*static_cast<const SITargetLowering*>(getTargetLowering());
- unsigned ScratchPtrReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
unsigned ScratchOffsetReg =
TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
ScratchOffsetReg, MVT::i32);
+ SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
+ SDValue ScratchRsrcDword0 =
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
+
+ SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
+ SDValue ScratchRsrcDword1 =
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
- SDValue ScratchPtr =
- CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
- MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64);
+ const SDValue RsrcOps[] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+ ScratchRsrcDword0,
+ CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+ ScratchRsrcDword1,
+ CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
+ };
+ SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::v2i32, RsrcOps), 0);
Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
@@ -985,22 +1044,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
}
}
- // (add FI, n0)
- if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
- isa<FrameIndexSDNode>(Addr.getOperand(0))) {
- VAddr = Addr.getOperand(1);
- ImmOffset = Addr.getOperand(0);
- return true;
- }
-
- // (FI)
- if (isa<FrameIndexSDNode>(Addr)) {
- VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
- CurDAG->getConstant(0, MVT::i32)), 0);
- ImmOffset = Addr;
- return true;
- }
-
// (node)
VAddr = Addr;
ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
@@ -1012,6 +1055,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &GLC, SDValue &SLC,
SDValue &TFE) const {
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
GLC, SLC, TFE);
@@ -1019,7 +1064,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
!cast<ConstantSDNode>(Idxen)->getSExtValue() &&
!cast<ConstantSDNode>(Addr64)->getSExtValue()) {
- uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT |
+ uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
APInt::getAllOnesValue(32).getZExtValue(); // Size
SDLoc DL(Addr);
@@ -1045,7 +1090,7 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
SDLoc DL(N);
- assert(Subtarget.hasFlatAddressSpace() &&
+ assert(Subtarget->hasFlatAddressSpace() &&
"addrspacecast only supported with flat address space!");
assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
@@ -1081,7 +1126,9 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
if (DestSize > SrcSize) {
assert(SrcSize == 32 && DestSize == 64);
- SDValue RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
+ // FIXME: This is probably wrong; we should never be defining
+ // a register class with both VGPRs and SGPRs.
+ SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, MVT::i32);
const SDValue Ops[] = {
RC,
@@ -1141,6 +1188,14 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
+ SDValue &SrcMods,
+ SDValue &Clamp,
+ SDValue &Omod) const {
+ Clamp = Omod = CurDAG->getTargetConstant(0, MVT::i32);
+ return SelectVOP3Mods(In, Src, SrcMods);
+}
+
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 2f95b74..4707279 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -102,11 +102,9 @@ EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
-AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
- TargetLowering(TM) {
-
- Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
-
+AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
+ const AMDGPUSubtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
setOperationAction(ISD::Constant, MVT::i32, Legal);
setOperationAction(ISD::Constant, MVT::i64, Legal);
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -127,12 +125,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FABS, MVT::f32, Legal);
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FROUND, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FROUND, MVT::f32, Custom);
+ setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
setOperationAction(ISD::FREM, MVT::f32, Custom);
setOperationAction(ISD::FREM, MVT::f64, Custom);
+ // v_mad_f32 does not support denormals according to some sources.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ // Expand to fneg + fadd.
+ setOperationAction(ISD::FSUB, MVT::f64, Expand);
+
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
@@ -141,9 +148,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::v2f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
- setOperationAction(ISD::STORE, MVT::i64, Promote);
- AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
-
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
@@ -162,9 +166,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
// Custom lowering of vector stores is required for local address space
// stores.
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
- // XXX: Native v2i32 local address space stores are possible, but not
- // currently implemented.
- setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
@@ -187,9 +188,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
- setOperationAction(ISD::LOAD, MVT::i64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
-
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
@@ -216,18 +214,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
+ // There are no 64-bit extloads. These should be done as a 32-bit extload and
+ // an extension to 64-bit.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
+ }
+
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
+ }
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -246,7 +254,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
@@ -382,6 +391,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
+
+ setBooleanContents(ZeroOrNegativeOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
@@ -397,6 +412,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
// large sequence of instructions.
setIntDivIsCheap(false);
setPow2SDivIsCheap(false);
+ setFsqrtIsCheap(true);
// FIXME: Need to really handle these.
MaxStoresPerMemcpy = 4096;
@@ -429,6 +445,29 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}
+bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
+ ISD::LoadExtType,
+ EVT NewVT) const {
+
+ unsigned NewSize = NewVT.getStoreSizeInBits();
+
+ // If we are reducing to a 32-bit load, this is always better.
+ if (NewSize == 32)
+ return true;
+
+ EVT OldVT = N->getValueType(0);
+ unsigned OldSize = OldVT.getStoreSizeInBits();
+
+ // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
+ // extloads, so doing one requires using a buffer_load. In cases where we
+ // still couldn't use a scalar load, using the wider load shouldn't really
+ // hurt anything.
+
+ // If the old size already had to be an extload, there's no harm in continuing
+ // to reduce the width.
+ return (OldSize < 32);
+}
+
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
EVT CastTy) const {
if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
@@ -442,6 +481,18 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
(LScalarSize < 32));
}
+// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
+// profitable with the expansion for 64-bit since it's generally good to
+// speculate things.
+// FIXME: These should really have the size as a parameter.
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
+ return true;
+}
+
+bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
+ return true;
+}
+
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
@@ -560,6 +611,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
case ISD::FRINT: return LowerFRINT(Op, DAG);
case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
+ case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
@@ -619,7 +671,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
const SDValue &InitPtr,
SDValue Chain,
SelectionDAG &DAG) const {
- const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
+ const DataLayout *TD = getDataLayout();
SDLoc DL(InitPtr);
Type *InitTy = Init->getType();
@@ -707,7 +759,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue Op,
SelectionDAG &DAG) const {
- const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
+ const DataLayout *TD = getDataLayout();
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
@@ -810,8 +862,7 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
- getTargetMachine().getSubtargetImpl()->getFrameLowering());
+ const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering();
FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
@@ -866,10 +917,9 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::AMDGPU_div_fmas:
- // FIXME: Dropping bool parameter. Work is needed to support the implicit
- // read from VCC.
return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(4));
case Intrinsic::AMDGPU_div_fixup:
return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
@@ -889,7 +939,19 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
case Intrinsic::AMDGPU_rsq_clamped:
- return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ Type *Type = VT.getTypeForEVT(*DAG.getContext());
+ APFloat Max = APFloat::getLargest(Type->getFltSemantics());
+ APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
+
+ SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+ SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
+ DAG.getConstantFP(Max, VT));
+ return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
+ DAG.getConstantFP(Min, VT));
+ } else {
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+ }
case Intrinsic::AMDGPU_ldexp:
return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1),
@@ -962,6 +1024,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case AMDGPUIntrinsic::AMDGPU_brev:
return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));
+ case Intrinsic::AMDGPU_class:
+ return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+
case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
@@ -1000,17 +1066,21 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
}
/// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
- SDValue CC,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
+ EVT VT,
+ SDValue LHS,
+ SDValue RHS,
+ SDValue True,
+ SDValue False,
+ SDValue CC,
+ DAGCombinerInfo &DCI) const {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return SDValue();
+
if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
return SDValue();
+ SelectionDAG &DAG = DCI.DAG;
ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
switch (CCOpcode) {
case ISD::SETOEQ:
@@ -1027,27 +1097,47 @@ SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL,
case ISD::SETO:
break;
case ISD::SETULE:
- case ISD::SETULT:
+ case ISD::SETULT: {
+ if (LHS == True)
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+ }
case ISD::SETOLE:
case ISD::SETOLT:
case ISD::SETLE:
case ISD::SETLT: {
+ // Ordered. Assume ordered for undefined.
+
+ // Only do this after legalization to avoid interfering with other combines
+ // which might occur.
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+ !DCI.isCalledByLegalizer())
+ return SDValue();
+
// We need to permute the operands to get the correct NaN behavior. The
// selected operand is the second one based on the failing compare with NaN,
// so permute it based on the compare type the hardware uses.
if (LHS == True)
- return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
- return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+ }
+ case ISD::SETUGE:
+ case ISD::SETUGT: {
+ if (LHS == True)
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
}
case ISD::SETGT:
case ISD::SETGE:
- case ISD::SETUGE:
case ISD::SETOGE:
- case ISD::SETUGT:
case ISD::SETOGT: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+ !DCI.isCalledByLegalizer())
+ return SDValue();
+
if (LHS == True)
- return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
- return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
}
case ISD::SETCC_INVALID:
llvm_unreachable("Invalid setcc condcode!");
@@ -1330,24 +1420,6 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
EVT MemVT = Load->getMemoryVT();
- if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
- // We can do the extload to 32-bits, and then need to separately extend to
- // 64-bits.
-
- SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
- Load->getChain(),
- Load->getBasePtr(),
- MemVT,
- Load->getMemOperand());
-
- SDValue Ops[] = {
- DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
- ExtLoad32.getValue(1)
- };
-
- return DAG.getMergeValues(Ops, DL);
- }
-
if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
assert(VT == MVT::i1 && "Only i1 non-extloads expected");
// FIXME: Copied from PPC
@@ -1586,12 +1658,26 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
+ if (VT == MVT::i64 &&
+ DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
+ DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
+
+ SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
+ LHS_Lo, RHS_Lo);
+
+ SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero);
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero);
+ Results.push_back(DIV);
+ Results.push_back(REM);
+ return;
+ }
+
// Get Speculative values
SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
- SDValue REM_Hi = zero;
SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero);
SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
SDValue DIV_Lo = zero;
@@ -1599,8 +1685,10 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
const unsigned halfBitWidth = HalfVT.getSizeInBits();
for (unsigned i = 0; i < halfBitWidth; ++i) {
- SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
- // Get Value of high bit
+ const unsigned bitPos = halfBitWidth - i - 1;
+ SDValue POS = DAG.getConstant(bitPos, HalfVT);
+ // Get value of high bit
+ // TODO: Remove the BFE part when the optimization is fixed
SDValue HBit;
if (halfBitWidth == 32 && Subtarget->hasBFE()) {
HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
@@ -1608,33 +1696,23 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
}
+ HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
- SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
- DAG.getConstant(halfBitWidth - 1, HalfVT));
- REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
- REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
-
- REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
- REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
+ // Shift
+ REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, VT));
+ // Add LHS high bit
+ REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
-
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
-
- SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
+ SDValue BIT = DAG.getConstant(1 << bitPos, HalfVT);
SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
// Update REM
-
SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
-
REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
- REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
- REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
}
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
Results.push_back(DIV);
Results.push_back(REM);
@@ -1655,8 +1733,8 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
SDValue Den = Op.getOperand(1);
if (VT == MVT::i32) {
- if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) &&
- DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) {
+ if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
+ DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
// TODO: We technically could do this for i64, but shouldn't that just be
// handled by something generally reducing 64-bit division on 32-bit
// values to 32-bit?
@@ -1768,19 +1846,31 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- if (VT == MVT::i32) {
- if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 &&
- DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) {
- // TODO: We technically could do this for i64, but shouldn't that just be
- // handled by something generally reducing 64-bit division on 32-bit
- // values to 32-bit?
- return LowerDIVREM24(Op, DAG, true);
- }
- }
-
SDValue Zero = DAG.getConstant(0, VT);
SDValue NegOne = DAG.getConstant(-1, VT);
+ if (VT == MVT::i32 &&
+ DAG.ComputeNumSignBits(LHS) > 8 &&
+ DAG.ComputeNumSignBits(RHS) > 8) {
+ return LowerDIVREM24(Op, DAG, true);
+ }
+ if (VT == MVT::i64 &&
+ DAG.ComputeNumSignBits(LHS) > 32 &&
+ DAG.ComputeNumSignBits(RHS) > 32) {
+ EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+ // HiLo split
+ SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
+ SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
+ SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
+ LHS_Lo, RHS_Lo);
+ SDValue Res[2] = {
+ DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
+ DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
+ };
+ return DAG.getMergeValues(Res, DL);
+ }
+
SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
@@ -1845,6 +1935,20 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
+static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
+ const unsigned FractBits = 52;
+ const unsigned ExpBits = 11;
+
+ SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+ Hi,
+ DAG.getConstant(FractBits - 32, MVT::i32),
+ DAG.getConstant(ExpBits, MVT::i32));
+ SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+ DAG.getConstant(1023, MVT::i32));
+
+ return Exp;
+}
+
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -1860,16 +1964,9 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
// exponent.
SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
- const unsigned FractBits = 52;
- const unsigned ExpBits = 11;
+ SDValue Exp = extractF64Exponent(Hi, SL, DAG);
- // Extract the exponent.
- SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
- Hi,
- DAG.getConstant(FractBits - 32, MVT::i32),
- DAG.getConstant(ExpBits, MVT::i32));
- SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
- DAG.getConstant(1023, MVT::i32));
+ const unsigned FractBits = 52;
// Extract the sign bit.
const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
@@ -1932,6 +2029,99 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
}
+// XXX - May require not supporting f32 denormals?
+SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue X = Op.getOperand(0);
+
+ SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
+
+ SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
+
+ SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
+
+ const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32);
+ const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
+ const SDValue Half = DAG.getConstantFP(0.5, MVT::f32);
+
+ SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
+
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+
+ SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
+
+ SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
+
+ return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue X = Op.getOperand(0);
+
+ SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
+
+ const SDValue Zero = DAG.getConstant(0, MVT::i32);
+ const SDValue One = DAG.getConstant(1, MVT::i32);
+ const SDValue NegOne = DAG.getConstant(-1, MVT::i32);
+ const SDValue FiftyOne = DAG.getConstant(51, MVT::i32);
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+
+
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
+
+ SDValue Exp = extractF64Exponent(Hi, SL, DAG);
+
+ const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64);
+
+ SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
+ SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
+ DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64),
+ Exp);
+
+ SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
+ SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
+ DAG.getConstant(0, MVT::i64), Tmp0,
+ ISD::SETNE);
+
+ SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
+ D, DAG.getConstant(0, MVT::i64));
+ SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
+
+ K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
+ K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
+
+ SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+ SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+ SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
+
+ SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
+ ExpEqNegOne,
+ DAG.getConstantFP(1.0, MVT::f64),
+ DAG.getConstantFP(0.0, MVT::f64));
+
+ SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
+
+ K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
+ K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
+
+ return K;
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ if (VT == MVT::f32)
+ return LowerFROUND32(Op, DAG);
+
+ if (VT == MVT::f64)
+ return LowerFROUND64(Op, DAG);
+
+ llvm_unreachable("unhandled type");
+}
+
SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -2155,7 +2345,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SDValue Value = SN->getValue();
EVT VT = Value.getValueType();
- if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode()))
+ if (isTypeLegal(VT) || SN->isVolatile() ||
+ !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8)
return SDValue();
LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
@@ -2231,27 +2422,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
simplifyI24(N1, DCI);
return SDValue();
}
- case ISD::SELECT_CC: {
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- if (VT == MVT::f32 ||
- (VT == MVT::f64 &&
- Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- SDValue True = N->getOperand(2);
- SDValue False = N->getOperand(3);
- SDValue CC = N->getOperand(4);
-
- return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
- }
-
- break;
- }
case ISD::SELECT: {
SDValue Cond = N->getOperand(0);
- if (Cond.getOpcode() == ISD::SETCC) {
+ if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue LHS = Cond.getOperand(0);
@@ -2261,11 +2434,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
SDValue True = N->getOperand(1);
SDValue False = N->getOperand(2);
- if (VT == MVT::f32 ||
- (VT == MVT::f64 &&
- Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) {
- return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
- }
+ if (VT == MVT::f32)
+ return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
// TODO: Implement min / max Evergreen instructions.
if (VT == MVT::i32 &&
@@ -2451,7 +2621,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(CLAMP)
- NODE_NAME_CASE(MAD)
NODE_NAME_CASE(FMAX_LEGACY)
NODE_NAME_CASE(SMAX)
NODE_NAME_CASE(UMAX)
@@ -2474,6 +2643,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RSQ_LEGACY)
NODE_NAME_CASE(RSQ_CLAMPED)
NODE_NAME_CASE(LDEXP)
+ NODE_NAME_CASE(FP_CLASS)
NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(BFE_U32)
NODE_NAME_CASE(BFE_I32)
@@ -2505,6 +2675,46 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
+SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = Operand.getValueType();
+
+ if (VT == MVT::f32) {
+ RefinementSteps = 0;
+ return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
+ }
+
+ // TODO: There is also an f64 rsq instruction, but the documentation is less
+ // clear on its precision.
+
+ return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = Operand.getValueType();
+
+ if (VT == MVT::f32) {
+ // Reciprocal, < 1 ulp error.
+ //
+ // This reciprocal approximation converges to < 0.5 ulp error with one
+ // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
+
+ RefinementSteps = 0;
+ return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
+ }
+
+ // TODO: There is also an f64 rcp instruction, but the documentation is less
+ // clear on its precision.
+
+ return SDValue();
+}
+
static void computeKnownBitsForMinMax(const SDValue Op0,
const SDValue Op1,
APInt &KnownZero,
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 36b4ee6..6bc6ca5 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -43,12 +43,15 @@ private:
/// \brief Split a vector store into multiple scalar stores.
/// \returns The resulting chain.
- SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
@@ -86,6 +89,7 @@ protected:
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) const;
@@ -106,7 +110,7 @@ protected:
const SmallVectorImpl<ISD::InputArg> &Ins) const;
public:
- AMDGPUTargetLowering(TargetMachine &TM);
+ AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
bool isFAbsFree(EVT VT) const override;
bool isFNegFree(EVT VT) const override;
@@ -124,8 +128,14 @@ public:
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
bool ShouldShrinkFPConstant(EVT VT) const override;
+ bool shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtType,
+ EVT ExtVT) const override;
bool isLoadBitCastBeneficial(EVT, EVT) const override;
+ bool isCheapToSpeculateCttz() const override;
+ bool isCheapToSpeculateCtlz() const override;
+
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -142,14 +152,14 @@ public:
SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
- SDValue CombineFMinMax(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
- SDValue CC,
- SelectionDAG &DAG) const;
+ SDValue CombineFMinMaxLegacy(SDLoc DL,
+ EVT VT,
+ SDValue LHS,
+ SDValue RHS,
+ SDValue True,
+ SDValue False,
+ SDValue CC,
+ DAGCombinerInfo &DCI) const;
SDValue CombineIMinMax(SDLoc DL,
EVT VT,
SDValue LHS,
@@ -161,6 +171,14 @@ public:
const char* getTargetNodeName(unsigned Opcode) const override;
+ SDValue getRsqrtEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const override;
+ SDValue getRecipEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const override;
+
virtual SDNode *PostISelFolding(MachineSDNode *N,
SelectionDAG &DAG) const {
return N;
@@ -200,7 +218,6 @@ enum {
DWORDADDR,
FRACT,
CLAMP,
- MAD, // Multiply + add with same result as the separate operations.
// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
// Denormals handled on some parts.
@@ -231,6 +248,7 @@ enum {
RSQ_LEGACY,
RSQ_CLAMPED,
LDEXP,
+ FP_CLASS,
DOT4,
BFE_U32, // Extract range of bits with zero extension to 32-bits.
BFE_I32, // Extract range of bits with sign extension to 32-bits.
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index a8fc614..f4de2d6 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -319,10 +319,7 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
return -1;
}
- Offset = MF.getTarget()
- .getSubtargetImpl()
- ->getFrameLowering()
- ->getFrameIndexOffset(MF, -1);
+ Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1);
return getIndirectIndexBegin(MF) + Offset;
}
@@ -341,8 +338,39 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
// instead.
namespace llvm {
namespace AMDGPU {
-int getMCOpcode(uint16_t Opcode, unsigned Gen) {
- return getMCOpcode(Opcode);
+static int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+ return getMCOpcodeGen(Opcode, (enum Subtarget)Gen);
}
}
}
+
+// This must be kept in sync with the SISubtarget class in SIInstrInfo.td
+enum SISubtarget {
+ SI = 0,
+ VI = 1
+};
+
+static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) {
+ switch (Gen) {
+ default:
+ return SI;
+ case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+ return VI;
+ }
+}
+
+int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
+ int MCOp = AMDGPU::getMCOpcode(Opcode,
+ AMDGPUSubtargetToSISubtarget(RI.ST.getGeneration()));
+
+ // -1 means that Opcode is already a native instruction.
+ if (MCOp == -1)
+ return Opcode;
+
+ // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+ // no encoding in the given subtarget generation.
+ if (MCOp == (uint16_t)-1)
+ return -1;
+
+ return MCOp;
+}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index da9833d..202183c 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -135,6 +135,17 @@ public:
bool isRegisterStore(const MachineInstr &MI) const;
bool isRegisterLoad(const MachineInstr &MI) const;
+ /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
+ /// Return -1 if the target-specific opcode for the pseudo instruction does
+ /// not exist. If Opcode is not a pseudo instruction, this is identity.
+ int pseudoToMCOpcode(int Opcode) const;
+
+ /// \brief Return the descriptor of the target-specific machine instruction
+ /// that corresponds to the specified pseudo or native opcode.
+ const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
+ return get(pseudoToMCOpcode(Opcode));
+ }
+
//===---------------------------------------------------------------------===//
// Pure virtual funtions to be implemented by sub-classes.
//===---------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index 4ee0f2b..901eb51 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -27,10 +27,19 @@ def AMDGPULdExpOp : SDTypeProfile<1, 2,
[SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
>;
+def AMDGPUFPClassOp : SDTypeProfile<1, 2,
+ [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
+>;
+
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
+// float, float, float, vcc
+def AMDGPUFmasOp : SDTypeProfile<1, 4,
+ [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>]
+>;
+
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -58,16 +67,17 @@ def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
+def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
+
// out = max(a, b) a and b are floats, where a nan comparison fails.
// This is not commutative because this gives the second operand:
// x < nan ? x : nan -> nan
// nan < x ? nan : x -> x
def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp,
- [SDNPAssociative]
+ []
>;
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
-def AMDGPUmad : SDNode<"AMDGPUISD::MAD", SDTFPTernaryOp, []>;
// out = max(a, b) a and b are signed ints
def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
@@ -81,7 +91,7 @@ def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
// out = min(a, b) a and b are floats, where a nan comparison fails.
def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
- [SDNPAssociative]
+ []
>;
// out = min(a, b) a and b are signed ints
@@ -147,7 +157,7 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
// Special case divide FMA with scale and flags (src0 = Quotient,
// src1 = Denominator, src2 = Numerator).
-def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
// Single or double precision division fixup.
// Special case divide fixup and flags(src0 = Quotient, src1 =
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index c215865..849b241 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -23,8 +23,6 @@ class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instructio
let Pattern = pattern;
let Itinerary = NullALU;
- let isCodeGenOnly = 1;
-
let TSFlags{63} = isRegisterLoad;
let TSFlags{62} = isRegisterStore;
}
@@ -73,6 +71,11 @@ def COND_OEQ : PatLeaf <
[{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}]
>;
+def COND_ONE : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}]
+>;
+
def COND_OGT : PatLeaf <
(cond),
[{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}]
@@ -93,23 +96,28 @@ def COND_OLE : PatLeaf <
[{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
>;
-def COND_UNE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
->;
def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
//===----------------------------------------------------------------------===//
-// PatLeafs for unsigned comparisons
+// PatLeafs for unsigned / unordered comparisons
//===----------------------------------------------------------------------===//
+def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>;
+def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>;
def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>;
def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>;
def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>;
def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>;
+// XXX - For some reason R600 version is preferring to use unordered
+// for setne?
+def COND_UNE_NE : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
+>;
+
//===----------------------------------------------------------------------===//
// PatLeafs for signed comparisons
//===----------------------------------------------------------------------===//
@@ -154,10 +162,6 @@ class PrivateStore <SDPatternOperator op> : PrivateMemOp <
(ops node:$value, node:$ptr), (op node:$value, node:$ptr)
>;
-def extloadi8_private : PrivateLoad <extloadi8>;
-def sextloadi8_private : PrivateLoad <sextloadi8>;
-def extloadi16_private : PrivateLoad <extloadi16>;
-def sextloadi16_private : PrivateLoad <sextloadi16>;
def load_private : PrivateLoad <load>;
def truncstorei8_private : PrivateStore <truncstorei8>;
@@ -221,6 +225,9 @@ def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
return isLocalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def extloadi8_private : PrivateLoad <az_extloadi8>;
+def sextloadi8_private : PrivateLoad <sextloadi8>;
+
def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
@@ -257,6 +264,9 @@ def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
return isLocalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def extloadi16_private : PrivateLoad <az_extloadi16>;
+def sextloadi16_private : PrivateLoad <sextloadi16>;
+
def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
@@ -403,11 +413,6 @@ def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
// Misc Pattern Fragments
//===----------------------------------------------------------------------===//
-def fmad : PatFrag <
- (ops node:$src0, node:$src1, node:$src2),
- (fadd (fmul node:$src0, node:$src1), node:$src2)
->;
-
class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
@@ -428,6 +433,11 @@ def FP_ONE : PatLeaf <
[{return N->isExactlyValue(1.0);}]
>;
+def FP_HALF : PatLeaf <
+ (fpimm),
+ [{return N->isExactlyValue(0.5);}]
+>;
+
let isCodeGenOnly = 1, isPseudo = 1 in {
let usesCustomInserter = 1 in {
@@ -575,7 +585,7 @@ applied.
def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>;
def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}],
- SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>;
+ SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(countTrailingOnes(N->getZExtValue()), MVT::i32);}]>>;
class BFEPattern <Instruction BFE> : Pat <
(and (srl i32:$x, legalshift32:$y), bfemask:$z),
@@ -593,6 +603,20 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat <
// 24-bit arithmetic patterns
def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>;
+// Special conversion patterns
+
+def cvt_rpi_i32_f32 : PatFrag <
+ (ops node:$src),
+ (fp_to_sint (ffloor (fadd $src, FP_HALF))),
+ [{ (void) N; return TM.Options.NoNaNsFPMath; }]
+>;
+
+def cvt_flr_i32_f32 : PatFrag <
+ (ops node:$src),
+ (fp_to_sint (ffloor $src)),
+ [{ (void)N; return TM.Options.NoNaNsFPMath; }]
+>;
+
/*
class UMUL24Pattern <Instruction UMUL24> : Pat <
(mul U24:$x, U24:$y),
@@ -639,17 +663,10 @@ class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
(RcpInst $src)
>;
-multiclass RsqPat<Instruction RsqInst, ValueType vt> {
- def : Pat <
- (fdiv FP_ONE, (fsqrt vt:$src)),
- (RsqInst $src)
- >;
-
- def : Pat <
- (AMDGPUrcp (fsqrt vt:$src)),
- (RsqInst $src)
- >;
-}
+class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
+ (AMDGPUrcp (fsqrt vt:$src)),
+ (RsqInst $src)
+>;
include "R600Instructions.td"
include "R700Instructions.td"
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index bca027f..f047ed0 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -39,37 +40,23 @@ AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st):
Ctx(ctx), ST(st)
{ }
-enum AMDGPUMCInstLower::SISubtarget
-AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned) const {
- return AMDGPUMCInstLower::SI;
-}
-
-unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const {
+void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
- int MCOpcode = AMDGPU::getMCOpcode(MIOpcode,
- AMDGPUSubtargetToSISubtarget(ST.getGeneration()));
- if (MCOpcode == -1)
- MCOpcode = MIOpcode;
+ int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
- return MCOpcode;
-}
-
-void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+ if (MCOpcode == -1) {
+ LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+ C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
+ "a target-specific version: " + Twine(MI->getOpcode()));
+ }
- OutMI.setOpcode(getMCOpcode(MI->getOpcode()));
+ OutMI.setOpcode(MCOpcode);
for (const MachineOperand &MO : MI->explicit_operands()) {
MCOperand MCOp;
switch (MO.getType()) {
default:
llvm_unreachable("unknown operand type");
- case MachineOperand::MO_FPImmediate: {
- const APFloat &FloatValue = MO.getFPImm()->getValueAPF();
- assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle &&
- "Only floating point immediates are supported at the moment.");
- MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat());
- break;
- }
case MachineOperand::MO_Immediate:
MCOp = MCOperand::CreateImm(MO.getImm());
break;
@@ -93,18 +80,24 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOp = MCOperand::CreateExpr(Expr);
break;
}
+ case MachineOperand::MO_ExternalSymbol: {
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(MO.getSymbolName()));
+ const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ MCOp = MCOperand::CreateExpr(Expr);
+ break;
+ }
}
OutMI.addOperand(MCOp);
}
}
void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- AMDGPUMCInstLower MCInstLowering(OutContext,
- MF->getTarget().getSubtarget<AMDGPUSubtarget>());
+ const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+ AMDGPUMCInstLower MCInstLowering(OutContext, STI);
#ifdef _DEBUG
StringRef Err;
- if (!TM.getSubtargetImpl()->getInstrInfo()->verifyInstruction(MI, Err)) {
+ if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) {
errs() << "Warning: Illegal instruction detected: " << Err << "\n";
MI->dump();
}
@@ -122,15 +115,15 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(OutStreamer, TmpInst);
- if (DisasmEnabled) {
+ if (STI.dumpCode()) {
// Disassemble instruction/operands to text.
DisasmLines.resize(DisasmLines.size() + 1);
std::string &DisasmLine = DisasmLines.back();
raw_string_ostream DisasmStream(DisasmLine);
AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
- *TM.getSubtargetImpl()->getInstrInfo(),
- *TM.getSubtargetImpl()->getRegisterInfo());
+ *MF->getSubtarget().getInstrInfo(),
+ *MF->getSubtarget().getRegisterInfo());
InstPrinter.printInst(&TmpInst, DisasmStream, StringRef());
// Disassemble instruction/operands to hex representation.
@@ -141,7 +134,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCObjectStreamer &ObjStreamer = (MCObjectStreamer &)OutStreamer;
MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
InstEmitter.EncodeInstruction(TmpInst, CodeStream, Fixups,
- TM.getSubtarget<MCSubtargetInfo>());
+ MF->getSubtarget<MCSubtargetInfo>());
CodeStream.flush();
HexLines.resize(HexLines.size() + 1);
diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h
index 00d1f1b..d322fe0 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.h
+++ b/lib/Target/R600/AMDGPUMCInstLower.h
@@ -19,22 +19,9 @@ class MCContext;
class MCInst;
class AMDGPUMCInstLower {
-
- // This must be kept in sync with the SISubtarget class in SIInstrInfo.td
- enum SISubtarget {
- SI = 0
- };
-
MCContext &Ctx;
const AMDGPUSubtarget &ST;
- /// Convert a member of the AMDGPUSubtarget::Generation enum to the
- /// SISubtarget enum.
- enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) const;
-
- /// Get the MC opcode for this MachineInstr.
- unsigned getMCOpcode(unsigned MIOpcode) const;
-
public:
AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST);
diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp
index 0f3f9e2..21c7da6 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.cpp
+++ b/lib/Target/R600/AMDGPUMachineFunction.cpp
@@ -15,9 +15,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
LDSSize(0),
ScratchSize(0),
IsKernel(true) {
- AttributeSet Set = MF.getFunction()->getAttributes();
- Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
- ShaderTypeAttribute);
+ Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute);
if (A.isStringAttribute()) {
StringRef Str = A.getValueAsString();
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 3433280..57b054b 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -42,8 +42,7 @@ void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- assert(!"Subroutines not supported yet");
- return 0;
+ return AMDGPU::NoRegister;
}
unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index 9d09a19..70c8525 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -16,11 +16,11 @@
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
-#include "SIInstrInfo.h"
#include "SIISelLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
-
-#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
@@ -31,22 +31,9 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
-static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
- std::string Ret = "e-p:32:32";
-
- if (ST.is64bit()) {
- // 32-bit private, local, and region pointers. 64-bit global and constant.
- Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
- }
-
- Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
- "-v512:512-v1024:1024-v2048:2048-n32:64";
-
- return Ret;
-}
-
AMDGPUSubtarget &
-AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) {
+AMDGPUSubtarget::initializeSubtargetDependencies(StringRef TT, StringRef GPU,
+ StringRef FS) {
// Determine default and user-specified characteristics
// On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
// enabled, but some instructions do not respect them and they run at the
@@ -59,6 +46,9 @@ AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) {
SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
FullFS += FS;
+ if (GPU == "" && Triple(TT).getArch() == Triple::amdgcn)
+ GPU = "SI";
+
ParseSubtargetFeatures(GPU, FullFS);
// FIXME: I don't think think Evergreen has any useful support for
@@ -76,21 +66,24 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS,
: AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false),
DumpCode(false), R600ALUInst(false), HasVertexCache(false),
TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
- FP64Denormals(false), FP32Denormals(false), CaymanISA(false),
- FlatAddressSpace(false), EnableIRStructurizer(true),
- EnablePromoteAlloca(false), EnableIfCvt(true),
- EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
- DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))),
+ FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
+ CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
+ EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
+ WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
+ EnableVGPRSpilling(false),
FrameLowering(TargetFrameLowering::StackGrowsUp,
64 * 16, // Maximum stack alignment (long16)
0),
- InstrItins(getInstrItineraryForCPU(GPU)) {
+ InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
+
+ initializeSubtargetDependencies(TT, GPU, FS);
+
if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
InstrInfo.reset(new R600InstrInfo(*this));
- TLInfo.reset(new R600TargetLowering(TM));
+ TLInfo.reset(new R600TargetLowering(TM, *this));
} else {
InstrInfo.reset(new SIInstrInfo(*this));
- TLInfo.reset(new SITargetLowering(TM));
+ TLInfo.reset(new SITargetLowering(TM, *this));
}
}
@@ -107,3 +100,33 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const {
llvm_unreachable("Illegal wavefront size.");
}
}
+
+unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
+ switch(getGeneration()) {
+ default: llvm_unreachable("ChipID unknown");
+ case SEA_ISLANDS: return 12;
+ }
+}
+
+bool AMDGPUSubtarget::isVGPRSpillingEnabled(
+ const SIMachineFunctionInfo *MFI) const {
+ return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling;
+}
+
+void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin,
+ MachineInstr *end,
+ unsigned NumRegionInstrs) const {
+ if (getGeneration() >= SOUTHERN_ISLANDS) {
+
+ // Track register pressure so the scheduler can try to decrease
+ // pressure once register usage is above the threshold defined by
+ // SIRegisterInfo::getRegPressureSetLimit()
+ Policy.ShouldTrackPressure = true;
+
+ // Enabling both top down and bottom up scheduling seems to give us fewer
+ // register spills than just using one of these approaches on its own.
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = false;
+ }
+}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index f71d80a..1b0122c 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -20,7 +20,6 @@
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600ISelLowering.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -30,6 +29,8 @@
namespace llvm {
+class SIMachineFunctionInfo;
+
class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
public:
@@ -39,7 +40,8 @@ public:
EVERGREEN,
NORTHERN_ISLANDS,
SOUTHERN_ISLANDS,
- SEA_ISLANDS
+ SEA_ISLANDS,
+ VOLCANIC_ISLANDS,
};
private:
@@ -53,6 +55,7 @@ private:
bool FP64;
bool FP64Denormals;
bool FP32Denormals;
+ bool FastFMAF32;
bool CaymanISA;
bool FlatAddressSpace;
bool EnableIRStructurizer;
@@ -62,16 +65,18 @@ private:
unsigned WavefrontSize;
bool CFALUBug;
int LocalMemorySize;
+ bool EnableVGPRSpilling;
- const DataLayout DL;
AMDGPUFrameLowering FrameLowering;
std::unique_ptr<AMDGPUTargetLowering> TLInfo;
std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
InstrItineraryData InstrItins;
+ Triple TargetTriple;
public:
AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS, TargetMachine &TM);
- AMDGPUSubtarget &initializeSubtargetDependencies(StringRef GPU, StringRef FS);
+ AMDGPUSubtarget &initializeSubtargetDependencies(StringRef TT, StringRef GPU,
+ StringRef FS);
const AMDGPUFrameLowering *getFrameLowering() const override {
return &FrameLowering;
@@ -85,7 +90,6 @@ public:
AMDGPUTargetLowering *getTargetLowering() const override {
return TLInfo.get();
}
- const DataLayout *getDataLayout() const override { return &DL; }
const InstrItineraryData *getInstrItineraryData() const override {
return &InstrItins;
}
@@ -124,6 +128,10 @@ public:
return FP64Denormals;
}
+ bool hasFastFMAF32() const {
+ return FastFMAF32;
+ }
+
bool hasFlatAddressSpace() const {
return FlatAddressSpace;
}
@@ -198,10 +206,16 @@ public:
return LocalMemorySize;
}
+ unsigned getAmdKernelCodeChipID() const;
+
bool enableMachineScheduler() const override {
- return getGeneration() <= NORTHERN_ISLANDS;
+ return true;
}
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin, MachineInstr *end,
+ unsigned NumRegionInstrs) const override;
+
// Helper functions to simplify if statements
bool isTargetELF() const {
return false;
@@ -217,6 +231,22 @@ public:
bool r600ALUEncoding() const {
return R600ALUInst;
}
+ bool isAmdHsaOS() const {
+ return TargetTriple.getOS() == Triple::AMDHSA;
+ }
+ bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;
+
+ unsigned getMaxWavesPerCU() const {
+ if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ return 10;
+
+ // FIXME: Not sure what this is for other subtargets.
+ llvm_unreachable("do not know max waves per CU for this subtarget.");
+ }
+
+ bool enableSubRegLiveness() const override {
+ return false;
+ }
};
} // End namespace llvm
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index b2cd988..a862f3c 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
+#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
@@ -27,7 +28,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Transforms/IPO.h"
@@ -38,7 +39,8 @@ using namespace llvm;
extern "C" void LLVMInitializeR600Target() {
// Register the target
- RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget);
+ RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
+ RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);
}
static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
@@ -49,12 +51,28 @@ static MachineSchedRegistry
SchedCustomRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
+static std::string computeDataLayout(StringRef TT) {
+ Triple Triple(TT);
+ std::string Ret = "e-p:32:32";
+
+ if (Triple.getArch() == Triple::amdgcn) {
+ // 32-bit private, local, and region pointers. 64-bit global and constant.
+ Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
+ }
+
+ Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
+ "-v512:512-v1024:1024-v2048:2048-n32:64";
+
+ return Ret;
+}
+
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
TargetOptions Options, Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OptLevel)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
+ DL(computeDataLayout(TT)),
TLOF(new TargetLoweringObjectFileELF()),
Subtarget(TT, CPU, FS, *this), IntrinsicInfo() {
setRequiresStructuredCFG(true);
@@ -65,10 +83,33 @@ AMDGPUTargetMachine::~AMDGPUTargetMachine() {
delete TLOF;
}
+//===----------------------------------------------------------------------===//
+// R600 Target Machine (R600 -> Cayman)
+//===----------------------------------------------------------------------===//
+
+R600TargetMachine::R600TargetMachine(const Target &T, StringRef TT, StringRef FS,
+ StringRef CPU, TargetOptions Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL) :
+ AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { }
+
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+)
+//===----------------------------------------------------------------------===//
+
+GCNTargetMachine::GCNTargetMachine(const Target &T, StringRef TT, StringRef FS,
+ StringRef CPU, TargetOptions Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL) :
+ AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { }
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Pass Setup
+//===----------------------------------------------------------------------===//
+
namespace {
class AMDGPUPassConfig : public TargetPassConfig {
public:
- AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
+ AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
@@ -85,29 +126,38 @@ public:
void addIRPasses() override;
void addCodeGenPrepare() override;
+ virtual bool addPreISel() override;
+ virtual bool addInstSelector() override;
+};
+
+class R600PassConfig : public AMDGPUPassConfig {
+public:
+ R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
+ : AMDGPUPassConfig(TM, PM) { }
+
bool addPreISel() override;
- bool addInstSelector() override;
- bool addPreRegAlloc() override;
- bool addPostRegAlloc() override;
- bool addPreSched2() override;
- bool addPreEmitPass() override;
+ void addPreRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
};
-} // End of anonymous namespace
-TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new AMDGPUPassConfig(this, PM);
-}
+class GCNPassConfig : public AMDGPUPassConfig {
+public:
+ GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
+ : AMDGPUPassConfig(TM, PM) { }
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ void addPreRegAlloc() override;
+ void addPostRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+};
-//===----------------------------------------------------------------------===//
-// AMDGPU Analysis Pass Setup
-//===----------------------------------------------------------------------===//
+} // End of anonymous namespace
-void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
- // Add first the target-independent BasicTTI pass, then our AMDGPU pass. This
- // allows the AMDGPU pass to delegate to the target independent layer when
- // appropriate.
- PM.add(createBasicTargetTransformInfoPass(this));
- PM.add(createAMDGPUTargetTransformInfoPass(this));
+TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
+ return TargetIRAnalysis(
+ [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); });
}
void AMDGPUPassConfig::addIRPasses() {
@@ -129,7 +179,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
addPass(createAMDGPUPromoteAlloca(ST));
addPass(createSROAPass());
}
-
TargetPassConfig::addCodeGenPrepare();
}
@@ -139,84 +188,96 @@ AMDGPUPassConfig::addPreISel() {
addPass(createFlattenCFGPass());
if (ST.IsIRStructurizerEnabled())
addPass(createStructurizeCFGPass());
- if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- addPass(createSinkingPass());
- addPass(createSITypeRewriter());
- addPass(createSIAnnotateControlFlowPass());
- } else {
- addPass(createR600TextureIntrinsicsReplacer());
- }
return false;
}
bool AMDGPUPassConfig::addInstSelector() {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-
addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
+ return false;
+}
- if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- addPass(createSILowerI1CopiesPass());
- addPass(createSIFixSGPRCopiesPass(*TM));
- }
+//===----------------------------------------------------------------------===//
+// R600 Pass Setup
+//===----------------------------------------------------------------------===//
+bool R600PassConfig::addPreISel() {
+ AMDGPUPassConfig::addPreISel();
+ addPass(createR600TextureIntrinsicsReplacer());
return false;
}
-bool AMDGPUPassConfig::addPreRegAlloc() {
+void R600PassConfig::addPreRegAlloc() {
+ addPass(createR600VectorRegMerger(*TM));
+}
+
+void R600PassConfig::addPreSched2() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+ addPass(createR600EmitClauseMarkers(), false);
+ if (ST.isIfCvtEnabled())
+ addPass(&IfConverterID, false);
+ addPass(createR600ClauseMergePass(*TM), false);
+}
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- addPass(createR600VectorRegMerger(*TM));
- } else {
- if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
- // Don't do this with no optimizations since it throws away debug info by
- // merging nonadjacent loads.
-
- // This should be run after scheduling, but before register allocation. It
- // also need extra copies to the address operand to be eliminated.
- initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
- insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
- }
-
- addPass(createSIShrinkInstructionsPass());
- addPass(createSIFixSGPRLiveRangesPass());
- }
- return false;
+void R600PassConfig::addPreEmitPass() {
+ addPass(createAMDGPUCFGStructurizerPass(), false);
+ addPass(createR600ExpandSpecialInstrsPass(*TM), false);
+ addPass(&FinalizeMachineBundlesID, false);
+ addPass(createR600Packetizer(*TM), false);
+ addPass(createR600ControlFlowFinalizer(*TM), false);
}
-bool AMDGPUPassConfig::addPostRegAlloc() {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new R600PassConfig(this, PM);
+}
- addPass(createSIShrinkInstructionsPass());
- if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
- addPass(createSIInsertWaits(*TM));
- }
+//===----------------------------------------------------------------------===//
+// GCN Pass Setup
+//===----------------------------------------------------------------------===//
+
+bool GCNPassConfig::addPreISel() {
+ AMDGPUPassConfig::addPreISel();
+ addPass(createSinkingPass());
+ addPass(createSITypeRewriter());
+ addPass(createSIAnnotateControlFlowPass());
return false;
}
-bool AMDGPUPassConfig::addPreSched2() {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- addPass(createR600EmitClauseMarkers());
- if (ST.isIfCvtEnabled())
- addPass(&IfConverterID);
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- addPass(createR600ClauseMergePass(*TM));
+bool GCNPassConfig::addInstSelector() {
+ AMDGPUPassConfig::addInstSelector();
+ addPass(createSILowerI1CopiesPass());
+ addPass(createSIFixSGPRCopiesPass(*TM));
+ addPass(createSIFoldOperandsPass());
return false;
}
-bool AMDGPUPassConfig::addPreEmitPass() {
+void GCNPassConfig::addPreRegAlloc() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- addPass(createAMDGPUCFGStructurizerPass());
- addPass(createR600ExpandSpecialInstrsPass(*TM));
- addPass(&FinalizeMachineBundlesID);
- addPass(createR600Packetizer(*TM));
- addPass(createR600ControlFlowFinalizer(*TM));
- } else {
- addPass(createSILowerControlFlowPass(*TM));
+ if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
+ // Don't do this with no optimizations since it throws away debug info by
+ // merging nonadjacent loads.
+
+ // This should be run after scheduling, but before register allocation. It
+ // also needs extra copies to the address operand to be eliminated.
+ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
+ insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
}
+ addPass(createSIShrinkInstructionsPass(), false);
+ addPass(createSIFixSGPRLiveRangesPass(), false);
+}
- return false;
+void GCNPassConfig::addPostRegAlloc() {
+ addPass(createSIPrepareScratchRegs(), false);
+ addPass(createSIShrinkInstructionsPass(), false);
+}
+
+void GCNPassConfig::addPreSched2() {
+ addPass(createSIInsertWaits(*TM), false);
+}
+
+void GCNPassConfig::addPreEmitPass() {
+ addPass(createSILowerControlFlowPass(*TM), false);
+}
+
+TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new GCNPassConfig(this, PM);
}
diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h
index 1b3dbce..a691536 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.h
+++ b/lib/Target/R600/AMDGPUTargetMachine.h
@@ -24,7 +24,15 @@
namespace llvm {
+//===----------------------------------------------------------------------===//
+// AMDGPU Target Machine (R600+)
+//===----------------------------------------------------------------------===//
+
class AMDGPUTargetMachine : public LLVMTargetMachine {
+private:
+ const DataLayout DL;
+
+protected:
TargetLoweringObjectFile *TLOF;
AMDGPUSubtarget Subtarget;
AMDGPUIntrinsicInfo IntrinsicInfo;
@@ -34,21 +42,52 @@ public:
StringRef CPU, TargetOptions Options, Reloc::Model RM,
CodeModel::Model CM, CodeGenOpt::Level OL);
~AMDGPUTargetMachine();
+ // FIXME: This is currently broken, the DataLayout needs to move to
+ // the target machine.
+ const DataLayout *getDataLayout() const override {
+ return &DL;
+ }
const AMDGPUSubtarget *getSubtargetImpl() const override {
return &Subtarget;
}
const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
return &IntrinsicInfo;
}
- TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+ TargetIRAnalysis getTargetIRAnalysis() override;
- /// \brief Register R600 analysis passes with a pass manager.
- void addAnalysisPasses(PassManagerBase &PM) override;
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF;
}
};
+//===----------------------------------------------------------------------===//
+// R600 Target Machine (R600 -> Cayman)
+//===----------------------------------------------------------------------===//
+
+class R600TargetMachine : public AMDGPUTargetMachine {
+
+public:
+ R600TargetMachine(const Target &T, StringRef TT, StringRef FS,
+ StringRef CPU, TargetOptions Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL);
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+};
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+)
+//===----------------------------------------------------------------------===//
+
+class GCNTargetMachine : public AMDGPUTargetMachine {
+
+public:
+ GCNTargetMachine(const Target &T, StringRef TT, StringRef FS,
+ StringRef CPU, TargetOptions Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL);
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+};
+
} // End namespace llvm
#endif
diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
index e7bc006..68f4600 100644
--- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
@@ -15,11 +15,11 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
@@ -27,80 +27,10 @@ using namespace llvm;
#define DEBUG_TYPE "AMDGPUtti"
-// Declare the pass initialization routine locally as target-specific passes
-// don't have a target-wide initialization entry point, and so we rely on the
-// pass constructor initialization.
-namespace llvm {
-void initializeAMDGPUTTIPass(PassRegistry &);
-}
-
-namespace {
-
-class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo {
- const AMDGPUTargetMachine *TM;
- const AMDGPUSubtarget *ST;
- const AMDGPUTargetLowering *TLI;
-
- /// Estimate the overhead of scalarizing an instruction. Insert and Extract
- /// are set if the result needs to be inserted and/or extracted from vectors.
- unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
-
-public:
- AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
- llvm_unreachable("This pass cannot be directly constructed");
- }
-
- AMDGPUTTI(const AMDGPUTargetMachine *TM)
- : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
- TLI(TM->getSubtargetImpl()->getTargetLowering()) {
- initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
- }
-
- void initializePass() override { pushTTIStack(this); }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- TargetTransformInfo::getAnalysisUsage(AU);
- }
-
- /// Pass identification.
- static char ID;
-
- /// Provide necessary pointer adjustments for the two base classes.
- void *getAdjustedAnalysisPointer(const void *ID) override {
- if (ID == &TargetTransformInfo::ID)
- return (TargetTransformInfo *)this;
- return this;
- }
-
- bool hasBranchDivergence() const override;
-
- void getUnrollingPreferences(const Function *F, Loop *L,
- UnrollingPreferences &UP) const override;
-
- PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override;
-
- unsigned getNumberOfRegisters(bool Vector) const override;
- unsigned getRegisterBitWidth(bool Vector) const override;
- unsigned getMaxInterleaveFactor() const override;
-};
-
-} // end anonymous namespace
-
-INITIALIZE_AG_PASS(AMDGPUTTI, TargetTransformInfo, "AMDGPUtti",
- "AMDGPU Target Transform Info", true, true, false)
-char AMDGPUTTI::ID = 0;
-
-ImmutablePass *
-llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) {
- return new AMDGPUTTI(TM);
-}
-
-bool AMDGPUTTI::hasBranchDivergence() const { return true; }
-
-void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L,
- UnrollingPreferences &UP) const {
+void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
+ TTI::UnrollingPreferences &UP) {
UP.Threshold = 300; // Twice the default.
- UP.Count = UINT_MAX;
+ UP.MaxCount = UINT_MAX;
UP.Partial = true;
// TODO: Do we want runtime unrolling?
@@ -130,13 +60,7 @@ void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L,
}
}
-AMDGPUTTI::PopcntSupportKind
-AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const {
- assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
- return ST->hasBCNT(TyWidth) ? PSK_FastHardware : PSK_Software;
-}
-
-unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const {
+unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
if (Vec)
return 0;
@@ -147,11 +71,9 @@ unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
-unsigned AMDGPUTTI::getRegisterBitWidth(bool) const {
- return 32;
-}
+unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; }
-unsigned AMDGPUTTI::getMaxInterleaveFactor() const {
+unsigned AMDGPUTTIImpl::getMaxInterleaveFactor() {
// Semi-arbitrary large amount.
return 64;
}
diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.h b/lib/Target/R600/AMDGPUTargetTransformInfo.h
new file mode 100644
index 0000000..4abbdf2
--- /dev/null
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.h
@@ -0,0 +1,78 @@
+//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a TargetTransformInfo::Concept conforming object specific
+/// to the AMDGPU target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {
+ typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT;
+
+ const AMDGPUSubtarget *ST;
+ const AMDGPUTargetLowering *TLI;
+
+ const AMDGPUSubtarget *getST() const { return ST; }
+ const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+public:
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM)
+ : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+
+ // Provide value semantics. MSVC requires that we spell all of these out.
+ AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg)
+ : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+ AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg)
+ : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+ TLI(std::move(Arg.TLI)) {}
+ AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) {
+ BaseT::operator=(static_cast<const BaseT &>(RHS));
+ ST = RHS.ST;
+ TLI = RHS.TLI;
+ return *this;
+ }
+ AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) {
+ BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+ ST = std::move(RHS.ST);
+ TLI = std::move(RHS.TLI);
+ return *this;
+ }
+
+ bool hasBranchDivergence() { return true; }
+
+ void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software;
+ }
+
+ unsigned getNumberOfRegisters(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector);
+ unsigned getMaxInterleaveFactor();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/R600/AMDKernelCodeT.h b/lib/Target/R600/AMDKernelCodeT.h
new file mode 100644
index 0000000..4d3041f
--- /dev/null
+++ b/lib/Target/R600/AMDKernelCodeT.h
@@ -0,0 +1,704 @@
+//===-- AMDKernelCodeT.h - AMD kernel code object definitions ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file AMDKernelCodeT.h
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDKERNELCODET_H
+#define AMDKERNELCODET_H
+
+#include <cstddef>
+#include <cstdint>
+
+//---------------------------------------------------------------------------//
+// AMD Kernel Code, and its dependencies //
+//---------------------------------------------------------------------------//
+
+typedef uint8_t hsa_powertwo8_t;
+typedef uint32_t hsa_ext_code_kind_t;
+typedef uint8_t hsa_ext_brig_profile8_t;
+typedef uint8_t hsa_ext_brig_machine_model8_t;
+typedef uint64_t hsa_ext_control_directive_present64_t;
+typedef uint16_t hsa_ext_exception_kind16_t;
+typedef uint32_t hsa_ext_code_kind32_t;
+
+typedef struct hsa_dim3_s {
+ uint32_t x;
+ uint32_t y;
+ uint32_t z;
+} hsa_dim3_t;
+
+/// The version of the amd_*_code_t struct. Minor versions must be
+/// backward compatible.
+typedef uint32_t amd_code_version32_t;
+enum amd_code_version_t {
+ AMD_CODE_VERSION_MAJOR = 0,
+ AMD_CODE_VERSION_MINOR = 1
+};
+
+/// The values used to define the number of bytes to use for the
+/// swizzle element size.
+enum amd_element_byte_size_t {
+ AMD_ELEMENT_2_BYTES = 0,
+ AMD_ELEMENT_4_BYTES = 1,
+ AMD_ELEMENT_8_BYTES = 2,
+ AMD_ELEMENT_16_BYTES = 3
+};
+
+/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+/// COMPUTE_PGM_RSRC2 registers.
+typedef uint64_t amd_compute_pgm_resource_register64_t;
+
+/// Every amd_*_code_t has the following properties, which are composed of
+/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
+/// bit width (AMD_CODE_PROPERTY_*_WIDTH), and bit shift amount
+/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.
+///
+/// (Note that bit fields cannot be used as their layout is
+/// implementation defined in the C standard and so cannot be used to
+/// specify an ABI)
+typedef uint32_t amd_code_property32_t;
+enum amd_code_property_mask_t {
+
+ /// Enable the setup of the SGPR user data registers
+ /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
+ /// for initial register state.
+ ///
+ /// The total number of SGPR user data registers requested must not
+ /// exceed 16. Any requests beyond 16 will be ignored.
+ ///
+ /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
+ /// SGPR user data registers enabled up to 16).
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
+
+ /// Control wave ID base counter for GDS ordered-append. Used to set
+ /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
+ /// ORDERED_APPEND_MODE also needs to be settable)
+ AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10,
+ AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
+
+ /// The interleave (swizzle) element size in bytes required by the
+ /// code for private memory. This must be 2, 4, 8 or 16. This value
+ /// is provided to the finalizer when it is invoked and is recorded
+ /// here. The hardware will interleave the memory requests of each
+ /// lane of a wavefront by this element size to ensure each
+ /// work-item gets a distinct memory location. Therefore, the
+ /// finalizer ensures that all load and store operations done to
+ /// private memory do not exceed this size. For example, if the
+ /// element size is 4 (32-bits or dword) and a 64-bit value must be
+ /// loaded, the finalizer will generate two 32-bit loads. This
+ /// ensures that the interleaving will get the work-item
+ /// specific dword for both halves of the 64-bit value. If it just
+ /// did a 64-bit load then it would get one dword which belonged to
+ /// its own work-item, but the second dword would belong to the
+ /// adjacent lane work-item since the interleaving is in dwords.
+ ///
+ /// The value used must match the value that the runtime configures
+ /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
+ /// is generally DWORD.
+ ///
+ /// Use values from the amd_element_byte_size_t enum.
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
+
+ /// Are global memory addresses 64 bits. Must match
+ /// amd_kernel_code_t.hsail_machine_model ==
+ /// HSA_MACHINE_LARGE. Must also match
+ /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
+ /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
+ AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13,
+ AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
+ AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
+
+ /// Indicate if the generated ISA is using a dynamically sized call
+ /// stack. This can happen if calls are implemented using a call
+ /// stack and recursion, alloca or calls to indirect functions are
+ /// present. In these cases the Finalizer cannot compute the total
+ /// private segment size at compile time. In this case the
+ /// workitem_private_segment_byte_size only specifies the statically
+ /// known private segment size, and additional space must be added
+ /// for the call stack.
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14,
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
+
+ /// Indicate if code generated has support for debugging.
+ AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15,
+ AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
+ AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT
+};
+
+/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
+/// control directives. These control how the finalizer generates code. This
+/// struct is used both as an argument to hsaFinalizeKernel to specify values for
+/// the control directives, and is used in HsaKernelCode to record the values of
+/// the control directives that the finalizer used when generating the code which
+/// either came from the finalizer argument or explicit HSAIL control
+/// directives. See the definition of the control directives in HSA Programmer's
+/// Reference Manual which also defines how the values specified as finalizer
+/// arguments have to agree with the control directives in the HSAIL code.
+typedef struct hsa_ext_control_directives_s {
+ /// This is a bit set indicating which control directives have been
+ /// specified. If the value is 0 then there are no control directives specified
+ /// and the rest of the fields can be ignored. The bits are accessed using the
+ /// hsa_ext_control_directives_present_mask_t. Any control directive that is not
+ /// enabled in this bit set must have the value of all 0s.
+ hsa_ext_control_directive_present64_t enabled_control_directives;
+
+ /// If enableBreakExceptions is not enabled then must be 0, otherwise must be
+ /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK
+ /// policy enabled. If this set is not empty then the generated code may have
+ /// lower performance than if the set is empty. If the kernel being finalized
+ /// has any enablebreakexceptions control directives, then the values specified
+ /// by this argument are unioned with the values in these control
+ /// directives. If any of the functions the kernel calls have an
+ /// enablebreakexceptions control directive, then they must be equal or a
+ /// subset of, this union.
+ hsa_ext_exception_kind16_t enable_break_exceptions;
+
+ /// If enableDetectExceptions is not enabled then must be 0, otherwise must be
+ /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT
+ /// policy enabled. If this set is not empty then the generated code may have
+ /// lower performance than if the set is empty. However, an implementation
+ /// should endeavour to make the performance impact small. If the kernel being
+ /// finalized has any enabledetectexceptions control directives, then the
+ /// values specified by this argument are unioned with the values in these
+ /// control directives. If any of the functions the kernel calls have an
+ /// enabledetectexceptions control directive, then they must be equal or a
+ /// subset of, this union.
+ hsa_ext_exception_kind16_t enable_detect_exceptions;
+
+ /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of
+ /// dynamic group segment can be allocated for a dispatch, otherwise the value
+ /// specifies the maximum number of bytes of dynamic group segment that can be
+ /// allocated for a dispatch. If the kernel being finalized has any
+ /// maxdynamicsize control directives, then the values must be the same, and
+ /// must be the same as this argument if it is enabled. This value can be used
+ /// by the finalizer to determine the maximum number of bytes of group memory
+ /// used by each work-group by adding this value to the group memory required
+ /// for all group segment variables used by the kernel and all functions it
+ /// calls, and group memory used to implement other HSAIL features such as
+ /// fbarriers and the detect exception operations. This can allow the finalizer
+ /// to determine the expected number of work-groups that can be executed by a
+ /// compute unit and allow more resources to be allocated to the work-items if
+ /// it is known that fewer work-groups can be executed due to group memory
+ /// limitations.
+ uint32_t max_dynamic_group_size;
+
+ /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater
+ /// than 0. See HSA Programmer's Reference Manual description of
+ /// maxflatgridsize control directive.
+ uint32_t max_flat_grid_size;
+
+ /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be
+ /// greater than 0. See HSA Programmer's Reference Manual description of
+ /// maxflatworkgroupsize control directive.
+ uint32_t max_flat_workgroup_size;
+
+ /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the
+ /// finalizer is free to generate ISA that may result in any number of
+ /// work-groups executing on a single compute unit. Otherwise, the finalizer
+ /// should attempt to generate ISA that will allow the specified number of
+ /// work-groups to execute on a single compute unit. This is only a hint and
+ /// can be ignored by the finalizer. If the kernel being finalized, or any of
+ /// the functions it calls, has a requested control directive, then the values
+ /// must be the same. This can be used to determine the number of resources
+ /// that should be allocated to a single work-group and work-item. For example,
+ /// a low value may allow more resources to be allocated, resulting in higher
+ /// per work-item performance, as it is known there will never be more than the
+ /// specified number of work-groups actually executing on the compute
+ /// unit. Conversely, a high value may allocate fewer resources, resulting in
+ /// lower per work-item performance, which is offset by the fact it allows more
+ /// work-groups to actually execute on the compute unit.
+ uint32_t requested_workgroups_per_cu;
+
+ /// If not enabled then all elements for Dim3 must be 0, otherwise every
+ /// element must be greater than 0. See HSA Programmer's Reference Manual
+ /// description of requiredgridsize control directive.
+ hsa_dim3_t required_grid_size;
+
+ /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be
+ /// 0, and the produced code can be dispatched with any legal work-group range
+ /// consistent with the dispatch dimensions. Otherwise, the code produced must
+ /// always be dispatched with the specified work-group range. No element of the
+ /// specified range must be 0. It must be consistent with required_dimensions
+ /// and max_flat_workgroup_size. If the kernel being finalized, or any of the
+ /// functions it calls, has a requiredworkgroupsize control directive, then the
+ /// values must be the same. Specifying a value can allow the finalizer to
+ /// optimize work-group id operations, and if the number of work-items in the
+ /// work-group is less than the WAVESIZE then barrier operations can be
+ /// optimized to just a memory fence.
+ hsa_dim3_t required_workgroup_size;
+
+ /// If requiredDim is not enabled then must be 0 and the produced kernel code
+ /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is
+ /// 1..3 and the code produced must only be dispatched with a dimension that
+ /// matches. Other values are illegal. If the kernel being finalized, or any of
+ /// the functions it calls, has a requireddimsize control directive, then the
+ /// values must be the same. This can be used to optimize the code generated to
+ /// compute the absolute and flat work-group and work-item id, and the dim
+ /// HSAIL operations.
+ uint8_t required_dim;
+
+ /// Reserved. Must be 0.
+ uint8_t reserved[75];
+} hsa_ext_control_directives_t;
+
+/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel
+/// Code Object to set up the hardware to execute the kernel dispatch.
+///
+/// Initial Kernel Register State.
+///
+/// Initial kernel register state will be set up by CP/SPI prior to the start
+/// of execution of every wavefront. This is limited by the constraints of the
+/// current hardware.
+///
+/// The order of the SGPR registers is defined, but the Finalizer can specify
+/// which ones are actually setup in the amd_kernel_code_t object using the
+/// enable_sgpr_* bit fields. The register numbers used for enabled registers
+/// are dense starting at SGPR0: the first enabled register is SGPR0, the next
+/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR
+/// number.
+///
+/// The initial SGPRs comprise up to 16 User SGPRs that are set up by CP and
+/// apply to all waves of the grid. It is possible to specify more than 16 User
+/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16
+/// are actually initialized. These are then immediately followed by the System
+/// SGPRs that are set up by ADC/SPI and can have different values for each wave
+/// of the grid dispatch.
+///
+/// SGPR register initial state is defined as follows:
+///
+/// Private Segment Buffer (enable_sgpr_private_segment_buffer):
+/// Number of User SGPR registers: 4. V# that can be used, together with
+/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg
+/// segments using a segment address. It must be set as follows:
+/// - Base address: of the scratch memory area used by the dispatch. It
+/// does not include the scratch wave offset. It will be the per process
+/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for
+/// example there may be a per pipe offset, or per AQL Queue offset).
+/// - Stride + data_format: Element Size * Index Stride (???)
+/// - Cache swizzle: ???
+/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for
+/// scratch)
+/// - Num records: Flat Scratch Work Item Size / Element Size (???)
+/// - Dst_sel_*: ???
+/// - Num_format: ???
+/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must
+/// agree with amd_kernel_code_t.privateElementSize)
+/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must
+/// be number of wavefront lanes for scratch, must agree with
+/// amd_kernel_code_t.wavefrontSize)
+/// - Add tid enable: 1
+/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC,
+/// - Hash_enable: ???
+/// - Heap: ???
+/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE
+/// - Type: 0 (a buffer) (???)
+///
+/// Dispatch Ptr (enable_sgpr_dispatch_ptr):
+/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet
+/// for kernel actually executing.
+///
+/// Queue Ptr (enable_sgpr_queue_ptr):
+/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for
+/// AQL queue on which the dispatch packet was queued.
+///
+/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr):
+/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This
+/// is directly copied from the kernargPtr in the dispatch packet. Having CP
+/// load it once avoids loading it at the beginning of every wavefront.
+///
+/// Dispatch Id (enable_sgpr_dispatch_id):
+/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch
+/// packet being executed.
+///
+/// Flat Scratch Init (enable_sgpr_flat_scratch_init):
+/// Number of User SGPR registers: 2. This is 2 SGPRs.
+///
+/// For CI/VI:
+/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE
+/// to base of memory for scratch for this dispatch. This is the same offset
+/// used in computing the Scratch Segment Buffer base address. The value of
+/// Scratch Wave Offset must be added by the kernel code and moved to
+/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.
+///
+/// The second SGPR is 32 bit byte size of a single work-item's scratch
+/// memory usage. This is directly loaded from the dispatch packet Private
+/// Segment Byte Size and rounded up to a multiple of DWORD.
+///
+/// \todo [Does CP need to round this to >4 byte alignment?]
+///
+/// The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in
+/// flat memory instructions. Having CP load it once avoids loading it at
+/// the beginning of every wavefront.
+///
+/// For PI:
+/// This is the 64 bit base address of the scratch backing memory
+/// allocated by CP for this dispatch.
+///
+/// Private Segment Size (enable_sgpr_private_segment_size):
+/// Number of User SGPR registers: 1. The 32 bit byte size of a single
+/// work-item's scratch memory allocation. This is the value from the dispatch
+/// packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD.
+///
+/// \todo [Does CP need to round this to >4 byte alignment?]
+///
+/// Having CP load it once avoids loading it at the beginning of every
+/// wavefront.
+///
+/// \todo [This will not be used for CI/VI since it is the same value as
+/// the second SGPR of Flat Scratch Init. However, it is needed for PI which
+/// changes meaning of Flat Scratch Init.]
+///
+/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x):
+/// Number of User SGPR registers: 1. 32 bit count of the number of
+/// work-groups in the X dimension for the grid being executed. Computed from
+/// the fields in the HsaDispatchPacket as
+/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).
+///
+/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y):
+/// Number of User SGPR registers: 1. 32 bit count of the number of
+/// work-groups in the Y dimension for the grid being executed. Computed from
+/// the fields in the HsaDispatchPacket as
+/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).
+///
+/// Only initialized if <16 previous SGPRs initialized.
+///
+/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z):
+/// Number of User SGPR registers: 1. 32 bit count of the number of
+/// work-groups in the Z dimension for the grid being executed. Computed
+/// from the fields in the HsaDispatchPacket as
+/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).
+///
+/// Only initialized if <16 previous SGPRs initialized.
+///
+/// Work-Group Id X (enable_sgpr_workgroup_id_x):
+/// Number of System SGPR registers: 1. 32 bit work group id in X dimension
+/// of grid for wavefront. Always present.
+///
+/// Work-Group Id Y (enable_sgpr_workgroup_id_y):
+/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension
+/// of grid for wavefront.
+///
+/// Work-Group Id Z (enable_sgpr_workgroup_id_z):
+/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension
+/// of grid for wavefront. If present then Work-group Id Y will also be
+/// present
+///
+/// Work-Group Info (enable_sgpr_workgroup_info):
+/// Number of System SGPR registers: 1. {first_wave, 14'b0000,
+/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}
+///
+/// Private Segment Wave Byte Offset
+/// (enable_sgpr_private_segment_wave_byte_offset):
+/// Number of System SGPR registers: 1. 32 bit byte offset from base of
+/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg
+/// segment address when using Scratch Segment Buffer. It must be added to
+/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing.
+///
+///
+/// The order of the VGPR registers is defined, but the Finalizer can specify
+/// which ones are actually setup in the amd_kernel_code_t object using the
+/// enableVgpr* bit fields. The register numbers used for enabled registers
+/// are dense starting at VGPR0: the first enabled register is VGPR0, the next
+/// enabled register is VGPR1 etc.; disabled registers do not have an VGPR
+/// number.
+///
+/// VGPR register initial state is defined as follows:
+///
+/// Work-Item Id X (always initialized):
+/// Number of registers: 1. 32 bit work item id in X dimension of work-group
+/// for wavefront lane.
+///
+/// Work-Item Id Y (enable_vgpr_workitem_id > 0):
+/// Number of registers: 1. 32 bit work item id in Y dimension of work-group
+/// for wavefront lane.
+///
+/// Work-Item Id Z (enable_vgpr_workitem_id > 1):
+/// Number of registers: 1. 32 bit work item id in Z dimension of work-group
+/// for wavefront lane.
+///
+///
+/// The setting of registers is being done by existing GPU hardware as follows:
+/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data
+/// registers.
+/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any
+/// combination including none.
+/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot
+/// be added into the value Flat Scratch Offset which would avoid the
+/// Finalizer generated prolog having to do the add.
+/// 4) The VGPRs are set by SPI which only supports specifying either (X),
+/// (X, Y) or (X, Y, Z).
+///
+/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGPRs so
+/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and
+/// SGPRn-4 respectively using the Finalizer "FLAT_SCRATCH" Register.
+///
+/// The global segment can be accessed either using flat operations or buffer
+/// operations. If buffer operations are used then the Global Buffer used to
+/// access HSAIL Global/Readonly/Kernarg (which are combined) segments using a
+/// segment address is not passed into the kernel code by CP since its base
+/// address is always 0. Instead the Finalizer generates prolog code to
+/// initialize 4 SGPRs with a V# that has the following properties, and then
+/// uses that in the buffer instructions:
+/// - base address of 0
+/// - no swizzle
+/// - ATC=1
+/// - MTYPE set to support memory coherence specified in
+/// amd_kernel_code_t.globalMemoryCoherence
+///
+/// When the Global Buffer is used to access the Kernarg segment, must add the
+/// dispatch packet kernArgPtr to a kernarg segment address before using this V#.
+/// Alternatively scalar loads can be used if the kernarg offset is uniform, as
+/// the kernarg segment is constant for the duration of the kernel execution.
+///
+typedef struct amd_kernel_code_s {
+ /// The AMD major version of the Code Object. Must be the value
+ /// AMD_CODE_VERSION_MAJOR.
+ amd_code_version32_t amd_code_version_major;
+
+ /// The AMD minor version of the Code Object. Minor versions must be
+ /// backward compatible. Must be the value
+ /// AMD_CODE_VERSION_MINOR.
+ amd_code_version32_t amd_code_version_minor;
+
+ /// The byte size of this struct. Must be set to
+ /// sizeof(amd_kernel_code_t). Used for backward
+ /// compatibility.
+ uint32_t struct_byte_size;
+
+ /// The target chip instruction set for which code has been
+ /// generated. Values are from the E_SC_INSTRUCTION_SET enumeration
+ /// in sc/Interface/SCCommon.h.
+ uint32_t target_chip;
+
+ /// Byte offset (possibly negative) from start of amd_kernel_code_t
+ /// object to kernel's entry point instruction. The actual code for
+ /// the kernel is required to be 256 byte aligned to match hardware
+ /// requirements (SQ cache line is 16). The code must be position
+ /// independent code (PIC) for AMD devices to give runtime the
+ /// option of copying code to discrete GPU memory or APU L2
+ /// cache. The Finalizer should endeavour to allocate all kernel
+ /// machine code in contiguous memory pages so that a device
+ /// pre-fetcher will tend to only pre-fetch Kernel Code objects,
+ /// improving cache performance.
+ int64_t kernel_code_entry_byte_offset;
+
+ /// Range of bytes to consider prefetching expressed as an offset
+ /// and size. The offset is from the start (possibly negative) of
+ /// amd_kernel_code_t object. Set both to 0 if no prefetch
+ /// information is available.
+ ///
+ /// \todo ttye 11/15/2013 Is the prefetch definition we want? Did
+ /// not make the size a uint64_t as prefetching more than 4GiB seems
+ /// excessive.
+ int64_t kernel_code_prefetch_byte_offset;
+ uint64_t kernel_code_prefetch_byte_size;
+
+ /// Number of bytes of scratch backing memory required for full
+ /// occupancy of target chip. This takes into account the number of
+ /// bytes of scratch per work-item, the wavefront size, the maximum
+ /// number of wavefronts per CU, and the number of CUs. This is an
+ /// upper limit on scratch. If the grid being dispatched is small it
+ /// may only need less than this. If the kernel uses no scratch, or
+ /// the Finalizer has not computed this value, it must be 0.
+ uint64_t max_scratch_backing_memory_byte_size;
+
+ /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+ /// COMPUTE_PGM_RSRC2 registers.
+ amd_compute_pgm_resource_register64_t compute_pgm_resource_registers;
+
+ /// Code properties. See amd_code_property_mask_t for a full list of
+ /// properties.
+ amd_code_property32_t code_properties;
+
+ /// The amount of memory required for the combined private, spill
+ /// and arg segments for a work-item in bytes. If
+ /// is_dynamic_callstack is 1 then additional space must be added to
+ /// this value for the call stack.
+ uint32_t workitem_private_segment_byte_size;
+
+ /// The amount of group segment memory required by a work-group in
+ /// bytes. This does not include any dynamically allocated group
+ /// segment memory that may be added when the kernel is
+ /// dispatched.
+ uint32_t workgroup_group_segment_byte_size;
+
+ /// Number of bytes of GDS required by kernel dispatch. Must be 0 if
+ /// not using GDS.
+ uint32_t gds_segment_byte_size;
+
+ /// The size in bytes of the kernarg segment that holds the values
+ /// of the arguments to the kernel. This could be used by CP to
+ /// prefetch the kernarg segment pointed to by the dispatch packet.
+ uint64_t kernarg_segment_byte_size;
+
+ /// Number of fbarrier's used in the kernel and all functions it
+ /// calls. If the implementation uses group memory to allocate the
+ /// fbarriers then that amount must already be included in the
+ /// workgroup_group_segment_byte_size total.
+ uint32_t workgroup_fbarrier_count;
+
+ /// Number of scalar registers used by a wavefront. This includes
+ /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
+ /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPRs added if a
+ /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
+ uint16_t wavefront_sgpr_count;
+
+ /// Number of vector registers used by each work-item. Used to set
+ /// COMPUTE_PGM_RSRC1.VGPRS.
+ uint16_t workitem_vgpr_count;
+
+ /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
+ /// first fixed VGPR number reserved.
+ uint16_t reserved_vgpr_first;
+
+ /// The number of consecutive VGPRs reserved by the client. If
+ /// is_debug_supported then this count includes VGPRs reserved
+ /// for debugger use.
+ uint16_t reserved_vgpr_count;
+
+ /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
+ /// first fixed SGPR number reserved.
+ uint16_t reserved_sgpr_first;
+
+ /// The number of consecutive SGPRs reserved by the client. If
+ /// is_debug_supported then this count includes SGPRs reserved
+ /// for debugger use.
+ uint16_t reserved_sgpr_count;
+
+ /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+ /// fixed SGPR number used to hold the wave scratch offset for the
+ /// entire kernel execution, or uint16_t(-1) if the register is not
+ /// used or not known.
+ uint16_t debug_wavefront_private_segment_offset_sgpr;
+
+ /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+ /// fixed SGPR number of the first of 4 SGPRs used to hold the
+ /// scratch V# used for the entire kernel execution, or uint16_t(-1)
+ /// if the registers are not used or not known.
+ uint16_t debug_private_segment_buffer_sgpr;
+
+ /// The maximum byte alignment of variables used by the kernel in
+ /// the specified memory segment. Expressed as a power of two. Must
+ /// be at least HSA_POWERTWO_16.
+ hsa_powertwo8_t kernarg_segment_alignment;
+ hsa_powertwo8_t group_segment_alignment;
+ hsa_powertwo8_t private_segment_alignment;
+
+ uint8_t reserved3;
+
+ /// Type of code object.
+ hsa_ext_code_kind32_t code_type;
+
+ /// Reserved for code properties if any are defined in the future.
+ /// There are currently no code properties so this field must be 0.
+ uint32_t reserved4;
+
+ /// Wavefront size expressed as a power of two. Must be a power of 2
+ /// in range 1..64 inclusive. Used to support runtime query that
+ /// obtains wavefront size, which may be used by application to
+ /// allocate dynamic group memory and set the dispatch work-group
+ /// size.
+ hsa_powertwo8_t wavefront_size;
+
+ /// The optimization level specified when the kernel was
+ /// finalized.
+ uint8_t optimization_level;
+
+ /// The HSAIL profile defines which features are used. This
+ /// information is from the HSAIL version directive. If this
+ /// amd_kernel_code_t is not generated from an HSAIL compilation
+ /// unit then must be 0.
+ hsa_ext_brig_profile8_t hsail_profile;
+
+ /// The HSAIL machine model gives the address sizes used by the
+ /// code. This information is from the HSAIL version directive. If
+ /// not generated from an HSAIL compilation unit then must still
+ /// indicate for what machine model the code is generated.
+ hsa_ext_brig_machine_model8_t hsail_machine_model;
+
+ /// The HSAIL major version. This information is from the HSAIL
+ /// version directive. If this amd_kernel_code_t is not
+ /// generated from an HSAIL compilation unit then must be 0.
+ uint32_t hsail_version_major;
+
+ /// The HSAIL minor version. This information is from the HSAIL
+ /// version directive. If this amd_kernel_code_t is not
+ /// generated from an HSAIL compilation unit then must be 0.
+ uint32_t hsail_version_minor;
+
+ /// Reserved for HSAIL target options if any are defined in the
+ /// future. There are currently no target options so this field
+ /// must be 0.
+ uint16_t reserved5;
+
+ /// Reserved. Must be 0.
+ uint16_t reserved6;
+
+ /// The values should be the actual values used by the finalizer
+ /// in generating the code. This may be the union of values
+ /// specified as finalizer arguments and explicit HSAIL control
+ /// directives. If the finalizer chooses to ignore a control
+ /// directive, and not generate constrained code, then the control
+ /// directive should not be marked as enabled even though it was
+ /// present in the HSAIL or finalizer argument. The values are
+ /// intended to reflect the constraints that the code actually
+ /// requires to correctly execute, not the values that were
+ /// actually specified at finalize time.
+ hsa_ext_control_directives_t control_directive;
+
+ /// The code can immediately follow the amd_kernel_code_t, or can
+ /// come after subsequent amd_kernel_code_t structs when there are
+ /// multiple kernels in the compilation unit.
+
+} amd_kernel_code_t;
+
+#endif // AMDKERNELCODET_H
diff --git a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
index 7ad815d..3b4ba1a 100644
--- a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
@@ -163,23 +163,22 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
MCInst Inst;
switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
- default: break;
- case Match_Success:
- Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, STI);
- return false;
- case Match_MissingFeature:
- return Error(IDLoc, "instruction use requires an option to be enabled");
- case Match_MnemonicFail:
- return Error(IDLoc, "unrecognized instruction mnemonic");
- case Match_InvalidOperand: {
- if (ErrorInfo != ~0ULL) {
- if (ErrorInfo >= Operands.size())
- return Error(IDLoc, "too few operands for instruction");
-
- }
- return Error(IDLoc, "invalid operand for instruction");
+ case Match_Success:
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst, STI);
+ return false;
+ case Match_MissingFeature:
+ return Error(IDLoc, "instruction use requires an option to be enabled");
+ case Match_MnemonicFail:
+ return Error(IDLoc, "unrecognized instruction mnemonic");
+ case Match_InvalidOperand: {
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
}
+ return Error(IDLoc, "invalid operand for instruction");
+ }
}
llvm_unreachable("Implement any new match types added!");
}
@@ -312,6 +311,7 @@ bool AMDGPUOperand::isSWaitCnt() const {
/// Force static initialization.
extern "C" void LLVMInitializeR600AsmParser() {
RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget);
+ RegisterMCAsmParser<AMDGPUAsmParser> B(TheGCNTarget);
}
#define GET_REGISTER_MATCHER
diff --git a/lib/Target/R600/CIInstructions.td b/lib/Target/R600/CIInstructions.td
new file mode 100644
index 0000000..3ac7af8
--- /dev/null
+++ b/lib/Target/R600/CIInstructions.td
@@ -0,0 +1,42 @@
+//===-- CIInstructions.td - CI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+
+
+def isCIVI : Predicate <
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
+ "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isCIVI in {
+
+defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64",
+ VOP_F64_F64, ftrunc
+>;
+defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64",
+ VOP_F64_F64, fceil
+>;
+defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64",
+ VOP_F64_F64, ffloor
+>;
+defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64",
+ VOP_F64_F64, frint
+>;
+defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32",
+ VOP_F32_F32
+>;
+defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32",
+ VOP_F32_F32
+>;
+} // End SubtargetPredicate = isCIVI
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index ed0a216..5a4bae2 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_target(R600CodeGen
SIAnnotateControlFlow.cpp
SIFixSGPRCopies.cpp
SIFixSGPRLiveRanges.cpp
+ SIFoldOperands.cpp
SIInsertWaits.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
@@ -50,6 +51,7 @@ add_llvm_target(R600CodeGen
SILowerControlFlow.cpp
SILowerI1Copies.cpp
SIMachineFunctionInfo.cpp
+ SIPrepareScratchRegs.cpp
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SITypeRewriter.cpp
diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td
index 58b5ce2..ba4df82 100644
--- a/lib/Target/R600/CaymanInstructions.td
+++ b/lib/Target/R600/CaymanInstructions.td
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-def isCayman : Predicate<"Subtarget.hasCaymanISA()">;
+def isCayman : Predicate<"Subtarget->hasCaymanISA()">;
//===----------------------------------------------------------------------===//
// Cayman Instructions
@@ -46,7 +46,7 @@ def SIN_cm : SIN_Common<0x8D>;
def COS_cm : COS_Common<0x8E>;
} // End isVector = 1
-defm : RsqPat<RECIPSQRT_IEEE_cm, f32>;
+def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td
index f24f76b..9f9472c 100644
--- a/lib/Target/R600/EvergreenInstructions.td
+++ b/lib/Target/R600/EvergreenInstructions.td
@@ -14,14 +14,14 @@
//===----------------------------------------------------------------------===//
def isEG : Predicate<
- "Subtarget.getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
- "Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "!Subtarget.hasCaymanISA()"
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
+ "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+ "!Subtarget->hasCaymanISA()"
>;
def isEGorCayman : Predicate<
- "Subtarget.getGeneration() == AMDGPUSubtarget::EVERGREEN ||"
- "Subtarget.getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||"
+ "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS"
>;
//===----------------------------------------------------------------------===//
@@ -69,7 +69,7 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
-defm : RsqPat<RECIPSQRT_IEEE_eg, f32>;
+def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
def SIN_eg : SIN_Common<0x8D>;
def COS_eg : COS_Common<0x8E>;
@@ -590,8 +590,6 @@ def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
// SHA-256 Patterns
def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
-def : FROUNDPat <CNDGE_eg, CNDGT_eg>;
-
def EG_ExportSwz : ExportSwzInst {
let Word1{19-16} = 0; // BURST_COUNT
let Word1{20} = 0; // VALID_PIXEL_MODE
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 64fe726..b66ed10 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -9,11 +9,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUInstPrinter.h"
-#include "SIDefines.h"
-
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/MathExtras.h"
@@ -74,7 +74,7 @@ void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
O << " offset:";
- printU16ImmOperand(MI, OpNo, O);
+ printU16ImmDecOperand(MI, OpNo, O);
}
}
@@ -208,7 +208,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
}
-void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) {
+void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) {
int32_t SImm = static_cast<int32_t>(Imm);
if (SImm >= -16 && SImm <= 64) {
O << SImm;
@@ -233,9 +233,37 @@ void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) {
O << "4.0";
else if (Imm == FloatToBits(-4.0f))
O << "-4.0";
- else {
+ else
O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) {
+ int64_t SImm = static_cast<int64_t>(Imm);
+ if (SImm >= -16 && SImm <= 64) {
+ O << SImm;
+ return;
}
+
+ if (Imm == DoubleToBits(0.0))
+ O << "0.0";
+ else if (Imm == DoubleToBits(1.0))
+ O << "1.0";
+ else if (Imm == DoubleToBits(-1.0))
+ O << "-1.0";
+ else if (Imm == DoubleToBits(0.5))
+ O << "0.5";
+ else if (Imm == DoubleToBits(-0.5))
+ O << "-0.5";
+ else if (Imm == DoubleToBits(2.0))
+ O << "2.0";
+ else if (Imm == DoubleToBits(-2.0))
+ O << "-2.0";
+ else if (Imm == DoubleToBits(4.0))
+ O << "4.0";
+ else if (Imm == DoubleToBits(-4.0))
+ O << "-4.0";
+ else
+ llvm_unreachable("64-bit literal constants not supported");
}
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -253,14 +281,39 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
break;
}
} else if (Op.isImm()) {
- printImmediate(Op.getImm(), O);
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ int RCID = Desc.OpInfo[OpNo].RegClass;
+ if (RCID != -1) {
+ const MCRegisterClass &ImmRC = MRI.getRegClass(RCID);
+ if (ImmRC.getSize() == 4)
+ printImmediate32(Op.getImm(), O);
+ else if (ImmRC.getSize() == 8)
+ printImmediate64(Op.getImm(), O);
+ else
+ llvm_unreachable("Invalid register class size");
+ } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) {
+ printImmediate32(Op.getImm(), O);
+ } else {
+ // We hit this for the immediate instruction bits that don't yet have a
+ // custom printer.
+ // TODO: Eventually this should be unnecessary.
+ O << formatDec(Op.getImm());
+ }
} else if (Op.isFPImm()) {
-
// We special case 0.0 because otherwise it will be printed as an integer.
if (Op.getFPImm() == 0.0)
O << "0.0";
- else
- printImmediate(FloatToBits(Op.getFPImm()), O);
+ else {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass);
+
+ if (ImmRC.getSize() == 4)
+ printImmediate32(FloatToBits(Op.getFPImm()), O);
+ else if (ImmRC.getSize() == 8)
+ printImmediate64(DoubleToBits(Op.getFPImm()), O);
+ else
+ llvm_unreachable("Invalid register class size");
+ }
} else if (Op.isExpr()) {
const MCExpr *Exp = Op.getExpr();
Exp->print(O);
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 4c06ac0..1d43c7a 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -48,7 +48,8 @@ private:
void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printRegOperand(unsigned RegNo, raw_ostream &O);
- void printImmediate(uint32_t Imm, raw_ostream &O);
+ void printImmediate32(uint32_t I, raw_ostream &O);
+ void printImmediate64(uint64_t I, raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index 5fb311b..d0c634f 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -29,7 +29,7 @@ public:
const MCAsmLayout &Layout) override {
//XXX: Implement if necessary.
}
- void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
MCValue Target, bool &IsPCRel,
uint64_t &FixedValue) override {
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 3c2b889..19d89fb 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -17,6 +17,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfoELF() {
MaxInstLength = 16;
SeparatorString = "\n";
CommentString = ";";
+ PrivateLabelPrefix = "";
InlineAsmStart = ";#ASMSTART";
InlineAsmEnd = ";#ASMEND";
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 8731055..83403ba 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUMCTargetDesc.h"
#include "AMDGPUMCAsmInfo.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "SIDefines.h"
#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -92,20 +93,29 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
extern "C" void LLVMInitializeR600TargetMC() {
RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget);
+ RegisterMCAsmInfo<AMDGPUMCAsmInfo> Z(TheGCNTarget);
TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheGCNTarget, createAMDGPUMCCodeGenInfo);
TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheGCNTarget, createAMDGPUMCInstrInfo);
TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheGCNTarget, createAMDGPUMCRegisterInfo);
TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheGCNTarget, createAMDGPUMCSubtargetInfo);
TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheGCNTarget, createAMDGPUMCInstPrinter);
TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createAMDGPUMCCodeEmitter);
TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheGCNTarget, createAMDGPUAsmBackend);
TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheGCNTarget, createMCStreamer);
}
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
index c019766..bc8cd53 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -30,6 +30,7 @@ class Target;
class raw_ostream;
extern Target TheAMDGPUTarget;
+extern Target TheGCNTarget;
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index dc1344f..8a555ff 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -30,8 +30,8 @@ using namespace llvm;
namespace {
class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
- R600MCCodeEmitter(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION;
- void operator=(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ R600MCCodeEmitter(const R600MCCodeEmitter &) = delete;
+ void operator=(const R600MCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
index 999fd0d..7e23772 100644
--- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -14,10 +14,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "SIDefines.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixup.h"
@@ -31,15 +31,9 @@ using namespace llvm;
namespace {
-/// \brief Helper type used in encoding
-typedef union {
- int32_t I;
- float F;
-} IntFloatUnion;
-
class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
- SIMCCodeEmitter(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION;
- void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ SIMCCodeEmitter(const SIMCCodeEmitter &) = delete;
+ void operator=(const SIMCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
MCContext &Ctx;
@@ -48,7 +42,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
/// \brief Encode an fp or int literal
- uint32_t getLitEncoding(const MCOperand &MO) const;
+ uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const;
public:
SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
@@ -85,60 +79,107 @@ MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc,
unsigned OpNo) const {
- unsigned RegClass = Desc.OpInfo[OpNo].RegClass;
- return (AMDGPU::SSrc_32RegClassID == RegClass) ||
- (AMDGPU::SSrc_64RegClassID == RegClass) ||
- (AMDGPU::VSrc_32RegClassID == RegClass) ||
- (AMDGPU::VSrc_64RegClassID == RegClass) ||
- (AMDGPU::VCSrc_32RegClassID == RegClass) ||
- (AMDGPU::VCSrc_64RegClassID == RegClass);
+ unsigned OpType = Desc.OpInfo[OpNo].OperandType;
+
+ return OpType == AMDGPU::OPERAND_REG_IMM32 ||
+ OpType == AMDGPU::OPERAND_REG_INLINE_C;
}
-uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const {
+// Returns the encoding value to use if the given integer is an integer inline
+// immediate value, or 0 if it is not.
+template <typename IntTy>
+static uint32_t getIntInlineImmEncoding(IntTy Imm) {
+ if (Imm >= 0 && Imm <= 64)
+ return 128 + Imm;
- IntFloatUnion Imm;
- if (MO.isImm())
- Imm.I = MO.getImm();
- else if (MO.isFPImm())
- Imm.F = MO.getFPImm();
- else if (MO.isExpr())
- return 255;
- else
- return ~0;
+ if (Imm >= -16 && Imm <= -1)
+ return 192 + std::abs(Imm);
- if (Imm.I >= 0 && Imm.I <= 64)
- return 128 + Imm.I;
+ return 0;
+}
- if (Imm.I >= -16 && Imm.I <= -1)
- return 192 + abs(Imm.I);
+static uint32_t getLit32Encoding(uint32_t Val) {
+ uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val));
+ if (IntImm != 0)
+ return IntImm;
- if (Imm.F == 0.5f)
+ if (Val == FloatToBits(0.5f))
return 240;
- if (Imm.F == -0.5f)
+ if (Val == FloatToBits(-0.5f))
return 241;
- if (Imm.F == 1.0f)
+ if (Val == FloatToBits(1.0f))
return 242;
- if (Imm.F == -1.0f)
+ if (Val == FloatToBits(-1.0f))
return 243;
- if (Imm.F == 2.0f)
+ if (Val == FloatToBits(2.0f))
return 244;
- if (Imm.F == -2.0f)
+ if (Val == FloatToBits(-2.0f))
return 245;
- if (Imm.F == 4.0f)
+ if (Val == FloatToBits(4.0f))
return 246;
- if (Imm.F == -4.0f)
+ if (Val == FloatToBits(-4.0f))
return 247;
return 255;
}
+static uint32_t getLit64Encoding(uint64_t Val) {
+ uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val));
+ if (IntImm != 0)
+ return IntImm;
+
+ if (Val == DoubleToBits(0.5))
+ return 240;
+
+ if (Val == DoubleToBits(-0.5))
+ return 241;
+
+ if (Val == DoubleToBits(1.0))
+ return 242;
+
+ if (Val == DoubleToBits(-1.0))
+ return 243;
+
+ if (Val == DoubleToBits(2.0))
+ return 244;
+
+ if (Val == DoubleToBits(-2.0))
+ return 245;
+
+ if (Val == DoubleToBits(4.0))
+ return 246;
+
+ if (Val == DoubleToBits(-4.0))
+ return 247;
+
+ return 255;
+}
+
+uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
+ unsigned OpSize) const {
+ if (MO.isExpr())
+ return 255;
+
+ assert(!MO.isFPImm());
+
+ if (!MO.isImm())
+ return ~0;
+
+ if (OpSize == 4)
+ return getLit32Encoding(static_cast<uint32_t>(MO.getImm()));
+
+ assert(OpSize == 8);
+
+ return getLit64Encoding(static_cast<uint64_t>(MO.getImm()));
+}
+
void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -161,25 +202,24 @@ void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
if (!isSrcOperand(Desc, i))
continue;
+ int RCID = Desc.OpInfo[i].RegClass;
+ const MCRegisterClass &RC = MRI.getRegClass(RCID);
+
// Is this operand a literal immediate?
const MCOperand &Op = MI.getOperand(i);
- if (getLitEncoding(Op) != 255)
+ if (getLitEncoding(Op, RC.getSize()) != 255)
continue;
// Yes! Encode it
- IntFloatUnion Imm;
+ int64_t Imm = 0;
+
if (Op.isImm())
- Imm.I = Op.getImm();
- else if (Op.isFPImm())
- Imm.F = Op.getFPImm();
- else {
- assert(Op.isExpr());
- // This will be replaced with a fixup value.
- Imm.I = 0;
- }
+ Imm = Op.getImm();
+ else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
+ llvm_unreachable("Must be immediate or expr");
for (unsigned j = 0; j < 4; j++) {
- OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff));
+ OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff));
}
// Only one literal value allowed
@@ -234,7 +274,10 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
if (isSrcOperand(Desc, OpNo)) {
- uint32_t Enc = getLitEncoding(MO);
+ int RCID = Desc.OpInfo[OpNo].RegClass;
+ const MCRegisterClass &RC = MRI.getRegClass(RCID);
+
+ uint32_t Enc = getLitEncoding(MO, RC.getSize());
if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
return Enc;
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index ce17d7c..fb5aa61 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -83,28 +83,44 @@ def : Proc<"cayman", R600_VLIW4_Itin,
// Southern Islands
//===----------------------------------------------------------------------===//
-def : Proc<"SI", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"SI", SIFullSpeedModel,
+ [FeatureSouthernIslands, FeatureFastFMAF32]
+>;
-def : Proc<"tahiti", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"tahiti", SIFullSpeedModel,
+ [FeatureSouthernIslands, FeatureFastFMAF32]
+>;
-def : Proc<"pitcairn", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"verde", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"oland", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"hainan", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
//===----------------------------------------------------------------------===//
// Sea Islands
//===----------------------------------------------------------------------===//
-def : Proc<"bonaire", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"bonaire", SIQuarterSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kabini", SIQuarterSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kaveri", SIQuarterSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"hawaii", SIFullSpeedModel,
+ [FeatureSeaIslands, FeatureFastFMAF32]
+>;
-def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"mullins", SIQuarterSpeedModel, [FeatureSeaIslands]>;
+
+//===----------------------------------------------------------------------===//
+// Volcanic Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"tonga", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
+
+def : ProcessorModel<"iceland", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
+
+def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index edaf278..c8f37f6 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -39,14 +39,14 @@ struct CFStack {
FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
};
- const AMDGPUSubtarget &ST;
+ const AMDGPUSubtarget *ST;
std::vector<StackItem> BranchStack;
std::vector<StackItem> LoopStack;
unsigned MaxStackSize;
unsigned CurrentEntries;
unsigned CurrentSubEntries;
- CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
+ CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st),
// We need to reserve a stack entry for CALL_FS in vertex shaders.
MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
CurrentEntries(0), CurrentSubEntries(0) { }
@@ -76,11 +76,11 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) {
}
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
- if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
+ if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
getLoopDepth() > 1)
return true;
- if (!ST.hasCFAluBug())
+ if (!ST->hasCFAluBug())
return false;
switch(Opcode) {
@@ -91,7 +91,7 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
case AMDGPU::CF_ALU_CONTINUE:
if (CurrentSubEntries == 0)
return false;
- if (ST.getWavefrontSize() == 64) {
+ if (ST->getWavefrontSize() == 64) {
// We are being conservative here. We only require this work-around if
// CurrentSubEntries > 3 &&
// (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
@@ -102,7 +102,7 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
// resources without any problems.
return CurrentSubEntries > 3;
} else {
- assert(ST.getWavefrontSize() == 32);
+ assert(ST->getWavefrontSize() == 32);
// We are being conservative here. We only require the work-around if
// CurrentSubEntries > 7 &&
// (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
@@ -118,8 +118,8 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
default:
return 0;
case CFStack::FIRST_NON_WQM_PUSH:
- assert(!ST.hasCaymanISA());
- if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
+ assert(!ST->hasCaymanISA());
+ if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
// +1 For the push operation.
// +2 Extra space required.
return 3;
@@ -132,7 +132,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
return 2;
}
case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
- assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+ assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
// +1 For the push operation.
// +1 Extra space required.
return 2;
@@ -153,13 +153,14 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
case AMDGPU::CF_PUSH_EG:
case AMDGPU::CF_ALU_PUSH_BEFORE:
if (!isWQM) {
- if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
+ if (!ST->hasCaymanISA() &&
+ !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI
// See comment in
// CFStack::getSubEntrySize()
else if (CurrentEntries > 0 &&
- ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
- !ST.hasCaymanISA() &&
+ ST->getGeneration() > AMDGPUSubtarget::EVERGREEN &&
+ !ST->hasCaymanISA() &&
!branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
else
@@ -219,7 +220,7 @@ private:
const R600InstrInfo *TII;
const R600RegisterInfo *TRI;
unsigned MaxFetchInst;
- const AMDGPUSubtarget &ST;
+ const AMDGPUSubtarget *ST;
bool IsTrivialInst(MachineInstr *MI) const {
switch (MI->getOpcode()) {
@@ -233,7 +234,7 @@ private:
const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
unsigned Opcode = 0;
- bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+ bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
switch (CFI) {
case CF_TC:
Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
@@ -266,7 +267,7 @@ private:
Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
break;
case CF_END:
- if (ST.hasCaymanISA()) {
+ if (ST->hasCaymanISA()) {
Opcode = AMDGPU::CF_END_CM;
break;
}
@@ -467,17 +468,14 @@ private:
}
public:
- R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
- TII (nullptr), TRI(nullptr),
- ST(tm.getSubtarget<AMDGPUSubtarget>()) {
- const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
- MaxFetchInst = ST.getTexVTXClauseSize();
- }
+ R600ControlFlowFinalizer(TargetMachine &tm)
+ : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {}
bool runOnMachineFunction(MachineFunction &MF) override {
- TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
- TRI = static_cast<const R600RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
+ ST = &MF.getSubtarget<AMDGPUSubtarget>();
+ MaxFetchInst = ST->getTexVTXClauseSize();
+ TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo());
+ TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo());
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
CFStack CFStack(ST, MFI->getShaderType());
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index a214e53..c738611 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -30,9 +30,9 @@
using namespace llvm;
-R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
- AMDGPUTargetLowering(TM),
- Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
+R600TargetLowering::R600TargetLowering(TargetMachine &TM,
+ const AMDGPUSubtarget &STI)
+ : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
@@ -40,7 +40,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
- computeRegisterProperties();
+ computeRegisterProperties(STI.getRegisterInfo());
// Set condition code actions
setCondCodeAction(ISD::SETO, MVT::f32, Expand);
@@ -122,12 +122,19 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
// EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
// spaces, so it is custom lowered to handle those where it isn't.
- setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
+
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
+ }
setOperationAction(ISD::STORE, MVT::i8, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
@@ -181,8 +188,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SUBE, VT, Expand);
}
- setBooleanContents(ZeroOrNegativeOneBooleanContent);
- setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
setSchedulingPreference(Sched::Source);
}
@@ -192,7 +197,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock::iterator I = *MI;
const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());
+ static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
switch (MI->getOpcode()) {
default:
@@ -647,9 +652,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
MachineSDNode *interp;
if (ijb < 0) {
- const MachineFunction &MF = DAG.getMachineFunction();
- const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
- MF.getSubtarget().getInstrInfo());
+ const R600InstrInfo *TII =
+ static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
return DAG.getTargetExtractSubreg(
@@ -1115,6 +1119,13 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
SDValue CC = Op.getOperand(4);
SDValue Temp;
+ if (VT == MVT::f32) {
+ DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
+ SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
+ if (MinMax)
+ return MinMax;
+ }
+
// LHS and RHS are guaranteed to be the same value type
EVT CompareVT = LHS.getValueType();
@@ -1369,8 +1380,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
- getTargetMachine().getSubtargetImpl()->getFrameLowering());
+ const AMDGPUFrameLowering *TFL =
+ static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
unsigned StackWidth = TFL->getStackWidth(MF);
Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1567,8 +1578,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
// Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
- getTargetMachine().getSubtargetImpl()->getFrameLowering());
+ const AMDGPUFrameLowering *TFL =
+ static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
unsigned StackWidth = TFL->getStackWidth(MF);
Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1682,7 +1693,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
// XXX - I think PartOffset should give you this, but it seems to give the
// size of the register which isn't useful.
- unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
+ unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
unsigned PartOffset = VA.getLocMemOffset();
unsigned Offset = 36 + VA.getLocMemOffset();
@@ -2172,9 +2183,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
unsigned Opcode = Node->getMachineOpcode();
SDValue FakeOp;
- std::vector<SDValue> Ops;
- for (const SDUse &I : Node->ops())
- Ops.push_back(I);
+ std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
if (Opcode == AMDGPU::DOT_4) {
int OperandIdx[] = {
@@ -2236,10 +2245,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
AMDGPU::OpName::clamp);
if (ClampIdx < 0)
return Node;
- std::vector<SDValue> Ops;
- unsigned NumOp = Src.getNumOperands();
- for(unsigned i = 0; i < NumOp; ++i)
- Ops.push_back(Src.getOperand(i));
+ std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
Node->getVTList(), Ops);
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index 10ebc10..c547195 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -23,7 +23,7 @@ class R600InstrInfo;
class R600TargetLowering : public AMDGPUTargetLowering {
public:
- R600TargetLowering(TargetMachine &TM);
+ R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock * BB) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index b6c00f8..291fb04 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -335,10 +335,11 @@ def load_param : LoadParamFrag<load>;
def load_param_exti8 : LoadParamFrag<az_extloadi8>;
def load_param_exti16 : LoadParamFrag<az_extloadi16>;
-def isR600 : Predicate<"Subtarget.getGeneration() <= AMDGPUSubtarget::R700">;
+def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">;
-def isR600toCayman : Predicate<
- "Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
+def isR600toCayman
+ : Predicate<
+ "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
//===----------------------------------------------------------------------===//
// R600 SDNodes
@@ -579,6 +580,7 @@ i32imm:$COUNT, i32imm:$Enabled),
let ALT_CONST = 0;
let WHOLE_QUAD_MODE = 0;
let BARRIER = 1;
+ let isCodeGenOnly = 1;
let UseNamedOperandTable = 1;
let Inst{31-0} = Word0;
@@ -641,6 +643,7 @@ def FETCH_CLAUSE : AMDGPUInst <(outs),
field bits<8> Inst;
bits<8> num;
let Inst = num;
+ let isCodeGenOnly = 1;
}
def ALU_CLAUSE : AMDGPUInst <(outs),
@@ -648,10 +651,13 @@ def ALU_CLAUSE : AMDGPUInst <(outs),
field bits<8> Inst;
bits<8> num;
let Inst = num;
+ let isCodeGenOnly = 1;
}
def LITERALS : AMDGPUInst <(outs),
(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > {
+ let isCodeGenOnly = 1;
+
field bits<64> Inst;
bits<32> literal1;
bits<32> literal2;
@@ -698,7 +704,7 @@ def SGE : R600_2OP <
def SNE : R600_2OP <
0xB, "SETNE",
- [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))]
>;
def SETE_DX10 : R600_2OP <
@@ -716,9 +722,10 @@ def SETGE_DX10 : R600_2OP <
[(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))]
>;
+// FIXME: This should probably be COND_ONE
def SETNE_DX10 : R600_2OP <
0xF, "SETNE_DX10",
- [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE))]
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))]
>;
def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
@@ -913,7 +920,7 @@ class MULADD_Common <bits<5> inst> : R600_3OP <
class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
inst, "MULADD_IEEE",
- [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
+ [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))]
>;
class FMA_Common <bits<5> inst> : R600_3OP <
@@ -1141,16 +1148,6 @@ class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ie
(exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
>;
-// FROUND pattern
-class FROUNDPat<Instruction CNDGE, Instruction CNDGT> : Pat <
- (AMDGPUround f32:$x),
- (CNDGE $x,
- (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)),
- (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
- )
->;
-
-
//===----------------------------------------------------------------------===//
// R600 / R700 Instructions
//===----------------------------------------------------------------------===//
@@ -1192,9 +1189,7 @@ let Predicates = [isR600] in {
def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
- defm : RsqPat<RECIPSQRT_IEEE_r600, f32>;
-
- def : FROUNDPat <CNDGE_r600, CNDGT_r600>;
+ def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
def R600_ExportSwz : ExportSwzInst {
let Word1{20-17} = 0; // BURST_COUNT
@@ -1248,6 +1243,7 @@ let Predicates = [isR600] in {
def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR),
"PUSH_ELSE @$ADDR"> {
let CNT = 0;
+ let POP_COUNT = 0; // FIXME?
}
def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
"ELSE @$ADDR POP:$POP_COUNT"> {
@@ -1364,7 +1360,7 @@ def CONST_COPY : Instruction {
let Pattern =
[(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
let AsmString = "CONST_COPY";
- let neverHasSideEffects = 1;
+ let hasSideEffects = 0;
let isAsCheapAsAMove = 1;
let Itinerary = NullALU;
}
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index d782713..bcde5fb 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -16,7 +16,7 @@
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -26,17 +26,16 @@ using namespace llvm;
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
DAG = static_cast<ScheduleDAGMILive*>(dag);
+ const AMDGPUSubtarget &ST = DAG->MF.getSubtarget<AMDGPUSubtarget>();
TII = static_cast<const R600InstrInfo*>(DAG->TII);
TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
- VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
+ VLIW5 = !ST.hasCaymanISA();
MRI = &DAG->MRI;
CurInstKind = IDOther;
CurEmitted = 0;
OccupedSlotsMask = 31;
InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
InstKindLimit[IDOther] = 32;
-
- const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
AluInstCount = 0;
FetchInstCount = 0;
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index ddf68c9..deee5bc 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -153,7 +153,7 @@ public:
TII(static_cast<const R600InstrInfo *>(
MF.getSubtarget().getInstrInfo())),
TRI(TII->getRegisterInfo()) {
- VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
+ VLIW5 = !MF.getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
}
// initPacketizerState - initialize some internal flags.
diff --git a/lib/Target/R600/R700Instructions.td b/lib/Target/R600/R700Instructions.td
index 9aad85d..613a0d7 100644
--- a/lib/Target/R600/R700Instructions.td
+++ b/lib/Target/R600/R700Instructions.td
@@ -13,7 +13,7 @@
//
//===----------------------------------------------------------------------===//
-def isR700 : Predicate<"Subtarget.getGeneration() == AMDGPUSubtarget::R700">;
+def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">;
let Predicates = [isR700] in {
def SIN_r700 : SIN_Common<0x6E>;
diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp
index 91eb60b..79f6532 100644
--- a/lib/Target/R600/SIAnnotateControlFlow.cpp
+++ b/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -14,6 +14,7 @@
#include "AMDGPU.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
@@ -66,6 +67,8 @@ class SIAnnotateControlFlow : public FunctionPass {
DominatorTree *DT;
StackVector Stack;
+ LoopInfo *LI;
+
bool isTopOfStack(BasicBlock *BB);
Value *popSaved();
@@ -99,6 +102,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
@@ -277,10 +281,26 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
push(Term->getSuccessor(0), Arg);
-}
-
-/// \brief Close the last opened control flow
+}
+
+/// \brief Close the last opened control flow
void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { // Inserts an EndCf call, fed by popSaved(), at the first insertion point of BB or of a block split off in front of it.
+ llvm::Loop *L = LI->getLoopFor(BB); // Loop that LoopInfo associates with BB; may be null.
+
+ if (L && L->getHeader() == BB) {
+ // We can't insert an EndCF call into a loop header, because it will
+ // get executed on every iteration of the loop, when it should be
+ // executed only once before the loop.
+ SmallVector <BasicBlock*, 8> Latches;
+ L->getLoopLatches(Latches);
+
+ std::vector<BasicBlock*> Preds; // Predecessors of BB that are not latches of L.
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+ if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end())
+ Preds.push_back(*PI);
+ }
+ BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, // BB is rebound to the block returned by the split, so the EndCf call below lands there.
+ LI, false);
+ }
+
CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt());
}
@@ -288,6 +308,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
/// recognize if/then/else and loops.
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
E = df_end(&F.getEntryBlock()); I != E; ++I) {
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 2e7dab6..b540140 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -8,25 +8,49 @@
/// \file
//===----------------------------------------------------------------------===//
+#include "llvm/MC/MCInstrDesc.h"
+
#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H
#define LLVM_LIB_TARGET_R600_SIDEFINES_H
namespace SIInstrFlags {
// This needs to be kept in sync with the field bits in InstSI.
enum {
- MIMG = 1 << 3,
- SMRD = 1 << 4,
- VOP1 = 1 << 5,
- VOP2 = 1 << 6,
- VOP3 = 1 << 7,
- VOPC = 1 << 8,
- SALU = 1 << 9,
- MUBUF = 1 << 10,
- MTBUF = 1 << 11,
- FLAT = 1 << 12
+ SALU = 1 << 3,
+ VALU = 1 << 4,
+
+ SOP1 = 1 << 5,
+ SOP2 = 1 << 6,
+ SOPC = 1 << 7,
+ SOPK = 1 << 8,
+ SOPP = 1 << 9,
+
+ VOP1 = 1 << 10,
+ VOP2 = 1 << 11,
+ VOP3 = 1 << 12,
+ VOPC = 1 << 13,
+
+ MUBUF = 1 << 14,
+ MTBUF = 1 << 15,
+ SMRD = 1 << 16,
+ DS = 1 << 17,
+ MIMG = 1 << 18,
+ FLAT = 1 << 19,
+ WQM = 1 << 20
};
}
+namespace llvm {
+namespace AMDGPU {
+ enum OperandType { // Operand kinds numbered starting at llvm::MCOI::OPERAND_FIRST_TARGET.
+ /// Operand with register or 32-bit immediate
+ OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET,
+ /// Operand with register or inline constant
+ OPERAND_REG_INLINE_C
+ };
+} // End namespace AMDGPU.
+} // End namespace llvm.
+
namespace SIInstrFlags {
enum Flags {
// First 4 bits are the instruction encoding
@@ -34,6 +58,21 @@ namespace SIInstrFlags {
EXP_CNT = 1 << 1,
LGKM_CNT = 1 << 2
};
+
+ // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
+ // The result is true if any of these tests are true.
+ enum ClassFlags {
+ S_NAN = 1 << 0, // Signaling NaN
+ Q_NAN = 1 << 1, // Quiet NaN
+ N_INFINITY = 1 << 2, // Negative infinity
+ N_NORMAL = 1 << 3, // Negative normal
+ N_SUBNORMAL = 1 << 4, // Negative subnormal
+ N_ZERO = 1 << 5, // Negative zero
+ P_ZERO = 1 << 6, // Positive zero
+ P_SUBNORMAL = 1 << 7, // Positive subnormal
+ P_NORMAL = 1 << 8, // Positive normal
+ P_INFINITY = 1 << 9 // Positive infinity
+ };
}
namespace SISrcMods {
@@ -61,7 +100,14 @@ namespace SIOutMods {
#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C
-#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0)
+#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0)
+#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1)
+#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7)
+#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8)
+#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9)
+#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10)
+#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11)
+
#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
@@ -118,4 +164,8 @@ namespace SIOutMods {
#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
+#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+
+
#endif
diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp
index d6f4b4c..cd1b3ac 100644
--- a/lib/Target/R600/SIFixSGPRCopies.cpp
+++ b/lib/Target/R600/SIFixSGPRCopies.cpp
@@ -136,12 +136,12 @@ const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses(
const MachineRegisterInfo &MRI,
unsigned Reg,
unsigned SubReg) const {
- // The Reg parameter to the function must always be defined by either a PHI
- // or a COPY, therefore it cannot be a physical register.
- assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
- "Reg cannot be a physical register");
- const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ const TargetRegisterClass *RC
+ = TargetRegisterInfo::isVirtualRegister(Reg) ?
+ MRI.getRegClass(Reg) :
+ TRI->getRegClass(Reg);
+
RC = TRI->getSubRegClass(RC, SubReg);
for (MachineRegisterInfo::use_instr_iterator
I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) {
@@ -182,7 +182,12 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
unsigned DstReg = Copy.getOperand(0).getReg();
unsigned SrcReg = Copy.getOperand(1).getReg();
unsigned SrcSubReg = Copy.getOperand(1).getSubReg();
- const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+
+ const TargetRegisterClass *DstRC
+ = TargetRegisterInfo::isVirtualRegister(DstReg) ?
+ MRI.getRegClass(DstReg) :
+ TRI->getRegClass(DstReg);
+
const TargetRegisterClass *SrcRC;
if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
@@ -217,20 +222,21 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
default: continue;
case AMDGPU::PHI: {
- DEBUG(dbgs() << " Fixing PHI:\n");
- DEBUG(MI.print(dbgs()));
+ DEBUG(dbgs() << "Fixing PHI: " << MI);
- for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
- unsigned Reg = MI.getOperand(i).getReg();
- const TargetRegisterClass *RC = inferRegClassFromDef(TRI, MRI, Reg,
- MI.getOperand(0).getSubReg());
- MRI.constrainRegClass(Reg, RC);
+ for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+ const MachineOperand &Op = MI.getOperand(i);
+ unsigned Reg = Op.getReg();
+ const TargetRegisterClass *RC
+ = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg());
+
+ MRI.constrainRegClass(Op.getReg(), RC);
}
unsigned Reg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg,
MI.getOperand(0).getSubReg());
- if (TRI->getCommonSubClass(RC, &AMDGPU::VReg_32RegClass)) {
- MRI.constrainRegClass(Reg, &AMDGPU::VReg_32RegClass);
+ if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) {
+ MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass);
}
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
new file mode 100644
index 0000000..ae4b05d
--- /dev/null
+++ b/lib/Target/R600/SIFoldOperands.cpp
@@ -0,0 +1,287 @@
+//===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-fold-operands"
+using namespace llvm;
+
+namespace {
+
+class SIFoldOperands : public MachineFunctionPass { // Machine-function pass whose work is done in runOnMachineFunction().
+public:
+ static char ID; // Passed to the MachineFunctionPass base constructor; also exported as llvm::SIFoldOperandsID.
+
+public:
+ SIFoldOperands() : MachineFunctionPass(ID) {
+ initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); // Registers this pass with the global pass registry on construction.
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "SI Fold Operands";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>(); // Declared as required; NOTE(review): no use of the dominator tree is visible in this file - confirm it is needed.
+ AU.setPreservesCFG(); // The pass declares that it leaves the CFG intact.
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+/// Describes one pending fold: operand \c UseOpNo of \c UseMI is to be
+/// replaced either by an immediate value or by a register operand.
+struct FoldCandidate {
+  /// Instruction whose operand will be rewritten.
+  MachineInstr *UseMI;
+  /// Index of the operand of UseMI that will be rewritten.
+  unsigned UseOpNo;
+  /// Register operand to fold; null when an immediate is folded instead.
+  MachineOperand *OpToFold;
+  /// Immediate value to fold; only meaningful when isImm() is true.
+  uint64_t ImmToFold;
+
+  /// Builds a candidate from \p FoldOp, which must be an immediate or a
+  /// register operand.
+  ///
+  /// For an immediate only its value is copied and \p FoldOp is not retained,
+  /// so the caller may pass a pointer to a short-lived operand.  For a
+  /// register the pointer itself is stored.
+  ///
+  /// Every member is given a defined value up front: a candidate is copied
+  /// when it is stored in a container, and copying an indeterminate
+  /// ImmToFold (the register case) must be avoided.
+  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp)
+      : UseMI(MI), UseOpNo(OpNo), OpToFold(nullptr), ImmToFold(0) {
+    if (FoldOp->isImm()) {
+      ImmToFold = FoldOp->getImm();
+    } else {
+      assert(FoldOp->isReg());
+      OpToFold = FoldOp;
+    }
+  }
+
+  /// \returns true if this candidate folds an immediate, i.e. no register
+  /// operand was recorded.
+  bool isImm() const {
+    return !OpToFold;
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
+ "SI Fold Operands", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) // Matches the addRequired<MachineDominatorTree>() in getAnalysisUsage().
+INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE,
+ "SI Fold Operands", false, false)
+
+char SIFoldOperands::ID = 0; // Definition of the static member declared in the class.
+
+char &llvm::SIFoldOperandsID = SIFoldOperands::ID; // Reference alias exposing the pass ID in namespace llvm.
+
+FunctionPass *llvm::createSIFoldOperandsPass() { // Factory: returns a newly allocated SIFoldOperands pass.
+ return new SIFoldOperands();
+}
+
+/// Decides whether an instruction with opcode \p Opcode may serve as the
+/// source of a fold.
+///
+/// \returns true for exactly the move and copy opcodes compared against
+/// below, and false for every other opcode.
+static bool isSafeToFold(unsigned Opcode) {
+  const bool IsVMov = Opcode == AMDGPU::V_MOV_B32_e32 ||
+                      Opcode == AMDGPU::V_MOV_B32_e64 ||
+                      Opcode == AMDGPU::V_MOV_B64_PSEUDO;
+  const bool IsSMov =
+      Opcode == AMDGPU::S_MOV_B32 || Opcode == AMDGPU::S_MOV_B64;
+
+  return IsVMov || IsSMov || Opcode == AMDGPU::COPY;
+}
+
+/// Applies \p Fold to the operand it names.
+///
+/// An immediate candidate turns the use operand into that immediate.  A
+/// register candidate is substituted only when both the use operand and the
+/// folded operand are virtual registers.
+///
+/// \returns true if the use operand was rewritten, false if nothing was done.
+static bool updateOperand(FoldCandidate &Fold,
+                          const TargetRegisterInfo &TRI) {
+  MachineOperand &Target = Fold.UseMI->getOperand(Fold.UseOpNo);
+  assert(Target.isReg());
+
+  if (Fold.isImm()) {
+    Target.ChangeToImmediate(Fold.ImmToFold);
+    return true;
+  }
+
+  const MachineOperand &Source = *Fold.OpToFold;
+  const bool BothVirtual =
+      TargetRegisterInfo::isVirtualRegister(Target.getReg()) &&
+      TargetRegisterInfo::isVirtualRegister(Source.getReg());
+
+  // FIXME: Handle physical registers.
+  if (!BothVirtual)
+    return false;
+
+  Target.substVirtReg(Source.getReg(), Source.getSubReg(), TRI);
+  return true;
+}
+
+/// Records a fold of \p OpToFold into operand \p OpNo of \p MI when that
+/// operand assignment is legal, commuting \p MI first if that is needed.
+///
+/// When the operand is not legal as-is, the instruction is commuted (provided
+/// findCommutedOpIndices() and commuteInstruction() both succeed) and
+/// legality is checked again at the index the operand moved to.  \p MI is
+/// left commuted even if that second check fails.
+///
+/// \returns true if a FoldCandidate was appended to \p FoldList.
+static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
+                             MachineInstr *MI, unsigned OpNo,
+                             MachineOperand *OpToFold,
+                             const SIInstrInfo *TII) {
+  if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {
+    unsigned Idx0;
+    unsigned Idx1;
+    if (!TII->findCommutedOpIndices(MI, Idx0, Idx1))
+      return false;
+
+    // Follow the operand to the slot it will occupy after commuting.
+    if (Idx0 == OpNo)
+      OpNo = Idx1;
+    else if (Idx1 == OpNo)
+      OpNo = Idx0;
+
+    if (!TII->commuteInstruction(MI))
+      return false;
+
+    if (!TII->isOperandLegal(MI, OpNo, OpToFold))
+      return false;
+  }
+
+  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+  return true;
+}
+
+/// Tries to fold the source operand of every foldable move/copy in \p MF into
+/// the instructions that use the move's result.
+///
+/// For each instruction accepted by isSafeToFold(), operand 1 (an immediate,
+/// or a virtual register without a subregister index) is offered to every
+/// non-implicit use of operand 0.  Operand rewrites are collected in a list
+/// while the uses are walked and applied afterwards by updateOperand().
+///
+/// \returns true if any instruction in \p MF was, or may have been, modified.
+bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  // Records that the function was (or may have been) rewritten so the pass
+  // manager is given an accurate modification status.
+  bool Changed = false;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      if (!isSafeToFold(MI.getOpcode()))
+        continue;
+
+      unsigned OpSize = TII->getOpSize(MI, 1);
+      MachineOperand &OpToFold = MI.getOperand(1);
+      bool FoldingImm = OpToFold.isImm();
+
+      // FIXME: We could also be folding things like FrameIndexes and
+      // TargetIndexes.
+      if (!FoldingImm && !OpToFold.isReg())
+        continue;
+
+      // Folding immediates with more than one use will increase program size.
+      // FIXME: This will also reduce register usage, which may be better
+      // in some cases. A better heuristic is needed.
+      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
+          !MRI.hasOneUse(MI.getOperand(0).getReg()))
+        continue;
+
+      // FIXME: Fold operands with subregs.
+      if (OpToFold.isReg() &&
+          (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
+           OpToFold.getSubReg()))
+        continue;
+
+      std::vector<FoldCandidate> FoldList;
+      for (MachineRegisterInfo::use_iterator
+           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
+           Use != E; ++Use) {
+
+        MachineInstr *UseMI = Use->getParent();
+        const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());
+
+        // FIXME: Fold operands with subregs.
+        if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
+            UseOp.isImplicit())) {
+          continue;
+        }
+
+        APInt Imm;
+
+        if (FoldingImm) {
+          unsigned UseReg = UseOp.getReg();
+          const TargetRegisterClass *UseRC
+            = TargetRegisterInfo::isVirtualRegister(UseReg) ?
+            MRI.getRegClass(UseReg) :
+            TRI.getRegClass(UseReg);
+
+          Imm = APInt(64, OpToFold.getImm());
+
+          // Split 64-bit constants into 32-bits for folding.
+          if (UseOp.getSubReg()) {
+            if (UseRC->getSize() != 8)
+              continue;
+
+            if (UseOp.getSubReg() == AMDGPU::sub0) {
+              Imm = Imm.getLoBits(32);
+            } else {
+              assert(UseOp.getSubReg() == AMDGPU::sub1);
+              Imm = Imm.getHiBits(32);
+            }
+          }
+
+          // In order to fold immediates into copies, we need to change the
+          // copy to a MOV.
+          if (UseMI->getOpcode() == AMDGPU::COPY) {
+            unsigned DestReg = UseMI->getOperand(0).getReg();
+            const TargetRegisterClass *DestRC
+              = TargetRegisterInfo::isVirtualRegister(DestReg) ?
+              MRI.getRegClass(DestReg) :
+              TRI.getRegClass(DestReg);
+
+            unsigned MovOp = TII->getMovOpcode(DestRC);
+            if (MovOp == AMDGPU::COPY)
+              continue;
+
+            // The COPY is turned into a MOV right here, whether or not the
+            // fold is accepted later, so this already counts as a change.
+            UseMI->setDesc(TII->get(MovOp));
+            Changed = true;
+          }
+        }
+
+        const MCInstrDesc &UseDesc = UseMI->getDesc();
+
+        // Don't fold into target independent nodes.  Target independent opcodes
+        // don't have defined register classes.
+        if (UseDesc.isVariadic() ||
+            UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
+          continue;
+
+        // tryAddToFoldList() may commute UseMI even when it ends up rejecting
+        // the fold, and its result does not reveal that, so a change is
+        // recorded conservatively for every attempt.
+        Changed = true;
+
+        if (FoldingImm) {
+          MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+          tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII);
+          continue;
+        }
+
+        tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII);
+
+        // FIXME: We could try to change the instruction from 64-bit to 32-bit
+        // to enable more folding opportunites.  The shrink operands pass
+        // already does this.
+      }
+
+      for (FoldCandidate &Fold : FoldList) {
+        if (updateOperand(Fold, TRI)) {
+          Changed = true;
+          // Clear kill flags.
+          if (!Fold.isImm()) {
+            assert(Fold.OpToFold && Fold.OpToFold->isReg());
+            Fold.OpToFold->setIsKill(false);
+          }
+          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
+                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
+        }
+      }
+    }
+  }
+  return Changed;
+}
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 8d4164a..7d794b8 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -35,8 +35,9 @@
using namespace llvm;
-SITargetLowering::SITargetLowering(TargetMachine &TM) :
- AMDGPUTargetLowering(TM) {
+SITargetLowering::SITargetLowering(TargetMachine &TM,
+ const AMDGPUSubtarget &STI)
+ : AMDGPUTargetLowering(TM, STI) {
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
@@ -44,7 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
- addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
+ addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
@@ -59,22 +60,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
- computeRegisterProperties();
-
- // Condition Codes
- setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
- setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
-
- setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
- setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
- setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
- setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
- setCondCodeAction(ISD::SETULE, MVT::f64, Expand);
- setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+ computeRegisterProperties(STI.getRegisterInfo());
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
@@ -104,12 +90,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::i32, Custom);
- setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
- setOperationAction(ISD::SELECT, MVT::f32, Promote);
- AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
@@ -147,26 +129,34 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
-
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
-
- setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-
- setTruncStoreAction(MVT::i32, MVT::i8, Custom);
- setTruncStoreAction(MVT::i32, MVT::i16, Custom);
+ for (MVT VT : MVT::integer_valuetypes()) {
+ if (VT == MVT::i64)
+ continue;
+
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
+ }
+
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand);
+ }
+
+ for (MVT VT : MVT::fp_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
@@ -213,13 +203,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
}
}
- for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) {
- MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I);
- setOperationAction(ISD::FTRUNC, VT, Expand);
- setOperationAction(ISD::FCEIL, VT, Expand);
- setOperationAction(ISD::FFLOOR, VT, Expand);
- }
-
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
@@ -228,6 +211,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
}
setOperationAction(ISD::FDIV, MVT::f32, Custom);
+ setOperationAction(ISD::FDIV, MVT::f64, Custom);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
@@ -235,7 +219,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
-
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::UINT_TO_FP);
// All memory operations. Some folding on the pointer operand is done to help
@@ -315,7 +300,7 @@ bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
return true;
}
-bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AddrSpace,
unsigned Align,
bool *IsFast) const {
@@ -327,9 +312,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
if (!VT.isSimple() || VT == MVT::Other)
return false;
- // XXX - CI changes say "Support for unaligned memory accesses" but I don't
- // see what for specifically. The wording everywhere else seems to be the
- // same.
+ // TODO - CI+ supports unaligned memory accesses, but this requires driver
+ // support.
// XXX - The only mention I see of this in the ISA manual is for LDS direct
// reads the "byte address and must be dword aligned". Is it also true for the
@@ -341,12 +325,18 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return Align % 4 == 0;
}
+ // Values smaller than a dword must be aligned.
+ // FIXME: This should be allowed on CI+
+ if (VT.bitsLT(MVT::i32))
+ return false;
+
// 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
// byte-address are ignored, thus forcing Dword alignment.
// This applies to private, global, and constant memory.
if (IsFast)
*IsFast = true;
- return VT.bitsGT(MVT::i32);
+
+ return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}
EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
@@ -379,8 +369,8 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const {
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
- const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
- getTargetMachine().getSubtargetImpl()->getInstrInfo());
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
return TII->isInlineConstant(Imm);
}
@@ -413,16 +403,11 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
}
SDValue SITargetLowering::LowerFormalArguments(
- SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
-
- const TargetMachine &TM = getTargetMachine();
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo());
+ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
MachineFunction &MF = DAG.getMachineFunction();
FunctionType *FType = MF.getFunction()->getFunctionType();
@@ -461,7 +446,7 @@ SDValue SITargetLowering::LowerFormalArguments(
// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
// three or five element vertex only needs three or five registers,
// NOT four or eigth.
- Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+ Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
for (unsigned j = 0; j != NumElements; ++j) {
@@ -489,7 +474,10 @@ SDValue SITargetLowering::LowerFormalArguments(
// The pointer to the list of arguments is stored in SGPR0, SGPR1
// The pointer to the scratch buffer is stored in SGPR2, SGPR3
if (Info->getShaderType() == ShaderType::COMPUTE) {
- Info->NumUserSGPRs = 4;
+ if (Subtarget->isAmdHsaOS())
+ Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
+ else
+ Info->NumUserSGPRs = 4;
unsigned InputPtrReg =
TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
@@ -541,7 +529,7 @@ SDValue SITargetLowering::LowerFormalArguments(
Offset, Ins[i].Flags.isSExt());
const PointerType *ParamTy =
- dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex));
+ dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// On SI local pointers are just offsets into LDS, so they are always
@@ -576,7 +564,7 @@ SDValue SITargetLowering::LowerFormalArguments(
if (Arg.VT.isVector()) {
// Build a vector from the registers
- Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+ Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
SmallVector<SDValue, 4> Regs;
@@ -589,8 +577,7 @@ SDValue SITargetLowering::LowerFormalArguments(
// Fill up the missing vector elements
NumElements = Arg.VT.getVectorNumElements() - NumElements;
- for (unsigned j = 0; j != NumElements; ++j)
- Regs.push_back(DAG.getUNDEF(VT));
+ Regs.append(NumElements, DAG.getUNDEF(VT));
InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
continue;
@@ -598,6 +585,12 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
+
+ if (Info->getShaderType() != ShaderType::COMPUTE) {
+ unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>(
+ AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
+ Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+ }
return Chain;
}
@@ -605,25 +598,14 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
MachineBasicBlock::iterator I = *MI;
- const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
- getTargetMachine().getSubtargetImpl()->getInstrInfo());
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
switch (MI->getOpcode()) {
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
- case AMDGPU::BRANCH: return BB;
- case AMDGPU::V_SUB_F64: {
- unsigned DestReg = MI->getOperand(0).getReg();
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
- .addImm(0) // SRC0 modifiers
- .addReg(MI->getOperand(1).getReg())
- .addImm(1) // SRC1 modifiers
- .addReg(MI->getOperand(2).getReg())
- .addImm(0) // CLAMP
- .addImm(0); // OMOD
- MI->eraseFromParent();
- break;
- }
+ case AMDGPU::BRANCH:
+ return BB;
case AMDGPU::SI_RegisterStorePseudo: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -640,17 +622,43 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
return BB;
}
-EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+ // This currently forces unfolding various combinations of fsub into fma with
+ // free fneg'd operands. As long as we have fast FMA (controlled by
+ // isFMAFasterThanFMulAndFAdd), we should perform these.
+
+ // When fma is quarter rate, for f64 where add / sub are at best half rate,
+ // most of these combines appear to be cycle neutral but save on instruction
+ // count / code size.
+ return true;
+}
+
+EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
if (!VT.isVector()) {
return MVT::i1;
}
- return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}
MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
return MVT::i32;
}
+// Answering this is somewhat tricky and depends on the specific device, since
+// devices have different rates for fma or all f64 operations.
+//
+// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
+// regardless of which device (although the number of cycles differs between
+// devices), so it is always profitable for f64.
+//
+// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
+// only on full rate devices. Normally, we should prefer selecting v_mad_f32
+// which we can always do even without fused FP ops since it returns the same
+// result as the separate operations and since it is always full
+// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
+// however does not support denormals, so we do report fma as faster if we have
+// a fast fma device and require denormals.
+//
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
VT = VT.getScalarType();
@@ -659,7 +667,11 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
- return false; /* There is V_MAD_F32 for f32 */
+ // This is as fast on some subtargets. However, we always have full rate f32
+ // mad available, which returns the same result as the separate operations
+ // and which we should prefer over fma. We can't use mad if we want to support
+ // denormals, so only report fma as faster in that case.
+ return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
case MVT::f64:
return true;
default:
@@ -755,15 +767,12 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
// Build the result and
- SmallVector<EVT, 4> Res;
- for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
- Res.push_back(Intr->getValueType(i));
+ ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
// operands of the new intrinsic call
SmallVector<SDValue, 4> Ops;
Ops.push_back(BRCOND.getOperand(0));
- for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
- Ops.push_back(Intr->getOperand(i));
+ Ops.append(Intr->op_begin() + 1, Intr->op_end());
Ops.push_back(Target);
// build the new intrinsic call
@@ -839,7 +848,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo*>(MF.getSubtarget().getRegisterInfo());
+ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -889,13 +898,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT);
case Intrinsic::r600_read_tidig_x:
- return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT);
case Intrinsic::r600_read_tidig_y:
- return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT);
case Intrinsic::r600_read_tidig_z:
- return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops[] = {
@@ -1090,7 +1099,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
const APFloat K1Val(BitsToFloat(0x2f800000));
const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32);
- const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32);
+ const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
@@ -1108,7 +1117,70 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
- return SDValue();
+ if (DAG.getTarget().Options.UnsafeFPMath)
+ return LowerFastFDIV(Op, DAG);
+
+ SDLoc SL(Op);
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+
+ const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
+
+ SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
+
+ SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
+
+ SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
+
+ SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
+
+ SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
+
+ SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
+
+ SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
+
+ SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
+
+ SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
+
+ SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
+ NegDivScale0, Mul, DivScale1);
+
+ SDValue Scale;
+
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ // Work around a hardware bug on SI where the condition output from div_scale
+ // is not usable.
+
+ const SDValue Hi = DAG.getConstant(1, MVT::i32);
+
+ // Figure out the scale to use for div_fmas.
+ SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+ SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
+ SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
+ SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
+
+ SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
+ SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
+
+ SDValue Scale0Hi
+ = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
+ SDValue Scale1Hi
+ = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
+
+ SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
+ SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
+ Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
+ } else {
+ Scale = DivScale1.getValue(1);
+ }
+
+ SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
+ Fma4, Fma3, Mul, Scale);
+
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}
SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
@@ -1129,11 +1201,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Store->getMemoryVT();
// These stores are legal.
- if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
- VT.isVector() && VT.getVectorNumElements() == 2 &&
- VT.getVectorElementType() == MVT::i32)
- return SDValue();
-
if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
if (VT.isVector() && VT.getVectorNumElements() > 4)
return ScalarizeVectorStore(Op, DAG);
@@ -1177,7 +1244,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
//===----------------------------------------------------------------------===//
SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
- DAGCombinerInfo &DCI) {
+ DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
EVT ScalarVT = VT.getScalarType();
if (ScalarVT != MVT::f32)
@@ -1225,8 +1292,21 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
-
LoadSDNode *Load = cast<LoadSDNode>(Src);
+
+ unsigned AS = Load->getAddressSpace();
+ unsigned Align = Load->getAlignment();
+ Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+
+ // Don't try to replace the load if we have to expand it due to alignment
+ // problems. Otherwise we will end up scalarizing the load, and trying to
+ // repack into the vector for no real reason.
+ if (Align < ABIAlignment &&
+ !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
+ return SDValue();
+ }
+
SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
Load->getChain(),
Load->getBasePtr(),
@@ -1297,8 +1377,8 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
if (!CAdd)
return SDValue();
- const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
- getTargetMachine().getSubtargetImpl()->getInstrInfo());
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
// If the resulting offset is too large, we can't fold it into the addressing
// mode offset.
@@ -1316,6 +1396,102 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
}
+SDValue SITargetLowering::performAndCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+
+ // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
+ // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ if (LHS.getOpcode() == ISD::SETCC &&
+ RHS.getOpcode() == ISD::SETCC) {
+ ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+ ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
+
+ SDValue X = LHS.getOperand(0);
+ SDValue Y = RHS.getOperand(0);
+ if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
+ return SDValue();
+
+ if (LCC == ISD::SETO) {
+ if (X != LHS.getOperand(1))
+ return SDValue();
+
+ if (RCC == ISD::SETUNE) {
+ const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
+ if (!C1 || !C1->isInfinity() || C1->isNegative())
+ return SDValue();
+
+ const uint32_t Mask = SIInstrFlags::N_NORMAL |
+ SIInstrFlags::N_SUBNORMAL |
+ SIInstrFlags::N_ZERO |
+ SIInstrFlags::P_ZERO |
+ SIInstrFlags::P_SUBNORMAL |
+ SIInstrFlags::P_NORMAL;
+
+ static_assert(((~(SIInstrFlags::S_NAN |
+ SIInstrFlags::Q_NAN |
+ SIInstrFlags::N_INFINITY |
+ SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
+ "mask not equal");
+
+ return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
+ X, DAG.getConstant(Mask, MVT::i32));
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue SITargetLowering::performOrCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+ if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+ RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+ SDValue Src = LHS.getOperand(0);
+ if (Src != RHS.getOperand(0))
+ return SDValue();
+
+ const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+ if (!CLHS || !CRHS)
+ return SDValue();
+
+ // Only 10 bits are used.
+ static const uint32_t MaxMask = 0x3ff;
+
+ uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+ return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
+ Src, DAG.getConstant(NewMask, MVT::i32));
+ }
+
+ return SDValue();
+}
+
+SDValue SITargetLowering::performClassCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Mask = N->getOperand(1);
+
+ // fp_class x, 0 -> false
+ if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
+ if (CMask->isNullValue())
+ return DAG.getConstant(0, MVT::i1);
+ }
+
+ return SDValue();
+}
+
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
switch (Opc) {
case ISD::FMAXNUM:
@@ -1371,33 +1547,47 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performSetCCCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ EVT VT = LHS.getValueType();
+
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return SDValue();
+
+ // Match isinf pattern
+ // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
+ const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+ if (!CRHS)
+ return SDValue();
+
+ const APFloat &APF = CRHS->getValueAPF();
+ if (APF.isInfinity() && !APF.isNegative()) {
+ unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
+ return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1,
+ LHS.getOperand(0), DAG.getConstant(Mask, MVT::i32));
+ }
+ }
+
+ return SDValue();
+}
+
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
- EVT VT = N->getValueType(0);
switch (N->getOpcode()) {
- default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
- case ISD::SETCC: {
- SDValue Arg0 = N->getOperand(0);
- SDValue Arg1 = N->getOperand(1);
- SDValue CC = N->getOperand(2);
- ConstantSDNode * C = nullptr;
- ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();
-
- // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
- if (VT == MVT::i1
- && Arg0.getOpcode() == ISD::SIGN_EXTEND
- && Arg0.getOperand(0).getValueType() == MVT::i1
- && (C = dyn_cast<ConstantSDNode>(Arg1))
- && C->isNullValue()
- && CCOp == ISD::SETNE) {
- return SimplifySetCC(VT, Arg0.getOperand(0),
- DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
- }
- break;
- }
+ default:
+ return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+ case ISD::SETCC:
+ return performSetCCCombine(N, DCI);
case ISD::FMAXNUM: // TODO: What about fmax_legacy?
case ISD::FMINNUM:
case AMDGPUISD::SMAX:
@@ -1442,6 +1632,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
if (VT != MVT::f32)
break;
+ // Only do this if we are not trying to support denormals. v_mad_f32 does
+ // not support denormals ever.
+ if (Subtarget->hasFP32Denormals())
+ break;
+
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -1452,8 +1647,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
if (LHS.getOpcode() == ISD::FADD) {
SDValue A = LHS.getOperand(0);
if (A == LHS.getOperand(1)) {
- const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32);
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS);
+ const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
+ return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
}
}
@@ -1461,12 +1656,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
if (RHS.getOpcode() == ISD::FADD) {
SDValue A = RHS.getOperand(0);
if (A == RHS.getOperand(1)) {
- const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32);
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS);
+ const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
+ return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
}
}
- break;
+ return SDValue();
}
case ISD::FSUB: {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
@@ -1476,39 +1671,22 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
// Try to get the fneg to fold into the source modifier. This undoes generic
// DAG combines and folds them into the mad.
- if (VT == MVT::f32) {
+ //
+ // Only do this if we are not trying to support denormals. v_mad_f32 does
+ // not support denormals ever.
+ if (VT == MVT::f32 &&
+ !Subtarget->hasFP32Denormals()) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
-
- if (LHS.getOpcode() == ISD::FMUL) {
- // (fsub (fmul a, b), c) -> mad a, b, (fneg c)
-
- SDValue A = LHS.getOperand(0);
- SDValue B = LHS.getOperand(1);
- SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS);
-
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
- }
-
- if (RHS.getOpcode() == ISD::FMUL) {
- // (fsub c, (fmul a, b)) -> mad (fneg a), b, c
-
- SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0));
- SDValue B = RHS.getOperand(1);
- SDValue C = LHS;
-
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
- }
-
if (LHS.getOpcode() == ISD::FADD) {
// (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
SDValue A = LHS.getOperand(0);
if (A == LHS.getOperand(1)) {
- const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32);
+ const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS);
+ return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS);
}
}
@@ -1517,10 +1695,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
SDValue A = RHS.getOperand(0);
if (A == RHS.getOperand(1)) {
- const SDValue NegTwo = DAG.getTargetConstantFP(-2.0, MVT::f32);
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS);
+ const SDValue NegTwo = DAG.getConstantFP(-2.0, MVT::f32);
+ return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS);
}
}
+
+ return SDValue();
}
break;
@@ -1554,9 +1734,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
if (NewPtr) {
- SmallVector<SDValue, 8> NewOps;
- for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I)
- NewOps.push_back(MemNode->getOperand(I));
+ SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end());
NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
@@ -1564,287 +1742,44 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
+ case ISD::AND:
+ return performAndCombine(N, DCI);
+ case ISD::OR:
+ return performOrCombine(N, DCI);
+ case AMDGPUISD::FP_CLASS:
+ return performClassCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
-/// \brief Test if RegClass is one of the VSrc classes
-static bool isVSrc(unsigned RegClass) {
- switch(RegClass) {
- default: return false;
- case AMDGPU::VSrc_32RegClassID:
- case AMDGPU::VCSrc_32RegClassID:
- case AMDGPU::VSrc_64RegClassID:
- case AMDGPU::VCSrc_64RegClassID:
- return true;
- }
-}
-
-/// \brief Test if RegClass is one of the SSrc classes
-static bool isSSrc(unsigned RegClass) {
- return AMDGPU::SSrc_32RegClassID == RegClass ||
- AMDGPU::SSrc_64RegClassID == RegClass;
-}
-
/// \brief Analyze the possible immediate value Op
///
/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
- union {
- int32_t I;
- float F;
- } Imm;
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
- if (Node->getZExtValue() >> 32) {
- return -1;
- }
- Imm.I = Node->getSExtValue();
- } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
- if (N->getValueType(0) != MVT::f32)
- return -1;
- Imm.F = Node->getValueAPF().convertToFloat();
- } else
- return -1; // It isn't an immediate
-
- if ((Imm.I >= -16 && Imm.I <= 64) ||
- Imm.F == 0.5f || Imm.F == -0.5f ||
- Imm.F == 1.0f || Imm.F == -1.0f ||
- Imm.F == 2.0f || Imm.F == -2.0f ||
- Imm.F == 4.0f || Imm.F == -4.0f)
- return 0; // It's an inline immediate
-
- return Imm.I; // It's a literal immediate
-}
-
-/// \brief Try to fold an immediate directly into an instruction
-bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
- bool &ScalarSlotUsed) const {
-
- MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
- const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
- getTargetMachine().getSubtargetImpl()->getInstrInfo());
- if (!Mov || !TII->isMov(Mov->getMachineOpcode()))
- return false;
-
- const SDValue &Op = Mov->getOperand(0);
- int32_t Value = analyzeImmediate(Op.getNode());
- if (Value == -1) {
- // Not an immediate at all
- return false;
-
- } else if (Value == 0) {
- // Inline immediates can always be fold
- Operand = Op;
- return true;
-
- } else if (Value == Immediate) {
- // Already fold literal immediate
- Operand = Op;
- return true;
-
- } else if (!ScalarSlotUsed && !Immediate) {
- // Fold this literal immediate
- ScalarSlotUsed = true;
- Immediate = Value;
- Operand = Op;
- return true;
+ if (TII->isInlineConstant(Node->getAPIntValue()))
+ return 0;
+ uint64_t Val = Node->getZExtValue();
+ return isUInt<32>(Val) ? Val : -1;
}
- return false;
-}
+ if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
+ if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt()))
+ return 0;
-const TargetRegisterClass *SITargetLowering::getRegClassForNode(
- SelectionDAG &DAG, const SDValue &Op) const {
- const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
- getTargetMachine().getSubtargetImpl()->getInstrInfo());
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
-
- if (!Op->isMachineOpcode()) {
- switch(Op->getOpcode()) {
- case ISD::CopyFromReg: {
- MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- return MRI.getRegClass(Reg);
- }
- return TRI.getPhysRegClass(Reg);
- }
- default: return nullptr;
- }
- }
- const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
- int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
- if (OpClassID != -1) {
- return TRI.getRegClass(OpClassID);
- }
- switch(Op.getMachineOpcode()) {
- case AMDGPU::COPY_TO_REGCLASS:
- // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
- OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
-
- // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
- // class, then the register class for the value could be either a
- // VReg or and SReg. In order to get a more accurate
- if (isVSrc(OpClassID))
- return getRegClassForNode(DAG, Op.getOperand(0));
-
- return TRI.getRegClass(OpClassID);
- case AMDGPU::EXTRACT_SUBREG: {
- int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- const TargetRegisterClass *SuperClass =
- getRegClassForNode(DAG, Op.getOperand(0));
- return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
- }
- case AMDGPU::REG_SEQUENCE:
- // Operand 0 is the register class id for REG_SEQUENCE instructions.
- return TRI.getRegClass(
- cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
- default:
- return getRegClassFor(Op.getSimpleValueType());
- }
-}
+ if (Node->getValueType(0) == MVT::f32)
+ return FloatToBits(Node->getValueAPF().convertToFloat());
-/// \brief Does "Op" fit into register class "RegClass" ?
-bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
- unsigned RegClass) const {
- const TargetRegisterInfo *TRI =
- getTargetMachine().getSubtargetImpl()->getRegisterInfo();
- const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
- if (!RC) {
- return false;
+ return -1;
}
- return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
-}
-/// \returns true if \p Node's operands are different from the SDValue list
-/// \p Ops
-static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
- for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
- if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
- return true;
- }
- }
- return false;
-}
-
-/// TODO: This needs to be removed. It's current primary purpose is to fold
-/// immediates into operands when legal. The legalization parts are redundant
-/// with SIInstrInfo::legalizeOperands which is called in a post-isel hook.
-SDNode *SITargetLowering::legalizeOperands(MachineSDNode *Node,
- SelectionDAG &DAG) const {
- // Original encoding (either e32 or e64)
- int Opcode = Node->getMachineOpcode();
- const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
- getTargetMachine().getSubtargetImpl()->getInstrInfo());
- const MCInstrDesc *Desc = &TII->get(Opcode);
-
- unsigned NumDefs = Desc->getNumDefs();
- unsigned NumOps = Desc->getNumOperands();
-
- // Commuted opcode if available
- int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
- const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev);
-
- assert(!DescRev || DescRev->getNumDefs() == NumDefs);
- assert(!DescRev || DescRev->getNumOperands() == NumOps);
-
- int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
- bool HaveVSrc = false, HaveSSrc = false;
-
- // First figure out what we already have in this instruction.
- for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
- i != e && Op < NumOps; ++i, ++Op) {
-
- unsigned RegClass = Desc->OpInfo[Op].RegClass;
- if (isVSrc(RegClass))
- HaveVSrc = true;
- else if (isSSrc(RegClass))
- HaveSSrc = true;
- else
- continue;
-
- int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
- if (Imm != -1 && Imm != 0) {
- // Literal immediate
- Immediate = Imm;
- }
- }
-
- // If we neither have VSrc nor SSrc, it makes no sense to continue.
- if (!HaveVSrc && !HaveSSrc)
- return Node;
-
- // No scalar allowed when we have both VSrc and SSrc
- bool ScalarSlotUsed = HaveVSrc && HaveSSrc;
-
- // If this instruction has an implicit use of VCC, then it can't use the
- // constant bus.
- for (unsigned i = 0, e = Desc->getNumImplicitUses(); i != e; ++i) {
- if (Desc->ImplicitUses[i] == AMDGPU::VCC) {
- ScalarSlotUsed = true;
- break;
- }
- }
-
- // Second go over the operands and try to fold them
- std::vector<SDValue> Ops;
- for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
- i != e && Op < NumOps; ++i, ++Op) {
-
- const SDValue &Operand = Node->getOperand(i);
- Ops.push_back(Operand);
-
- // Already folded immediate?
- if (isa<ConstantSDNode>(Operand.getNode()) ||
- isa<ConstantFPSDNode>(Operand.getNode()))
- continue;
-
- // Is this a VSrc or SSrc operand?
- unsigned RegClass = Desc->OpInfo[Op].RegClass;
- if (isVSrc(RegClass) || isSSrc(RegClass)) {
- // Try to fold the immediates. If this ends up with multiple constant bus
- // uses, it will be legalized later.
- foldImm(Ops[i], Immediate, ScalarSlotUsed);
- continue;
- }
-
- if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
-
- unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
- assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));
-
- // Test if it makes sense to swap operands
- if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
- (!fitsRegClass(DAG, Ops[1], RegClass) &&
- fitsRegClass(DAG, Ops[1], OtherRegClass))) {
-
- // Swap commutable operands
- std::swap(Ops[0], Ops[1]);
-
- Desc = DescRev;
- DescRev = nullptr;
- continue;
- }
- }
- }
-
- // Add optional chain and glue
- for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
- Ops.push_back(Node->getOperand(i));
-
- // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
- // this case a brand new node is always be created, even if the operands
- // are the same as before. So, manually check if anything has been changed.
- if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
- return Node;
- }
-
- // Create a complete new instruction
- return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ return -1;
}
/// \brief Helper function for adjustWritemask
@@ -1904,14 +1839,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Adjust the writemask in the node
std::vector<SDValue> Ops;
Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
- for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
- Ops.push_back(Node->getOperand(i));
+ Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end());
Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
// If we only got one lane, replace it with a copy
// (if NewDmask has only one bit set...)
if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
- SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
+ SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, MVT::i32);
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
SDLoc(), Users[Lane]->getValueType(0),
SDValue(Node, 0), RC);
@@ -1963,9 +1897,8 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
/// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
- getTargetMachine().getSubtargetImpl()->getInstrInfo());
- Node = AdjustRegClass(Node, DAG);
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
if (TII->isMIMG(Node->getMachineOpcode()))
adjustWritemask(Node, DAG);
@@ -1975,17 +1908,17 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
legalizeTargetIndependentNode(Node, DAG);
return Node;
}
-
- return legalizeOperands(Node, DAG);
+ return Node;
}
/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
SDNode *Node) const {
- const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
- getTargetMachine().getSubtargetImpl()->getInstrInfo());
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
TII->legalizeOperands(MI);
if (TII->isMIMG(MI->getOpcode())) {
@@ -1998,14 +1931,13 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
const TargetRegisterClass *RC;
switch (BitsSet) {
default: return;
- case 1: RC = &AMDGPU::VReg_32RegClass; break;
+ case 1: RC = &AMDGPU::VGPR_32RegClass; break;
case 2: RC = &AMDGPU::VReg_64RegClass; break;
case 3: RC = &AMDGPU::VReg_96RegClass; break;
}
unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
MI->setDesc(TII->get(NewOpcode));
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
MRI.setRegClass(VReg, RC);
return;
}
@@ -2030,6 +1962,8 @@ static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
SDLoc DL,
SDValue Ptr) const {
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
#if 1
// XXX - Workaround for moveToVALU not handling different register class
// inserts for REG_SEQUENCE.
@@ -2039,7 +1973,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32),
buildSMovImm32(DAG, DL, 0),
DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
- buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32),
+ buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
DAG.getTargetConstant(AMDGPU::sub1, MVT::i32)
};
@@ -2063,7 +1997,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
buildSMovImm32(DAG, DL, 0),
DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
- buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32),
+ buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
};
@@ -2110,57 +2044,14 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
SDLoc DL,
SDValue Ptr) const {
- uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
0xffffffff; // Size
return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
}
-MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
- SelectionDAG &DAG) const {
-
- SDLoc DL(N);
- unsigned NewOpcode = N->getMachineOpcode();
-
- switch (N->getMachineOpcode()) {
- default: return N;
- case AMDGPU::S_LOAD_DWORD_IMM:
- NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
- // Fall-through
- case AMDGPU::S_LOAD_DWORDX2_SGPR:
- if (NewOpcode == N->getMachineOpcode()) {
- NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
- }
- // Fall-through
- case AMDGPU::S_LOAD_DWORDX4_IMM:
- case AMDGPU::S_LOAD_DWORDX4_SGPR: {
- if (NewOpcode == N->getMachineOpcode()) {
- NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
- }
- if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
- return N;
- }
- ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
-
- const SDValue Zero64 = DAG.getTargetConstant(0, MVT::i64);
- SDValue Ptr(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Zero64), 0);
- MachineSDNode *RSrc = wrapAddr64Rsrc(DAG, DL, Ptr);
-
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(SDValue(RSrc, 0));
- Ops.push_back(N->getOperand(0));
- Ops.push_back(DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32));
-
- // Copy remaining operands so we keep any chain and glue nodes that follow
- // the normal operands.
- for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I)
- Ops.push_back(N->getOperand(I));
-
- return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
- }
- }
-}
-
SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 7bf406e..92f5847 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -42,27 +42,22 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
- bool foldImm(SDValue &Operand, int32_t &Immediate,
- bool &ScalarSlotUsed) const;
- const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG,
- const SDValue &Op) const;
- bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
- unsigned RegClass) const;
-
- SDNode *legalizeOperands(MachineSDNode *N, SelectionDAG &DAG) const;
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
- MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
- static SDValue performUCharToFloatCombine(SDNode *N,
- DAGCombinerInfo &DCI);
+ SDValue performUCharToFloatCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const;
SDValue performSHLPtrCombine(SDNode *N,
unsigned AS,
DAGCombinerInfo &DCI) const;
+ SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
public:
- SITargetLowering(TargetMachine &tm);
+ SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
EVT /*VT*/) const override;
@@ -94,6 +89,7 @@ public:
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
MachineBasicBlock * BB) const override;
+ bool enableAggressiveFMAFusion(EVT VT) const override;
EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
MVT getScalarShiftAmountTy(EVT VT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 712d97d..50f20ac 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -41,6 +41,12 @@ typedef union {
} Counters;
+typedef enum {
+ OTHER,
+ SMEM,
+ VMEM
+} InstType;
+
typedef Counters RegCounters[512];
typedef std::pair<unsigned, unsigned> RegInterval;
@@ -73,6 +79,11 @@ private:
/// \brief Different export instruction types seen since last wait.
unsigned ExpInstrTypesSeen;
+ /// \brief Type of the last opcode.
+ InstType LastOpcodeType;
+
+ bool LastInstWritesM0;
+
/// \brief Get increment/decrement amount for this instruction.
Counters getHwCounts(MachineInstr &MI);
@@ -83,7 +94,8 @@ private:
RegInterval getRegInterval(MachineOperand &Op);
/// \brief Handle instructions async components
- void pushInstruction(MachineInstr &MI);
+ void pushInstruction(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I);
/// \brief Insert the actual wait instruction
bool insertWait(MachineBasicBlock &MBB,
@@ -96,6 +108,9 @@ private:
/// \brief Resolve all operand dependencies to counter requirements
Counters handleOperands(MachineInstr &MI);
+ /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
+ void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+
public:
SIInsertWaits(TargetMachine &tm) :
MachineFunctionPass(ID),
@@ -176,6 +191,29 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
if (!MI.getDesc().mayStore())
return false;
+ // Check if this operand is the value being stored.
+ // Special case for DS instructions, since the address
+ // operand comes before the value operand and it may have
+ // multiple data operands.
+
+ if (TII->isDS(MI.getOpcode())) {
+ MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
+ if (Data && Op.isIdenticalTo(*Data))
+ return true;
+
+ MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+ if (Data0 && Op.isIdenticalTo(*Data0))
+ return true;
+
+ MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
+ if (Data1 && Op.isIdenticalTo(*Data1))
+ return true;
+
+ return false;
+ }
+
+ // NOTE: This assumes that the value operand is before the
+ // address operand, and that there is only one value operand.
for (MachineInstr::mop_iterator I = MI.operands_begin(),
E = MI.operands_end(); I != E; ++I) {
@@ -203,10 +241,11 @@ RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
return Result;
}
-void SIInsertWaits::pushInstruction(MachineInstr &MI) {
+void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
// Get the hardware counter increments and sum them up
- Counters Increment = getHwCounts(MI);
+ Counters Increment = getHwCounts(*I);
unsigned Sum = 0;
for (unsigned i = 0; i < 3; ++i) {
@@ -215,17 +254,43 @@ void SIInsertWaits::pushInstruction(MachineInstr &MI) {
}
// If we don't increase anything then that's it
- if (Sum == 0)
+ if (Sum == 0) {
+ LastOpcodeType = OTHER;
return;
+ }
+
+ if (TRI->ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
+ // or SMEM clause, respectively.
+ //
+ // The temporary workaround is to break the clauses with S_NOP.
+ //
+ // The proper solution would be to allocate registers such that all source
+ // and destination registers don't overlap, e.g. this is illegal:
+ // r0 = load r2
+ // r2 = load r0
+ if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) ||
+ (LastOpcodeType == VMEM && Increment.Named.VM)) {
+ // Insert a NOP to break the clause.
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
+ .addImm(0);
+ LastInstWritesM0 = false;
+ }
+
+ if (TII->isSMRD(I->getOpcode()))
+ LastOpcodeType = SMEM;
+ else if (Increment.Named.VM)
+ LastOpcodeType = VMEM;
+ }
// Remember which export instructions we have seen
if (Increment.Named.EXP) {
- ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
+ ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
}
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- MachineOperand &Op = MI.getOperand(i);
+ MachineOperand &Op = I->getOperand(i);
if (!isOpRelevant(Op))
continue;
@@ -302,6 +367,8 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
((Counts.Named.EXP & 0x7) << 4) |
((Counts.Named.LGKM & 0x7) << 8));
+ LastOpcodeType = OTHER;
+ LastInstWritesM0 = false;
return true;
}
@@ -343,6 +410,30 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
return Result;
}
+void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ if (TRI->ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return;
+
+ // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
+ if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
+ LastInstWritesM0 = false;
+ return;
+ }
+
+ // Set whether this instruction sets M0
+ LastInstWritesM0 = false;
+
+ unsigned NumOperands = I->getNumOperands();
+ for (unsigned i = 0; i < NumOperands; i++) {
+ const MachineOperand &Op = I->getOperand(i);
+
+ if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
+ LastInstWritesM0 = true;
+ }
+}
+
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@@ -356,6 +447,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
WaitedOn = ZeroCounts;
LastIssued = ZeroCounts;
+ LastOpcodeType = OTHER;
+ LastInstWritesM0 = false;
memset(&UsedRegs, 0, sizeof(UsedRegs));
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
@@ -367,8 +460,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
- Changes |= insertWait(MBB, I, handleOperands(*I));
- pushInstruction(*I);
+ // Wait for everything before a barrier.
+ if (I->getOpcode() == AMDGPU::S_BARRIER)
+ Changes |= insertWait(MBB, I, LastIssued);
+ else
+ Changes |= insertWait(MBB, I, handleOperands(*I));
+
+ pushInstruction(MBB, I);
+ handleSendMsg(MBB, I);
}
// Wait for everything at the end of the MBB
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 10e0a3f..c90c741 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -17,65 +17,109 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
field bits<1> VM_CNT = 0;
field bits<1> EXP_CNT = 0;
field bits<1> LGKM_CNT = 0;
- field bits<1> MIMG = 0;
- field bits<1> SMRD = 0;
+
+ field bits<1> SALU = 0;
+ field bits<1> VALU = 0;
+
+ field bits<1> SOP1 = 0;
+ field bits<1> SOP2 = 0;
+ field bits<1> SOPC = 0;
+ field bits<1> SOPK = 0;
+ field bits<1> SOPP = 0;
+
field bits<1> VOP1 = 0;
field bits<1> VOP2 = 0;
field bits<1> VOP3 = 0;
field bits<1> VOPC = 0;
- field bits<1> SALU = 0;
+
field bits<1> MUBUF = 0;
field bits<1> MTBUF = 0;
+ field bits<1> SMRD = 0;
+ field bits<1> DS = 0;
+ field bits<1> MIMG = 0;
field bits<1> FLAT = 0;
+ field bits<1> WQM = 0;
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = VM_CNT;
let TSFlags{1} = EXP_CNT;
let TSFlags{2} = LGKM_CNT;
- let TSFlags{3} = MIMG;
- let TSFlags{4} = SMRD;
- let TSFlags{5} = VOP1;
- let TSFlags{6} = VOP2;
- let TSFlags{7} = VOP3;
- let TSFlags{8} = VOPC;
- let TSFlags{9} = SALU;
- let TSFlags{10} = MUBUF;
- let TSFlags{11} = MTBUF;
- let TSFlags{12} = FLAT;
+
+ let TSFlags{3} = SALU;
+ let TSFlags{4} = VALU;
+
+ let TSFlags{5} = SOP1;
+ let TSFlags{6} = SOP2;
+ let TSFlags{7} = SOPC;
+ let TSFlags{8} = SOPK;
+ let TSFlags{9} = SOPP;
+
+ let TSFlags{10} = VOP1;
+ let TSFlags{11} = VOP2;
+ let TSFlags{12} = VOP3;
+ let TSFlags{13} = VOPC;
+
+ let TSFlags{14} = MUBUF;
+ let TSFlags{15} = MTBUF;
+ let TSFlags{16} = SMRD;
+ let TSFlags{17} = DS;
+ let TSFlags{18} = MIMG;
+ let TSFlags{19} = FLAT;
+ let TSFlags{20} = WQM;
// Most instructions require adjustments after selection to satisfy
// operand requirements.
let hasPostISelHook = 1;
+ let SchedRW = [Write32Bit];
}
class Enc32 {
-
field bits<32> Inst;
int Size = 4;
}
class Enc64 {
-
field bits<64> Inst;
int Size = 8;
}
-class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> :
+let Uses = [EXEC] in {
+
+class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI <outs, ins, asm, pattern> {
+
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
+ let VALU = 1;
+}
+
+class VOPCCommon <dag ins, string asm, list<dag> pattern> :
+ VOPAnyCommon <(outs VCCReg:$dst), ins, asm, pattern> {
+
+ let DisableEncoding = "$dst";
+ let VOPC = 1;
+ let Size = 4;
+}
+
+class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> :
+ VOPAnyCommon <outs, ins, asm, pattern> {
+
let VOP1 = 1;
+ let Size = 4;
+}
+
+class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> :
+ VOPAnyCommon <outs, ins, asm, pattern> {
+
+ let VOP2 = 1;
+ let Size = 4;
}
class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> {
+ VOPAnyCommon <outs, ins, asm, pattern> {
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
// Using complex patterns gives VOP3 patterns a very high complexity rating,
// but standalone patterns are almost always prefered, so we need to adjust the
// priority lower. The goal is to use a high number to reduce complexity to
@@ -83,63 +127,58 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
let AddedComplexity = -1000;
let VOP3 = 1;
-
int Size = 8;
- let Uses = [EXEC];
}
+} // End Uses = [EXEC]
+
//===----------------------------------------------------------------------===//
// Scalar operations
//===----------------------------------------------------------------------===//
class SOP1e <bits<8> op> : Enc32 {
+ bits<7> sdst;
+ bits<8> ssrc0;
- bits<7> SDST;
- bits<8> SSRC0;
-
- let Inst{7-0} = SSRC0;
+ let Inst{7-0} = ssrc0;
let Inst{15-8} = op;
- let Inst{22-16} = SDST;
+ let Inst{22-16} = sdst;
let Inst{31-23} = 0x17d; //encoding;
}
class SOP2e <bits<7> op> : Enc32 {
+ bits<7> sdst;
+ bits<8> ssrc0;
+ bits<8> ssrc1;
- bits<7> SDST;
- bits<8> SSRC0;
- bits<8> SSRC1;
-
- let Inst{7-0} = SSRC0;
- let Inst{15-8} = SSRC1;
- let Inst{22-16} = SDST;
+ let Inst{7-0} = ssrc0;
+ let Inst{15-8} = ssrc1;
+ let Inst{22-16} = sdst;
let Inst{29-23} = op;
let Inst{31-30} = 0x2; // encoding
}
class SOPCe <bits<7> op> : Enc32 {
+ bits<8> ssrc0;
+ bits<8> ssrc1;
- bits<8> SSRC0;
- bits<8> SSRC1;
-
- let Inst{7-0} = SSRC0;
- let Inst{15-8} = SSRC1;
+ let Inst{7-0} = ssrc0;
+ let Inst{15-8} = ssrc1;
let Inst{22-16} = op;
let Inst{31-23} = 0x17e;
}
class SOPKe <bits<5> op> : Enc32 {
+ bits <7> sdst;
+ bits <16> simm16;
- bits <7> SDST;
- bits <16> SIMM16;
-
- let Inst{15-0} = SIMM16;
- let Inst{22-16} = SDST;
+ let Inst{15-0} = simm16;
+ let Inst{22-16} = sdst;
let Inst{27-23} = op;
let Inst{31-28} = 0xb; //encoding
}
class SOPPe <bits<7> op> : Enc32 {
-
bits <16> simm16;
let Inst{15-0} = simm16;
@@ -148,35 +187,36 @@ class SOPPe <bits<7> op> : Enc32 {
}
class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
+ bits<7> sdst;
+ bits<7> sbase;
+ bits<8> offset;
- bits<7> SDST;
- bits<7> SBASE;
- bits<8> OFFSET;
-
- let Inst{7-0} = OFFSET;
+ let Inst{7-0} = offset;
let Inst{8} = imm;
- let Inst{14-9} = SBASE{6-1};
- let Inst{21-15} = SDST;
+ let Inst{14-9} = sbase{6-1};
+ let Inst{21-15} = sdst;
let Inst{26-22} = op;
let Inst{31-27} = 0x18; //encoding
}
-class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern>, SOP1e <op> {
-
+let SchedRW = [WriteSALU] in {
+class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI<outs, ins, asm, pattern> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
+ let SOP1 = 1;
}
-class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern>, SOP2e<op> {
+class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
+ let SOP2 = 1;
let UseNamedOperandTable = 1;
}
@@ -189,17 +229,19 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
+ let SOPC = 1;
let UseNamedOperandTable = 1;
}
-class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins , asm, pattern>, SOPKe<op> {
+class SOPK <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins , asm, pattern> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
+ let SOPK = 1;
let UseNamedOperandTable = 1;
}
@@ -210,12 +252,14 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let isCodeGenOnly = 0;
let SALU = 1;
+ let SOPP = 1;
let UseNamedOperandTable = 1;
}
+} // let SchedRW = [WriteSALU]
+
class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI<outs, ins, asm, pattern> {
@@ -225,6 +269,7 @@ class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 1;
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
+ let SchedRW = [WriteSMEM];
}
//===----------------------------------------------------------------------===//
@@ -232,32 +277,44 @@ class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
//===----------------------------------------------------------------------===//
class VOP1e <bits<8> op> : Enc32 {
+ bits<8> vdst;
+ bits<9> src0;
- bits<8> VDST;
- bits<9> SRC0;
-
- let Inst{8-0} = SRC0;
+ let Inst{8-0} = src0;
let Inst{16-9} = op;
- let Inst{24-17} = VDST;
+ let Inst{24-17} = vdst;
let Inst{31-25} = 0x3f; //encoding
}
class VOP2e <bits<6> op> : Enc32 {
+ bits<8> vdst;
+ bits<9> src0;
+ bits<8> src1;
- bits<8> VDST;
- bits<9> SRC0;
- bits<8> VSRC1;
-
- let Inst{8-0} = SRC0;
- let Inst{16-9} = VSRC1;
- let Inst{24-17} = VDST;
+ let Inst{8-0} = src0;
+ let Inst{16-9} = src1;
+ let Inst{24-17} = vdst;
let Inst{30-25} = op;
let Inst{31} = 0x0; //encoding
}
-class VOP3e <bits<9> op> : Enc64 {
+class VOP2_MADKe <bits<6> op> : Enc64 {
+
+ bits<8> vdst;
+ bits<9> src0;
+ bits<8> vsrc1;
+ bits<32> src2;
- bits<8> dst;
+ let Inst{8-0} = src0;
+ let Inst{16-9} = vsrc1;
+ let Inst{24-17} = vdst;
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0; // encoding
+ let Inst{63-32} = src2;
+}
+
+class VOP3e <bits<9> op> : Enc64 {
+ bits<8> vdst;
bits<2> src0_modifiers;
bits<9> src0;
bits<2> src1_modifiers;
@@ -267,7 +324,7 @@ class VOP3e <bits<9> op> : Enc64 {
bits<1> clamp;
bits<2> omod;
- let Inst{7-0} = dst;
+ let Inst{7-0} = vdst;
let Inst{8} = src0_modifiers{1};
let Inst{9} = src1_modifiers{1};
let Inst{10} = src2_modifiers{1};
@@ -284,8 +341,7 @@ class VOP3e <bits<9> op> : Enc64 {
}
class VOP3be <bits<9> op> : Enc64 {
-
- bits<8> dst;
+ bits<8> vdst;
bits<2> src0_modifiers;
bits<9> src0;
bits<2> src1_modifiers;
@@ -295,7 +351,7 @@ class VOP3be <bits<9> op> : Enc64 {
bits<7> sdst;
bits<2> omod;
- let Inst{7-0} = dst;
+ let Inst{7-0} = vdst;
let Inst{14-8} = sdst;
let Inst{25-17} = op;
let Inst{31-26} = 0x34; //encoding
@@ -309,33 +365,30 @@ class VOP3be <bits<9> op> : Enc64 {
}
class VOPCe <bits<8> op> : Enc32 {
+ bits<9> src0;
+ bits<8> vsrc1;
- bits<9> SRC0;
- bits<8> VSRC1;
-
- let Inst{8-0} = SRC0;
- let Inst{16-9} = VSRC1;
+ let Inst{8-0} = src0;
+ let Inst{16-9} = vsrc1;
let Inst{24-17} = op;
let Inst{31-25} = 0x3e;
}
class VINTRPe <bits<2> op> : Enc32 {
+ bits<8> vdst;
+ bits<8> vsrc;
+ bits<2> attrchan;
+ bits<6> attr;
- bits<8> VDST;
- bits<8> VSRC;
- bits<2> ATTRCHAN;
- bits<6> ATTR;
-
- let Inst{7-0} = VSRC;
- let Inst{9-8} = ATTRCHAN;
- let Inst{15-10} = ATTR;
+ let Inst{7-0} = vsrc;
+ let Inst{9-8} = attrchan;
+ let Inst{15-10} = attr;
let Inst{17-16} = op;
- let Inst{25-18} = VDST;
+ let Inst{25-18} = vdst;
let Inst{31-26} = 0x32; // encoding
}
class DSe <bits<8> op> : Enc64 {
-
bits<8> vdst;
bits<1> gds;
bits<8> addr;
@@ -356,7 +409,6 @@ class DSe <bits<8> op> : Enc64 {
}
class MUBUFe <bits<7> op> : Enc64 {
-
bits<12> offset;
bits<1> offen;
bits<1> idxen;
@@ -387,67 +439,65 @@ class MUBUFe <bits<7> op> : Enc64 {
}
class MTBUFe <bits<3> op> : Enc64 {
+ bits<8> vdata;
+ bits<12> offset;
+ bits<1> offen;
+ bits<1> idxen;
+ bits<1> glc;
+ bits<1> addr64;
+ bits<4> dfmt;
+ bits<3> nfmt;
+ bits<8> vaddr;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
- bits<8> VDATA;
- bits<12> OFFSET;
- bits<1> OFFEN;
- bits<1> IDXEN;
- bits<1> GLC;
- bits<1> ADDR64;
- bits<4> DFMT;
- bits<3> NFMT;
- bits<8> VADDR;
- bits<7> SRSRC;
- bits<1> SLC;
- bits<1> TFE;
- bits<8> SOFFSET;
-
- let Inst{11-0} = OFFSET;
- let Inst{12} = OFFEN;
- let Inst{13} = IDXEN;
- let Inst{14} = GLC;
- let Inst{15} = ADDR64;
+ let Inst{11-0} = offset;
+ let Inst{12} = offen;
+ let Inst{13} = idxen;
+ let Inst{14} = glc;
+ let Inst{15} = addr64;
let Inst{18-16} = op;
- let Inst{22-19} = DFMT;
- let Inst{25-23} = NFMT;
+ let Inst{22-19} = dfmt;
+ let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
- let Inst{39-32} = VADDR;
- let Inst{47-40} = VDATA;
- let Inst{52-48} = SRSRC{6-2};
- let Inst{54} = SLC;
- let Inst{55} = TFE;
- let Inst{63-56} = SOFFSET;
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = vdata;
+ let Inst{52-48} = srsrc{6-2};
+ let Inst{54} = slc;
+ let Inst{55} = tfe;
+ let Inst{63-56} = soffset;
}
class MIMGe <bits<7> op> : Enc64 {
-
- bits<8> VDATA;
- bits<4> DMASK;
- bits<1> UNORM;
- bits<1> GLC;
- bits<1> DA;
- bits<1> R128;
- bits<1> TFE;
- bits<1> LWE;
- bits<1> SLC;
- bits<8> VADDR;
- bits<7> SRSRC;
- bits<7> SSAMP;
-
- let Inst{11-8} = DMASK;
- let Inst{12} = UNORM;
- let Inst{13} = GLC;
- let Inst{14} = DA;
- let Inst{15} = R128;
- let Inst{16} = TFE;
- let Inst{17} = LWE;
+ bits<8> vdata;
+ bits<4> dmask;
+ bits<1> unorm;
+ bits<1> glc;
+ bits<1> da;
+ bits<1> r128;
+ bits<1> tfe;
+ bits<1> lwe;
+ bits<1> slc;
+ bits<8> vaddr;
+ bits<7> srsrc;
+ bits<7> ssamp;
+
+ let Inst{11-8} = dmask;
+ let Inst{12} = unorm;
+ let Inst{13} = glc;
+ let Inst{14} = da;
+ let Inst{15} = r128;
+ let Inst{16} = tfe;
+ let Inst{17} = lwe;
let Inst{24-18} = op;
- let Inst{25} = SLC;
+ let Inst{25} = slc;
let Inst{31-26} = 0x3c;
- let Inst{39-32} = VADDR;
- let Inst{47-40} = VDATA;
- let Inst{52-48} = SRSRC{6-2};
- let Inst{57-53} = SSAMP{6-2};
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = vdata;
+ let Inst{52-48} = srsrc{6-2};
+ let Inst{57-53} = ssamp{6-2};
}
class FLATe<bits<7> op> : Enc64 {
@@ -471,26 +521,26 @@ class FLATe<bits<7> op> : Enc64 {
}
class EXPe : Enc64 {
- bits<4> EN;
- bits<6> TGT;
- bits<1> COMPR;
- bits<1> DONE;
- bits<1> VM;
- bits<8> VSRC0;
- bits<8> VSRC1;
- bits<8> VSRC2;
- bits<8> VSRC3;
-
- let Inst{3-0} = EN;
- let Inst{9-4} = TGT;
- let Inst{10} = COMPR;
- let Inst{11} = DONE;
- let Inst{12} = VM;
+ bits<4> en;
+ bits<6> tgt;
+ bits<1> compr;
+ bits<1> done;
+ bits<1> vm;
+ bits<8> vsrc0;
+ bits<8> vsrc1;
+ bits<8> vsrc2;
+ bits<8> vsrc3;
+
+ let Inst{3-0} = en;
+ let Inst{9-4} = tgt;
+ let Inst{10} = compr;
+ let Inst{11} = done;
+ let Inst{12} = vm;
let Inst{31-26} = 0x3e;
- let Inst{39-32} = VSRC0;
- let Inst{47-40} = VSRC1;
- let Inst{55-48} = VSRC2;
- let Inst{63-56} = VSRC3;
+ let Inst{39-32} = vsrc0;
+ let Inst{47-40} = vsrc1;
+ let Inst{55-48} = vsrc2;
+ let Inst{63-56} = vsrc3;
}
let Uses = [EXEC] in {
@@ -500,34 +550,13 @@ class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
VOP1e<op>;
class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern>, VOP2e<op> {
-
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let VOP2 = 1;
-}
-
-class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
- VOP3Common <outs, ins, asm, pattern>, VOP3e<op>;
-
-class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
- VOP3Common <outs, ins, asm, pattern>, VOP3be<op>;
+ VOP2Common <outs, ins, asm, pattern>, VOP2e<op>;
class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
- InstSI <(outs VCCReg:$dst), ins, asm, pattern>, VOPCe <op> {
-
- let DisableEncoding = "$dst";
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let VOPC = 1;
-}
+ VOPCCommon <ins, asm, pattern>, VOPCe <op>;
-class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern>, VINTRPe<op> {
+class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
@@ -541,15 +570,18 @@ class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Uses = [EXEC] in {
-class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> , DSe<op> {
+class DS <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
let LGKM_CNT = 1;
+ let DS = 1;
let UseNamedOperandTable = 1;
+ let DisableEncoding = "$m0";
+ let SchedRW = [WriteLDS];
}
-class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern>, MUBUFe <op> {
+class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI<outs, ins, asm, pattern> {
let VM_CNT = 1;
let EXP_CNT = 1;
@@ -557,6 +589,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
+ let SchedRW = [WriteVMEM];
}
class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -566,8 +599,9 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
let EXP_CNT = 1;
let MTBUF = 1;
- let neverHasSideEffects = 1;
+ let hasSideEffects = 0;
let UseNamedOperandTable = 1;
+ let SchedRW = [WriteVMEM];
}
class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -596,5 +630,4 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
}
-
} // End Uses = [EXEC]
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 8343362..4f1e5ad 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -28,8 +28,7 @@
using namespace llvm;
SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
- : AMDGPUInstrInfo(st),
- RI(st) { }
+ : AMDGPUInstrInfo(st), RI(st) {}
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -326,26 +325,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Opcode;
const int16_t *SubIndices;
- if (AMDGPU::M0 == DestReg) {
- // Check if M0 isn't already set to this value
- for (MachineBasicBlock::reverse_iterator E = MBB.rend(),
- I = MachineBasicBlock::reverse_iterator(MI); I != E; ++I) {
-
- if (!I->definesRegister(AMDGPU::M0))
- continue;
-
- unsigned Opc = I->getOpcode();
- if (Opc != TargetOpcode::COPY && Opc != AMDGPU::S_MOV_B32)
- break;
-
- if (!I->readsRegister(SrcReg))
- break;
-
- // The copy isn't necessary
- return;
- }
- }
-
if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
@@ -353,6 +332,21 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
} else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
+ if (DestReg == AMDGPU::VCC) {
+ if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ // FIXME: Hack until VReg_1 removed.
+ assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+
+ return;
+ }
+
assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
@@ -373,8 +367,8 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opcode = AMDGPU::S_MOV_B32;
SubIndices = Sub0_15;
- } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
- assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
+ } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
+ assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
AMDGPU::SReg_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
@@ -428,27 +422,30 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
int NewOpc;
// Try to map original to commuted opcode
- if ((NewOpc = AMDGPU::getCommuteRev(Opcode)) != -1)
+ NewOpc = AMDGPU::getCommuteRev(Opcode);
+ // Check if the commuted (REV) opcode exists on the target.
+ if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1)
return NewOpc;
// Try to map commuted to original opcode
- if ((NewOpc = AMDGPU::getCommuteOrig(Opcode)) != -1)
+ NewOpc = AMDGPU::getCommuteOrig(Opcode);
+ // Check if the original (non-REV) opcode exists on the target.
+ if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1)
return NewOpc;
return Opcode;
}
-static bool shouldTryToSpillVGPRs(MachineFunction *MF) {
-
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const TargetMachine &TM = MF->getTarget();
-
- // FIXME: Even though it can cause problems, we need to enable
- // spilling at -O0, since the fast register allocator always
- // spills registers that are live at the end of blocks.
- return MFI->getShaderType() == ShaderType::COMPUTE &&
- TM.getOptLevel() == CodeGenOpt::None;
+unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
+ if (DstRC->getSize() == 4) {
+ return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+ } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
+ return AMDGPU::S_MOV_B64;
+ } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
+ return AMDGPU::V_MOV_B64_PSEUDO;
+ }
+ return AMDGPU::COPY;
}
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -458,6 +455,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
int Opcode = -1;
@@ -473,7 +471,9 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
}
- } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+ } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
+ MFI->setHasSpilledVGPRs();
+
switch(RC->getSize() * 8) {
case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
@@ -488,12 +488,16 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
FrameInfo->setObjectAlignment(FrameIndex, 4);
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg)
- .addFrameIndex(FrameIndex);
+ .addFrameIndex(FrameIndex)
+ // Place-holder registers, these will be filled in by
+ // SIPrepareScratchRegs.
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+ .addReg(AMDGPU::SGPR0, RegState::Undef);
} else {
LLVMContext &Ctx = MF->getFunction()->getContext();
Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
" spill register");
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0)
+ BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
.addReg(SrcReg);
}
}
@@ -504,6 +508,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
int Opcode = -1;
@@ -516,7 +521,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
}
- } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+ } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
switch(RC->getSize() * 8) {
case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
@@ -530,13 +535,17 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (Opcode != -1) {
FrameInfo->setObjectAlignment(FrameIndex, 4);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
- .addFrameIndex(FrameIndex);
+ .addFrameIndex(FrameIndex)
+ // Place-holder registers, these will be filled in by
+ // SIPrepareScratchRegs.
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+ .addReg(AMDGPU::SGPR0, RegState::Undef);
+
} else {
LLVMContext &Ctx = MF->getFunction()->getContext();
Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
" restore register");
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
- .addReg(AMDGPU::VGPR0);
+ BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
}
}
@@ -548,7 +557,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
unsigned Size) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &ST = MF->getTarget().getSubtarget<AMDGPUSubtarget>();
+ const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
DebugLoc DL = MBB.findDebugLoc(MI);
@@ -561,7 +570,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Insert = Entry.front();
DebugLoc DL = Insert->getDebugLoc();
- TIDReg = RI.findUnusedVGPR(MF->getRegInfo());
+ TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
if (TIDReg == AMDGPU::NoRegister)
return TIDReg;
@@ -616,7 +625,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
.addImm(-1)
.addImm(0);
- BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32),
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
TIDReg)
.addImm(-1)
.addReg(TIDReg);
@@ -682,12 +691,42 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
// This is just a placeholder for register allocation.
MI->eraseFromParent();
break;
+
+ case AMDGPU::V_MOV_B64_PSEUDO: {
+ unsigned Dst = MI->getOperand(0).getReg();
+ unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+ unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+
+ const MachineOperand &SrcOp = MI->getOperand(1);
+ // FIXME: Will this work for 64-bit floating point immediates?
+ assert(!SrcOp.isFPImm());
+ if (SrcOp.isImm()) {
+ APInt Imm(64, SrcOp.getImm());
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+ .addImm(Imm.getLoBits(32).getZExtValue())
+ .addReg(Dst, RegState::Implicit);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+ .addImm(Imm.getHiBits(32).getZExtValue())
+ .addReg(Dst, RegState::Implicit);
+ } else {
+ assert(SrcOp.isReg());
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
+ .addReg(Dst, RegState::Implicit);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
+ .addReg(Dst, RegState::Implicit);
+ }
+ MI->eraseFromParent();
+ break;
+ }
}
return true;
}
MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
bool NewMI) const {
+
if (MI->getNumOperands() < 3)
return nullptr;
@@ -709,12 +748,13 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
// Make sure it's legal to commute operands for VOP2.
if (isVOP2(MI->getOpcode()) &&
(!isOperandLegal(MI, Src0Idx, &Src1) ||
- !isOperandLegal(MI, Src1Idx, &Src0)))
+ !isOperandLegal(MI, Src1Idx, &Src0))) {
return nullptr;
+ }
if (!Src1.isReg()) {
- // Allow commuting instructions with Imm or FPImm operands.
- if (NewMI || (!Src1.isImm() && !Src1.isFPImm()) ||
+ // Allow commuting instructions with Imm operands.
+ if (NewMI || !Src1.isImm() ||
(!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
return nullptr;
}
@@ -742,8 +782,6 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
unsigned SubReg = Src0.getSubReg();
if (Src1.isImm())
Src0.ChangeToImmediate(Src1.getImm());
- else if (Src1.isFPImm())
- Src0.ChangeToFPImmediate(Src1.getFPImm());
else
llvm_unreachable("Should only have immediates");
@@ -821,6 +859,131 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
return RC != &AMDGPU::EXECRegRegClass;
}
+static void removeModOperands(MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
+ AMDGPU::OpName::src0_modifiers);
+ int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
+ AMDGPU::OpName::src1_modifiers);
+ int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
+ AMDGPU::OpName::src2_modifiers);
+
+ MI.RemoveOperand(Src2ModIdx);
+ MI.RemoveOperand(Src1ModIdx);
+ MI.RemoveOperand(Src0ModIdx);
+}
+
+bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+ unsigned Reg, MachineRegisterInfo *MRI) const {
+ if (!MRI->hasOneNonDBGUse(Reg))
+ return false;
+
+ unsigned Opc = UseMI->getOpcode();
+ if (Opc == AMDGPU::V_MAD_F32) {
+ // Don't fold if we are using source modifiers. The new VOP2 instructions
+ // don't have them.
+ if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
+ hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
+ hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
+ return false;
+ }
+
+ MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
+ MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);
+
+ // Multiplied part is the constant: Use v_madmk_f32
+ // We should only expect these to be on src0 due to canonicalizations.
+ if (Src0->isReg() && Src0->getReg() == Reg) {
+ if (!Src1->isReg() ||
+ (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+ return false;
+
+ if (!Src2->isReg() ||
+ (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))))
+ return false;
+
+ // We need to do some weird looking operand shuffling since the madmk
+ // operands are out of the normal expected order with the multiplied
+ // constant as the last operand.
+ //
+ // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1
+ // src0 -> src2 K
+ // src1 -> src0
+ // src2 -> src1
+
+ const int64_t Imm = DefMI->getOperand(1).getImm();
+
+ // FIXME: This would be a lot easier if we could return a new instruction
+ // instead of having to modify in place.
+
+ // Remove these first since they are at the end.
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ AMDGPU::OpName::omod));
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ AMDGPU::OpName::clamp));
+
+ unsigned Src1Reg = Src1->getReg();
+ unsigned Src1SubReg = Src1->getSubReg();
+ unsigned Src2Reg = Src2->getReg();
+ unsigned Src2SubReg = Src2->getSubReg();
+ Src0->setReg(Src1Reg);
+ Src0->setSubReg(Src1SubReg);
+ Src1->setReg(Src2Reg);
+ Src1->setSubReg(Src2SubReg);
+
+ Src2->ChangeToImmediate(Imm);
+
+ removeModOperands(*UseMI);
+ UseMI->setDesc(get(AMDGPU::V_MADMK_F32));
+
+ bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+ if (DeleteDef)
+ DefMI->eraseFromParent();
+
+ return true;
+ }
+
+ // Added part is the constant: Use v_madak_f32
+ if (Src2->isReg() && Src2->getReg() == Reg) {
+ // Not allowed to use constant bus for another operand.
+ // We can however allow an inline immediate as src0.
+ if (!Src0->isImm() &&
+ (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
+ return false;
+
+ if (!Src1->isReg() ||
+ (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+ return false;
+
+ const int64_t Imm = DefMI->getOperand(1).getImm();
+
+ // FIXME: This would be a lot easier if we could return a new instruction
+ // instead of having to modify in place.
+
+ // Remove these first since they are at the end.
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ AMDGPU::OpName::omod));
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ AMDGPU::OpName::clamp));
+
+ Src2->ChangeToImmediate(Imm);
+
+ // These come before src2.
+ removeModOperands(*UseMI);
+ UseMI->setDesc(get(AMDGPU::V_MADAK_F32));
+
+ bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+ if (DeleteDef)
+ DefMI->eraseFromParent();
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
bool
SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
AliasAnalysis *AA) const {
@@ -915,63 +1078,24 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
return false;
}
-namespace llvm {
-namespace AMDGPU {
-// Helper function generated by tablegen. We are wrapping this with
-// an SIInstrInfo function that returns bool rather than int.
-int isDS(uint16_t Opcode);
-}
-}
-
-bool SIInstrInfo::isDS(uint16_t Opcode) const {
- return ::AMDGPU::isDS(Opcode) != -1;
-}
-
-bool SIInstrInfo::isMIMG(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::MIMG;
-}
-
-bool SIInstrInfo::isSMRD(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::SMRD;
-}
-
-bool SIInstrInfo::isMUBUF(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
-}
-
-bool SIInstrInfo::isMTBUF(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
-}
-
-bool SIInstrInfo::isFLAT(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::FLAT;
-}
-
-bool SIInstrInfo::isVOP1(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::VOP1;
-}
-
-bool SIInstrInfo::isVOP2(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::VOP2;
-}
-
-bool SIInstrInfo::isVOP3(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::VOP3;
-}
-
-bool SIInstrInfo::isVOPC(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::VOPC;
-}
-
-bool SIInstrInfo::isSALUInstr(const MachineInstr &MI) const {
- return get(MI.getOpcode()).TSFlags & SIInstrFlags::SALU;
-}
-
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
- int32_t Val = Imm.getSExtValue();
- if (Val >= -16 && Val <= 64)
+ int64_t SVal = Imm.getSExtValue();
+ if (SVal >= -16 && SVal <= 64)
return true;
+ if (Imm.getBitWidth() == 64) {
+ uint64_t Val = Imm.getZExtValue();
+ return (DoubleToBits(0.0) == Val) ||
+ (DoubleToBits(1.0) == Val) ||
+ (DoubleToBits(-1.0) == Val) ||
+ (DoubleToBits(0.5) == Val) ||
+ (DoubleToBits(-0.5) == Val) ||
+ (DoubleToBits(2.0) == Val) ||
+ (DoubleToBits(-2.0) == Val) ||
+ (DoubleToBits(4.0) == Val) ||
+ (DoubleToBits(-4.0) == Val);
+ }
+
// The actual type of the operand does not seem to matter as long
// as the bits match one of the inline immediate values. For example:
//
@@ -980,32 +1104,38 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
//
// 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
// floating-point, so it is a legal inline immediate.
-
- return (APInt::floatToBits(0.0f) == Imm) ||
- (APInt::floatToBits(1.0f) == Imm) ||
- (APInt::floatToBits(-1.0f) == Imm) ||
- (APInt::floatToBits(0.5f) == Imm) ||
- (APInt::floatToBits(-0.5f) == Imm) ||
- (APInt::floatToBits(2.0f) == Imm) ||
- (APInt::floatToBits(-2.0f) == Imm) ||
- (APInt::floatToBits(4.0f) == Imm) ||
- (APInt::floatToBits(-4.0f) == Imm);
-}
-
-bool SIInstrInfo::isInlineConstant(const MachineOperand &MO) const {
- if (MO.isImm())
- return isInlineConstant(APInt(32, MO.getImm(), true));
-
- if (MO.isFPImm()) {
- APFloat FpImm = MO.getFPImm()->getValueAPF();
- return isInlineConstant(FpImm.bitcastToAPInt());
+ uint32_t Val = Imm.getZExtValue();
+
+ return (FloatToBits(0.0f) == Val) ||
+ (FloatToBits(1.0f) == Val) ||
+ (FloatToBits(-1.0f) == Val) ||
+ (FloatToBits(0.5f) == Val) ||
+ (FloatToBits(-0.5f) == Val) ||
+ (FloatToBits(2.0f) == Val) ||
+ (FloatToBits(-2.0f) == Val) ||
+ (FloatToBits(4.0f) == Val) ||
+ (FloatToBits(-4.0f) == Val);
+}
+
+bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
+ unsigned OpSize) const {
+ if (MO.isImm()) {
+ // MachineOperand provides no way to tell the true operand size, since it
+ // only records a 64-bit value. We need to know the size to determine if a
+ // 32-bit floating point immediate bit pattern is legal for an integer
+ // immediate. It would be for any 32-bit integer operand, but would not be
+ // for a 64-bit one.
+
+ unsigned BitSize = 8 * OpSize;
+ return isInlineConstant(APInt(BitSize, MO.getImm(), true));
}
return false;
}
-bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const {
- return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
+bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
+ unsigned OpSize) const {
+ return MO.isImm() && !isInlineConstant(MO, OpSize);
}
static bool compareMachineOp(const MachineOperand &Op0,
@@ -1018,8 +1148,6 @@ static bool compareMachineOp(const MachineOperand &Op0,
return Op0.getReg() == Op1.getReg();
case MachineOperand::MO_Immediate:
return Op0.getImm() == Op1.getImm();
- case MachineOperand::MO_FPImmediate:
- return Op0.getFPImm() == Op1.getFPImm();
default:
llvm_unreachable("Didn't expect to be comparing these operand types");
}
@@ -1029,7 +1157,7 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
const MachineOperand &MO) const {
const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
- assert(MO.isImm() || MO.isFPImm() || MO.isTargetIndex() || MO.isFI());
+ assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
return true;
@@ -1037,21 +1165,26 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
if (OpInfo.RegClass < 0)
return false;
- if (isLiteralConstant(MO))
- return RI.regClassCanUseLiteralConstant(OpInfo.RegClass);
+ unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
+ if (isLiteralConstant(MO, OpSize))
+ return RI.opCanUseLiteralConstant(OpInfo.OperandType);
- return RI.regClassCanUseInlineConstant(OpInfo.RegClass);
+ return RI.opCanUseInlineConstant(OpInfo.OperandType);
}
-bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) {
+bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) const {
switch (AS) {
case AMDGPUAS::GLOBAL_ADDRESS: {
// MUBUF instructions have a 12-bit offset in bytes.
return isUInt<12>(OffsetSize);
}
case AMDGPUAS::CONSTANT_ADDRESS: {
- // SMRD instructions have an 8-bit offset in dwords.
- return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
+ // SMRD instructions have an 8-bit offset in dwords on SI and
+ // a 20-bit offset in bytes on VI.
+ if (RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return isUInt<20>(OffsetSize);
+ else
+ return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
}
case AMDGPUAS::LOCAL_ADDRESS:
case AMDGPUAS::REGION_ADDRESS: {
@@ -1066,7 +1199,11 @@ bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) {
}
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
- return AMDGPU::getVOPe32(Opcode) != -1;
+ int Op32 = AMDGPU::getVOPe32(Opcode);
+ if (Op32 == -1)
+ return false;
+
+ return pseudoToMCOpcode(Op32) != -1;
}
bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
@@ -1084,9 +1221,10 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
}
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
- const MachineOperand &MO) const {
+ const MachineOperand &MO,
+ unsigned OpSize) const {
// Literal constants use the constant bus.
- if (isLiteralConstant(MO))
+ if (isLiteralConstant(MO, OpSize))
return true;
if (!MO.isReg() || !MO.isUse())
@@ -1132,21 +1270,35 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Make sure the register classes are correct
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
+ if (MI->getOperand(i).isFPImm()) {
+ ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
+ "all fp values to integers.";
+ return false;
+ }
+
+ int RegClass = Desc.OpInfo[i].RegClass;
+
switch (Desc.OpInfo[i].OperandType) {
- case MCOI::OPERAND_REGISTER: {
- if ((MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm()) &&
- !isImmOperandLegal(MI, i, MI->getOperand(i))) {
- ErrInfo = "Illegal immediate value for operand.";
- return false;
- }
+ case MCOI::OPERAND_REGISTER:
+ if (MI->getOperand(i).isImm()) {
+ ErrInfo = "Illegal immediate value for operand.";
+ return false;
+ }
+ break;
+ case AMDGPU::OPERAND_REG_IMM32:
+ break;
+ case AMDGPU::OPERAND_REG_INLINE_C:
+ if (isLiteralConstant(MI->getOperand(i),
+ RI.getRegClass(RegClass)->getSize())) {
+ ErrInfo = "Illegal immediate value for operand.";
+ return false;
}
break;
case MCOI::OPERAND_IMMEDIATE:
// Check if this operand is an immediate.
// FrameIndex operands will be replaced by immediates, so they are
// allowed.
- if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() &&
- !MI->getOperand(i).isFI()) {
+ if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
ErrInfo = "Expected immediate, but got non-immediate";
return false;
}
@@ -1158,7 +1310,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
if (!MI->getOperand(i).isReg())
continue;
- int RegClass = Desc.OpInfo[i].RegClass;
if (RegClass != -1) {
unsigned Reg = MI->getOperand(i).getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg))
@@ -1175,11 +1326,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Verify VOP*
if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
+ // Only look at the true operands. Only a real operand can use the constant
+ // bus, and we don't want to check pseudo-operands like the source modifier
+ // flags.
+ const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
unsigned ConstantBusCount = 0;
unsigned SGPRUsed = AMDGPU::NoRegister;
- for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (usesConstantBus(MRI, MO)) {
+ for (int OpIdx : OpIndices) {
+ if (OpIdx == -1)
+ break;
+ const MachineOperand &MO = MI->getOperand(OpIdx);
+ if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
if (MO.isReg()) {
if (MO.getReg() != SGPRUsed)
++ConstantBusCount;
@@ -1195,31 +1353,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
}
}
- // Verify SRC1 for VOP2 and VOPC
- if (Src1Idx != -1 && (isVOP2(Opcode) || isVOPC(Opcode))) {
- const MachineOperand &Src1 = MI->getOperand(Src1Idx);
- if (Src1.isImm() || Src1.isFPImm()) {
- ErrInfo = "VOP[2C] src1 cannot be an immediate.";
- return false;
- }
- }
-
- // Verify VOP3
- if (isVOP3(Opcode)) {
- if (Src0Idx != -1 && isLiteralConstant(MI->getOperand(Src0Idx))) {
- ErrInfo = "VOP3 src0 cannot be a literal constant.";
- return false;
- }
- if (Src1Idx != -1 && isLiteralConstant(MI->getOperand(Src1Idx))) {
- ErrInfo = "VOP3 src1 cannot be a literal constant.";
- return false;
- }
- if (Src2Idx != -1 && isLiteralConstant(MI->getOperand(Src2Idx))) {
- ErrInfo = "VOP3 src2 cannot be a literal constant.";
- return false;
- }
- }
-
// Verify misc. restrictions on specific instructions.
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
@@ -1287,7 +1420,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
- case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32;
+ case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
}
@@ -1302,8 +1435,13 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
const MCInstrDesc &Desc = get(MI.getOpcode());
if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
- Desc.OpInfo[OpNo].RegClass == -1)
- return MRI.getRegClass(MI.getOperand(OpNo).getReg());
+ Desc.OpInfo[OpNo].RegClass == -1) {
+ unsigned Reg = MI.getOperand(OpNo).getReg();
+
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return MRI.getRegClass(Reg);
+ return RI.getPhysRegClass(Reg);
+ }
unsigned RCID = Desc.OpInfo[OpNo].RegClass;
return RI.getRegClass(RCID);
@@ -1339,7 +1477,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
VRC = &AMDGPU::VReg_64RegClass;
else
- VRC = &AMDGPU::VReg_32RegClass;
+ VRC = &AMDGPU::VGPR_32RegClass;
unsigned Reg = MRI.createVirtualRegister(VRC);
DebugLoc DL = MBB->findDebugLoc(I);
@@ -1428,6 +1566,14 @@ unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
return Dst;
}
+// Change the order of operands from (0, 1, 2) to (0, 2, 1)
+void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
+ assert(Inst->getNumExplicitOperands() == 3);
+ MachineOperand Op1 = Inst->getOperand(1);
+ Inst->RemoveOperand(1);
+ Inst->addOperand(Op1);
+}
+
bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
const MachineOperand *MO) const {
const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
@@ -1438,14 +1584,16 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
if (!MO)
MO = &MI->getOperand(OpIdx);
- if (usesConstantBus(MRI, *MO)) {
+ if (isVALU(InstDesc.Opcode) &&
+ usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
unsigned SGPRUsed =
MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
if (i == OpIdx)
continue;
- if (usesConstantBus(MRI, MI->getOperand(i)) &&
- MI->getOperand(i).isReg() && MI->getOperand(i).getReg() != SGPRUsed) {
+ const MachineOperand &Op = MI->getOperand(i);
+ if (Op.isReg() && Op.getReg() != SGPRUsed &&
+ usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
return false;
}
}
@@ -1463,12 +1611,13 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
//
// s_sendmsg 0, s0 ; Operand defined as m0reg
// ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
+
return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
}
// Handle non-register types that are treated like immediates.
- assert(MO->isImm() || MO->isFPImm() || MO->isTargetIndex() || MO->isFI());
+ assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
if (!DefinedRC) {
// This operand expects an immediate.
@@ -1537,7 +1686,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// We can use one SGPR in each VOP3 instruction.
continue;
}
- } else if (!isLiteralConstant(MO)) {
+ } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) {
// If it is not a register and not a literal constant, then it must be
// an inline constant which is always legal.
continue;
@@ -1641,17 +1790,18 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// SRsrcPtrLo = srsrc:sub0
unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
- &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
+ &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
// SRsrcPtrHi = srsrc:sub1
unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
- &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
+ &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
// Create an empty resource descriptor
unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+ uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
// Zero64 = 0
BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
@@ -1661,12 +1811,12 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
SRsrcFormatLo)
- .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+ .addImm(RsrcDataFormat & 0xFFFFFFFF);
// SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
SRsrcFormatHi)
- .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+ .addImm(RsrcDataFormat >> 32);
// NewSRsrc = {Zero64, SRsrcFormat}
BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
@@ -1685,8 +1835,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
if (VAddr) {
// This is already an ADDR64 instruction so we need to add the pointer
// extracted from the resource descriptor to the current value of VAddr.
- NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
- NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+ NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// NewVaddrLo = SRsrcPtrLo + VAddr:sub0
BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
@@ -1709,9 +1859,6 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
- assert(SOffset->isImm() && SOffset->getImm() == 0 && "Legalizing MUBUF "
- "with non-zero soffset is not implemented");
- (void)SOffset;
// Create the new instruction.
unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
@@ -1722,6 +1869,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
.addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
// This will be replaced later
// with the new value of vaddr.
+ .addOperand(*SOffset)
.addOperand(*Offset);
MI->removeFromParent();
@@ -1764,27 +1912,30 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI,
getNamedOperand(*MI, AMDGPU::OpName::offset);
const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
+ // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
+ // on VI.
if (OffOp) {
+ bool isVI = RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ unsigned OffScale = isVI ? 1 : 4;
// Handle the _IMM variant
- unsigned LoOffset = OffOp->getImm();
- unsigned HiOffset = LoOffset + (HalfSize / 4);
+ unsigned LoOffset = OffOp->getImm() * OffScale;
+ unsigned HiOffset = LoOffset + HalfSize;
Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
.addOperand(*SBase)
- .addImm(LoOffset);
+ .addImm(LoOffset / OffScale);
- if (!isUInt<8>(HiOffset)) {
+ if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
unsigned OffsetSGPR =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
- .addImm(HiOffset << 2); // The immediate offset is in dwords,
- // but offset in register is in bytes.
+ .addImm(HiOffset); // The offset in register is in bytes.
Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
.addOperand(*SBase)
.addReg(OffsetSGPR);
} else {
Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
.addOperand(*SBase)
- .addImm(HiOffset);
+ .addImm(HiOffset / OffScale);
}
} else {
// Handle the _SGPR variant
@@ -1849,10 +2000,13 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
ImmOffset = 0;
} else {
assert(MI->getOperand(2).isImm());
- // SMRD instructions take a dword offsets and MUBUF instructions
- // take a byte offset.
- ImmOffset = MI->getOperand(2).getImm() << 2;
+ // SMRD instructions take a dword offset on SI and a byte offset on VI
+ // and MUBUF instructions always take a byte offset.
+ ImmOffset = MI->getOperand(2).getImm();
+ if (RI.ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+ ImmOffset <<= 2;
RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
if (isUInt<12>(ImmOffset)) {
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
RegOffset)
@@ -1870,13 +2024,14 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
.addImm(0);
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
- .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+ .addImm(RsrcDataFormat & 0xFFFFFFFF);
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
- .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+ .addImm(RsrcDataFormat >> 32);
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
.addReg(DWord0)
.addImm(AMDGPU::sub0)
@@ -1893,6 +2048,7 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false);
}
MI->getOperand(1).setReg(SRsrc);
+ MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0));
MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
const TargetRegisterClass *NewDstRC =
@@ -2001,6 +2157,43 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
continue;
}
+ case AMDGPU::S_LSHL_B32:
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_ASHR_I32:
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_LSHR_B32:
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_LSHL_B64:
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_LSHLREV_B64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_ASHR_I64:
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_ASHRREV_I64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_LSHR_B64:
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_LSHRREV_B64;
+ swapOperands(Inst);
+ }
+ break;
+
case AMDGPU::S_BFE_U64:
case AMDGPU::S_BFM_B64:
llvm_unreachable("Moving this op to VALU not implemented");
@@ -2107,7 +2300,7 @@ unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
}
const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
- return &AMDGPU::VReg_32RegClass;
+ return &AMDGPU::VGPR_32RegClass;
}
void SIInstrInfo::splitScalar64BitUnaryOp(
@@ -2237,7 +2430,7 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
MachineOperand &Dest = Inst->getOperand(0);
MachineOperand &Src = Inst->getOperand(1);
- const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32);
+ const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
const TargetRegisterClass *SrcRC = Src.isReg() ?
MRI.getRegClass(Src.getReg()) :
&AMDGPU::SGPR_32RegClass;
@@ -2419,7 +2612,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
unsigned ValueReg,
unsigned Address, unsigned OffsetReg) const {
const DebugLoc &DL = MBB->findDebugLoc(I);
- unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+ unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
getIndirectIndexBegin(*MBB->getParent()));
return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
@@ -2437,7 +2630,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead(
unsigned ValueReg,
unsigned Address, unsigned OffsetReg) const {
const DebugLoc &DL = MBB->findDebugLoc(I);
- unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+ unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
getIndirectIndexBegin(*MBB->getParent()));
return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC))
@@ -2459,7 +2652,7 @@ void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
for (int Index = Begin; Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_32RegClass.getRegister(Index));
+ Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index));
for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
@@ -2485,3 +2678,11 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
return &MI.getOperand(Idx);
}
+
+uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { // 64-bit value; callers split it into two 32-bit immediates
+ uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; // start from the shared default constant
+ if (ST.isAmdHsaOS())
+ RsrcDataFormat |= (1ULL << 56); // HSA targets only: also set bit 56 (meaning of the bit not shown here - TODO confirm)
+
+ return RsrcDataFormat;
+}
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 3bdbc9b..12dc3f3 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -17,6 +17,7 @@
#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H
#include "AMDGPUInstrInfo.h"
+#include "SIDefines.h"
#include "SIRegisterInfo.h"
namespace llvm {
@@ -44,6 +45,8 @@ private:
const TargetRegisterClass *RC,
const MachineOperand &Op) const;
+ void swapOperands(MachineBasicBlock::iterator Inst) const;
+
void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
MachineInstr *Inst, unsigned Opcode) const;
@@ -107,6 +110,10 @@ public:
bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ /// \brief Returns an opcode that can be used to move a value to a \p DstRC
+ /// register. If there is no hardware instruction that can store to \p
+ /// DstRC, then AMDGPU::COPY is returned.
+ unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
unsigned commuteOpcode(unsigned Opcode) const;
MachineInstr *commuteInstruction(MachineInstr *MI,
@@ -128,27 +135,92 @@ public:
bool isMov(unsigned Opcode) const override;
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
- bool isDS(uint16_t Opcode) const;
- bool isMIMG(uint16_t Opcode) const;
- bool isSMRD(uint16_t Opcode) const;
- bool isMUBUF(uint16_t Opcode) const;
- bool isMTBUF(uint16_t Opcode) const;
- bool isFLAT(uint16_t Opcode) const;
- bool isVOP1(uint16_t Opcode) const;
- bool isVOP2(uint16_t Opcode) const;
- bool isVOP3(uint16_t Opcode) const;
- bool isVOPC(uint16_t Opcode) const;
+
+ bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+ unsigned Reg, MachineRegisterInfo *MRI) const final;
+
+ bool isSALU(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SALU;
+ }
+
+ bool isVALU(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VALU;
+ }
+
+ bool isSOP1(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOP1;
+ }
+
+ bool isSOP2(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOP2;
+ }
+
+ bool isSOPC(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOPC;
+ }
+
+ bool isSOPK(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOPK;
+ }
+
+ bool isSOPP(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOPP;
+ }
+
+ bool isVOP1(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP1;
+ }
+
+ bool isVOP2(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP2;
+ }
+
+ bool isVOP3(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP3;
+ }
+
+ bool isVOPC(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOPC;
+ }
+
+ bool isMUBUF(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
+ }
+
+ bool isMTBUF(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
+ }
+
+ bool isSMRD(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SMRD;
+ }
+
+ bool isDS(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::DS;
+ }
+
+ bool isMIMG(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::MIMG;
+ }
+
+ bool isFLAT(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::FLAT;
+ }
+
+ bool isWQM(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::WQM;
+ }
bool isInlineConstant(const APInt &Imm) const;
- bool isInlineConstant(const MachineOperand &MO) const;
- bool isLiteralConstant(const MachineOperand &MO) const;
+ bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
+ bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
const MachineOperand &MO) const;
/// \brief Return true if the given offset Size in bytes can be folded into
/// the immediate offsets of a memory instruction for the given address space.
- static bool canFoldOffset(unsigned OffsetSize, unsigned AS) LLVM_READNONE;
+ bool canFoldOffset(unsigned OffsetSize, unsigned AS) const;
/// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
/// This function will return false if you pass it a 32-bit instruction.
@@ -156,7 +228,8 @@ public:
/// \brief Returns true if this operand uses the constant bus.
bool usesConstantBus(const MachineRegisterInfo &MRI,
- const MachineOperand &MO) const;
+ const MachineOperand &MO,
+ unsigned OpSize) const;
/// \brief Return true if this instruction has any modifiers.
/// e.g. src[012]_mod, omod, clamp.
@@ -168,7 +241,6 @@ public:
bool verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const override;
- bool isSALUInstr(const MachineInstr &MI) const;
static unsigned getVALUOp(const MachineInstr &MI);
bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
@@ -179,7 +251,27 @@ public:
/// the register class of its machine operand.
/// to infer the correct register class based on the other operands.
const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
- unsigned OpNo) const;\
+ unsigned OpNo) const;
+
+ /// \brief Return the size in bytes of the operand OpNo on the given
+ /// instruction opcode.
+ unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
+ const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo]; // static operand description for this opcode
+
+ if (OpInfo.RegClass == -1) { // -1: the description carries no register class for this operand
+ // If this is an immediate operand, this must be a 32-bit literal.
+ assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE);
+ return 4;
+ }
+
+ return RI.getRegClass(OpInfo.RegClass)->getSize(); // otherwise the size is that of the operand's register class
+ }
+
+ /// \brief Return the size of operand \p OpNo of \p MI via getOpRegClass(). This
+ /// form should usually be preferred since it handles unknown register classes.
+ unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
+ return getOpRegClass(MI, OpNo)->getSize();
+ }
/// \returns true if it is legal for the operand at index \p OpNo
/// to read a VGPR.
@@ -250,6 +342,9 @@ public:
unsigned OpName) const {
return getNamedOperand(const_cast<MachineInstr &>(MI), OpName);
}
+
+ uint64_t getDefaultRsrcDataFormat() const;
+
};
namespace AMDGPU {
@@ -258,7 +353,6 @@ namespace AMDGPU {
int getVOPe32(uint16_t Opcode);
int getCommuteRev(uint16_t Opcode);
int getCommuteOrig(uint16_t Opcode);
- int getMCOpcode(uint16_t Opcode, unsigned Gen);
int getAddr64Inst(uint16_t Opcode);
int getAtomicRetOp(uint16_t Opcode);
int getAtomicNoRetOp(uint16_t Opcode);
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 713e84e..e2747dc 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -9,35 +9,65 @@
class vop {
field bits<9> SI3;
+ field bits<10> VI3;
}
-class vopc <bits<8> si> : vop {
+class vopc <bits<8> si, bits<8> vi = !add(0x40, si)> : vop {
field bits<8> SI = si;
+ field bits<8> VI = vi;
- field bits<9> SI3 = {0, si{7-0}};
+ field bits<9> SI3 = {0, si{7-0}};
+ field bits<10> VI3 = {0, 0, vi{7-0}};
}
-class vop1 <bits<8> si> : vop {
- field bits<8> SI = si;
+class vop1 <bits<8> si, bits<8> vi = si> : vop {
+ field bits<8> SI = si;
+ field bits<8> VI = vi;
- field bits<9> SI3 = {1, 1, si{6-0}};
+ field bits<9> SI3 = {1, 1, si{6-0}};
+ field bits<10> VI3 = !add(0x140, vi);
}
-class vop2 <bits<6> si> : vop {
+class vop2 <bits<6> si, bits<6> vi = si> : vop {
field bits<6> SI = si;
+ field bits<6> VI = vi;
+
+ field bits<9> SI3 = {1, 0, 0, si{5-0}};
+ field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}};
+}
- field bits<9> SI3 = {1, 0, 0, si{5-0}};
+// Specify a VOP2 opcode for SI and a VOP3 opcode for VI, for instructions
+// that don't have a VOP2 encoding on VI.
+class vop23 <bits<6> si, bits<10> vi> : vop2 <si> {
+ let VI3 = vi; // replace the VI3 value that vop2 would otherwise derive from its own vi argument
}
-class vop3 <bits<9> si> : vop {
- field bits<9> SI3 = si;
+class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop {
+ let SI3 = si;
+ let VI3 = vi;
+}
+
+class sop1 <bits<8> si, bits<8> vi = si> {
+ field bits<8> SI = si;
+ field bits<8> VI = vi;
+}
+
+class sop2 <bits<7> si, bits<7> vi = si> {
+ field bits<7> SI = si;
+ field bits<7> VI = vi;
+}
+
+class sopk <bits<5> si, bits<5> vi = si> {
+ field bits<5> SI = si;
+ field bits<5> VI = vi;
}
// Except for the NONE field, this must be kept in sync with the SISubtarget enum
-// in AMDGPUMCInstLower.h
+// in AMDGPUInstrInfo.cpp
def SISubtarget {
int NONE = -1;
int SI = 0;
+ int VI = 1;
}
//===----------------------------------------------------------------------===//
@@ -131,6 +161,22 @@ def as_i32imm: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32);
}]>;
+def as_i64imm: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i64);
+}]>;
+
+// Copied from the AArch64 backend:
+def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32);
+}]>;
+
+// Copied from the AArch64 backend:
+def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64);
+}]>;
+
def IMM8bit : PatLeaf <(imm),
[{return isUInt<8>(N->getZExtValue());}]
>;
@@ -143,6 +189,10 @@ def IMM16bit : PatLeaf <(imm),
[{return isUInt<16>(N->getZExtValue());}]
>;
+def IMM20bit : PatLeaf <(imm),
+ [{return isUInt<20>(N->getZExtValue());}]
+>;
+
def IMM32bit : PatLeaf <(imm),
[{return isUInt<32>(N->getZExtValue());}]
>;
@@ -156,13 +206,16 @@ class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
return isInlineImmediate(N);
}]>;
+class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
+ return isInlineImmediate(N);
+}]>;
+
class SGPRImm <dag frag> : PatLeaf<frag, [{
- if (TM.getSubtarget<AMDGPUSubtarget>().getGeneration() <
- AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
return false;
}
const SIRegisterInfo *SIRI =
- static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo());
+ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
U != E; ++U) {
if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) {
@@ -186,6 +239,7 @@ def sopp_brtarget : Operand<OtherVT> {
}
include "SIInstrFormats.td"
+include "VIInstrFormats.td"
let OperandType = "OPERAND_IMMEDIATE" in {
@@ -238,14 +292,15 @@ def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;
def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
-def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
-def MUBUFAddr64Atomic : ComplexPattern<i64, 4, "SelectMUBUFAddr64">;
+def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">;
+def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
+def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
//===----------------------------------------------------------------------===//
@@ -298,7 +353,7 @@ class SIMCInstr <string pseudo, int subtarget> {
class EXPCommon : InstSI<
(outs),
(ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
- VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
+ VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3),
"exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
[] > {
@@ -308,60 +363,157 @@ class EXPCommon : InstSI<
multiclass EXP_m {
- let isPseudo = 1 in {
+ let isPseudo = 1, isCodeGenOnly = 1 in {
def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ;
}
def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe;
+
+ def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi;
}
//===----------------------------------------------------------------------===//
// Scalar classes
//===----------------------------------------------------------------------===//
-class SOP1_32 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
- op, (outs SReg_32:$dst), (ins SSrc_32:$src0),
- opName#" $dst, $src0", pattern
+class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+ SOP1 <outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> :
+ SOP1 <outs, ins, asm, []>,
+ SOP1e <op.SI>,
+ SIMCInstr<opName, SISubtarget.SI>;
+
+class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> :
+ SOP1 <outs, ins, asm, []>,
+ SOP1e <op.VI>,
+ SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm,
+ list<dag> pattern> {
+
+ def "" : SOP1_Pseudo <opName, outs, ins, pattern>;
+
+ def _si : SOP1_Real_si <op, opName, outs, ins, asm>;
+
+ def _vi : SOP1_Real_vi <op, opName, outs, ins, asm>;
+
+}
+
+multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
+ op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0),
+ opName#" $dst, $src0", pattern
>;
-class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
- op, (outs SReg_64:$dst), (ins SSrc_64:$src0),
- opName#" $dst, $src0", pattern
+multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
+ op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0),
+ opName#" $dst, $src0", pattern
>;
+// no input, 64-bit output.
+multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> {
+ def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>;
+
+ def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins),
+ opName#" $dst"> {
+ let ssrc0 = 0;
+ }
+
+ def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins),
+ opName#" $dst"> {
+ let ssrc0 = 0;
+ }
+}
+
+// 64-bit input, no output
+multiclass SOP1_1 <sop1 op, string opName, list<dag> pattern> {
+ def "" : SOP1_Pseudo <opName, (outs), (ins SReg_64:$src0), pattern>;
+
+ def _si : SOP1_Real_si <op, opName, (outs), (ins SReg_64:$src0),
+ opName#" $src0"> {
+ let sdst = 0;
+ }
+
+ def _vi : SOP1_Real_vi <op, opName, (outs), (ins SReg_64:$src0),
+ opName#" $src0"> {
+ let sdst = 0;
+ }
+}
+
// 64-bit input, 32-bit output.
-class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
- op, (outs SReg_32:$dst), (ins SSrc_64:$src0),
- opName#" $dst, $src0", pattern
+multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m <
+ op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0),
+ opName#" $dst, $src0", pattern
>;
-class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
- op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
- opName#" $dst, $src0, $src1", pattern
->;
+class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> :
+ SOP2<outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let Size = 4;
-class SOP2_SELECT_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
- op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
- opName#" $dst, $src0, $src1 [$scc]", pattern
->;
+ // Pseudo instructions have no encodings, but adding this field here allows
+ // us to do:
+ // let sdst = xxx in {
+ // for multiclasses that include both real and pseudo instructions.
+ field bits<7> sdst = 0;
+}
-class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
- op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
- opName#" $dst, $src0, $src1", pattern
->;
+class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> :
+ SOP2<outs, ins, asm, []>,
+ SOP2e<op.SI>,
+ SIMCInstr<opName, SISubtarget.SI>;
+
+class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> :
+ SOP2<outs, ins, asm, []>,
+ SOP2e<op.VI>,
+ SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> {
+ def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst),
+ (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>;
+
+ def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst),
+ (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
+ opName#" $dst, $src0, $src1 [$scc]">;
+
+ def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst),
+ (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
+ opName#" $dst, $src0, $src1 [$scc]">;
+}
+
+multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm,
+ list<dag> pattern> {
+
+ def "" : SOP2_Pseudo <opName, outs, ins, pattern>;
+
+ def _si : SOP2_Real_si <op, opName, outs, ins, asm>;
-class SOP2_64_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
- op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
- opName#" $dst, $src0, $src1", pattern
+ def _vi : SOP2_Real_vi <op, opName, outs, ins, asm>;
+
+}
+
+multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
+ op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
+ opName#" $dst, $src0, $src1", pattern
>;
-class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
- op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
- opName#" $dst, $src0, $src1", pattern
+multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
+ op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
+ opName#" $dst, $src0, $src1", pattern
>;
+multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
+ op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
+ opName#" $dst, $src0, $src1", pattern
+>;
-class SOPC_Helper <bits<7> op, RegisterClass rc, ValueType vt,
+class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
string opName, PatLeaf cond> : SOPC <
op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1),
opName#" $dst, $src0, $src1", []>;
@@ -372,15 +524,44 @@ class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL>
: SOPC_Helper<op, SSrc_64, i64, opName, cond>;
-class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK <
- op, (outs SReg_32:$dst), (ins u16imm:$src0),
- opName#" $dst, $src0", pattern
->;
+class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+ SOPK <outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
-class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK <
- op, (outs SReg_64:$dst), (ins u16imm:$src0),
- opName#" $dst, $src0", pattern
->;
+class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> :
+ SOPK <outs, ins, asm, []>,
+ SOPKe <op.SI>,
+ SIMCInstr<opName, SISubtarget.SI>;
+
+class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> :
+ SOPK <outs, ins, asm, []>,
+ SOPKe <op.VI>,
+ SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> {
+ def "" : SOPK_Pseudo <opName, (outs SReg_32:$dst), (ins u16imm:$src0),
+ pattern>;
+
+ def _si : SOPK_Real_si <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0),
+ opName#" $dst, $src0">;
+
+ def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0),
+ opName#" $dst, $src0">;
+}
+
+multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> {
+ def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst),
+ (ins SReg_32:$src0, u16imm:$src1), pattern>;
+
+ def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst),
+ (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0">;
+
+ def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst),
+ (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0">;
+}
//===----------------------------------------------------------------------===//
// SMRD classes
@@ -390,6 +571,7 @@ class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
SMRD <outs, ins, "", pattern>,
SIMCInstr<opName, SISubtarget.NONE> {
let isPseudo = 1;
+ let isCodeGenOnly = 1;
}
class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins,
@@ -398,6 +580,12 @@ class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins,
SMRDe <op, imm>,
SIMCInstr<opName, SISubtarget.SI>;
+class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins,
+ string asm> :
+ SMRD <outs, ins, asm, []>,
+ SMEMe_vi <op, imm>,
+ SIMCInstr<opName, SISubtarget.VI>;
+
multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins,
string asm, list<dag> pattern> {
@@ -405,6 +593,11 @@ multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins,
def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>;
+ // glc is only applicable to scalar stores, which are not yet
+ // implemented.
+ let glc = 0 in {
+ def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>;
+ }
}
multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass,
@@ -444,44 +637,27 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> {
// Returns the register class to use for the destination of VOP[123C]
// instructions for the given VT.
class getVALUDstForVT<ValueType VT> {
- RegisterClass ret = !if(!eq(VT.Size, 32), VReg_32, VReg_64);
+ RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32,
+ !if(!eq(VT.Size, 64), VReg_64,
+ SReg_64)); // else VT == i1
}
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT> {
- RegisterClass ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64);
+ RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64);
}
// Returns the register class to use for source 1 of VOP[12C] for the
// given VT.
class getVOPSrc1ForVT<ValueType VT> {
- RegisterClass ret = !if(!eq(VT.Size, 32), VReg_32, VReg_64);
-}
-
-// Returns the register classes for the source arguments of a VOP[12C]
-// instruction for the given SrcVTs.
-class getInRC32 <list<ValueType> SrcVT> {
- list<RegisterClass> ret = [
- getVOPSrc0ForVT<SrcVT[0]>.ret,
- getVOPSrc1ForVT<SrcVT[1]>.ret
- ];
+ RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64);
}
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT> {
- RegisterClass ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64);
-}
-
-// Returns the register classes for the source arguments of a VOP3
-// instruction for the given SrcVTs.
-class getInRC64 <list<ValueType> SrcVT> {
- list<RegisterClass> ret = [
- getVOP3SrcForVT<SrcVT[0]>.ret,
- getVOP3SrcForVT<SrcVT[1]>.ret,
- getVOP3SrcForVT<SrcVT[2]>.ret
- ];
+ RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64);
}
// Returns 1 if the source arguments have modifiers, 0 if they do not.
@@ -491,15 +667,15 @@ class hasModifiers<ValueType SrcVT> {
}
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
-class getIns32 <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
+class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1
!if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2
(ins)));
}
// Returns the input arguments for VOP3 instructions for the given SrcVT.
-class getIns64 <RegisterClass Src0RC, RegisterClass Src1RC,
- RegisterClass Src2RC, int NumSrcArgs,
+class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
+ RegisterOperand Src2RC, int NumSrcArgs,
bit HasModifiers> {
dag ret =
@@ -549,7 +725,7 @@ class getAsm32 <int NumSrcArgs> {
// Returns the assembly string for the inputs and outputs of a VOP3
// instruction.
class getAsm64 <int NumSrcArgs, bit HasModifiers> {
- string src0 = "$src0_modifiers,";
+ string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
" $src1_modifiers,"));
@@ -570,11 +746,11 @@ class VOPProfile <list<ValueType> _ArgVT> {
field ValueType Src1VT = ArgVT[2];
field ValueType Src2VT = ArgVT[3];
field RegisterClass DstRC = getVALUDstForVT<DstVT>.ret;
- field RegisterClass Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
+ field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret;
- field RegisterClass Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
- field RegisterClass Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
- field RegisterClass Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
+ field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
+ field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
+ field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret;
field bit HasModifiers = hasModifiers<Src0VT>.ret;
@@ -604,14 +780,31 @@ def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>;
def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>;
def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
+def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> {
let Src0RC32 = VCSrc_32;
}
+
+def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> {
+ let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+ let Asm64 = " $dst, $src0_modifiers, $src1";
+}
+
+def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> {
+ let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+ let Asm64 = " $dst, $src0_modifiers, $src1";
+}
+
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
+def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;
+def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> {
+ field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2);
+ field string Asm = " $dst, $src0, $vsrc1, $src2";
+}
def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
@@ -633,8 +826,13 @@ class AtomicNoRet <string noRetOp, bit isRet> {
class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
VOP1Common <outs, ins, "", pattern>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ VOP <opName>,
+ SIMCInstr <opName#"_e32", SISubtarget.NONE> {
let isPseudo = 1;
+ let isCodeGenOnly = 1;
+
+ field bits<8> vdst;
+ field bits<9> src0;
}
multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern,
@@ -642,32 +840,99 @@ multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern,
def "" : VOP1_Pseudo <outs, ins, pattern, opName>;
def _si : VOP1<op.SI, outs, ins, asm, []>,
- SIMCInstr <opName, SISubtarget.SI>;
+ SIMCInstr <opName#"_e32", SISubtarget.SI>;
+ def _vi : VOP1<op.VI, outs, ins, asm, []>,
+ SIMCInstr <opName#"_e32", SISubtarget.VI>;
+}
+
+multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, // Emits a VOP1 pseudo plus its SI real instruction only.
+ string opName> {
+ def "" : VOP1_Pseudo <outs, ins, pattern, opName>; // The pseudo is the only def that receives the pattern.
+
+ def _si : VOP1<op.SI, outs, ins, asm, []>, // Real SI def: asm string attached, empty pattern list.
+ SIMCInstr <opName#"_e32", SISubtarget.SI>; // Keyed by opName#"_e32" for the SI subtarget.
+ // No VI instruction. This class is for SI only.
+}
+
+class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : // VOP2 pseudo instruction class.
+ VOP2Common <outs, ins, "", pattern>, // Empty asm string; the pattern is attached here.
+ VOP <opName>,
+ SIMCInstr<opName#"_e32", SISubtarget.NONE> { // Keyed by opName#"_e32" with SISubtarget.NONE.
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, // Emits a VOP2 pseudo plus its SI real instruction only (no _vi def).
+ string opName, string revOp> {
+ def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
+ VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; // Second argument is 1 exactly when revOp equals opName.
+
+ def _si : VOP2 <op.SI, outs, ins, opName#asm, []>, // asm is the operand part; opName is prepended here.
+ SIMCInstr <opName#"_e32", SISubtarget.SI>;
+}
+
+multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, // Emits a VOP2 pseudo plus SI and VI real instructions.
+ string opName, string revOp> {
+ def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
+ VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; // Second argument is 1 exactly when revOp equals opName.
+
+ def _si : VOP2 <op.SI, outs, ins, opName#asm, []>, // asm is the operand part; opName is prepended here.
+ SIMCInstr <opName#"_e32", SISubtarget.SI>;
+ def _vi : VOP2 <op.VI, outs, ins, opName#asm, []>, // Same operands and asm as _si, using the VI opcode field.
+ SIMCInstr <opName#"_e32", SISubtarget.VI>;
}
class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> {
bits<2> src0_modifiers = !if(HasModifiers, ?, 0);
bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0);
- bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ? ,0) ,0);
+ bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0);
bits<2> omod = !if(HasModifiers, ?, 0);
bits<1> clamp = !if(HasModifiers, ?, 0);
bits<9> src1 = !if(HasSrc1, ?, 0);
bits<9> src2 = !if(HasSrc2, ?, 0);
}
+class VOP3DisableModFields <bit HasSrc0Mods, // Per-operand modifier switches: a cleared flag pins the matching field to 0.
+ bit HasSrc1Mods = 0,
+ bit HasSrc2Mods = 0,
+ bit HasOutputMods = 0> { // One flag governs both omod and clamp.
+ bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0); // '?' leaves the field unset when the flag is 1.
+ bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0);
+ bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0);
+ bits<2> omod = !if(HasOutputMods, ?, 0);
+ bits<1> clamp = !if(HasOutputMods, ?, 0);
+}
+
class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
VOP3Common <outs, ins, "", pattern>,
VOP <opName>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName#"_e64", SISubtarget.NONE> {
let isPseudo = 1;
+ let isCodeGenOnly = 1;
}
class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :
- VOP3 <op, outs, ins, asm, []>,
- SIMCInstr<opName, SISubtarget.SI>;
-
-multiclass VOP3_m <vop3 op, dag outs, dag ins, string asm, list<dag> pattern,
+ VOP3Common <outs, ins, asm, []>,
+ VOP3e <op>,
+ SIMCInstr<opName#"_e64", SISubtarget.SI>;
+
+class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : // Real VOP3 instruction for VI; op is 10 bits here (9 in VOP3_Real_si).
+ VOP3Common <outs, ins, asm, []>, // Asm string attached, empty pattern list.
+ VOP3e_vi <op>,
+ SIMCInstr <opName#"_e64", SISubtarget.VI>; // Keyed by opName#"_e64" for the VI subtarget.
+
+class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : // Real SI instruction using the VOP3be encoding class.
+ VOP3Common <outs, ins, asm, []>, // Asm string attached, empty pattern list.
+ VOP3be <op>,
+ SIMCInstr<opName#"_e64", SISubtarget.SI>; // Keyed by opName#"_e64" for the SI subtarget.
+
+class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : // Real VI instruction using the VOP3be_vi encoding class; op is 10 bits.
+ VOP3Common <outs, ins, asm, []>, // Asm string attached, empty pattern list.
+ VOP3be_vi <op>,
+ SIMCInstr <opName#"_e64", SISubtarget.VI>; // Keyed by opName#"_e64" for the VI subtarget.
+
+multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
string opName, int NumSrcArgs, bit HasMods = 1> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
@@ -676,7 +941,26 @@ multiclass VOP3_m <vop3 op, dag outs, dag ins, string asm, list<dag> pattern,
VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
!if(!eq(NumSrcArgs, 2), 0, 1),
HasMods>;
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
+ !if(!eq(NumSrcArgs, 2), 0, 1),
+ HasMods>;
+}
+
+// VOP3_m without source modifiers
+multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern, // Pseudo + SI + VI VOP3 defs with every modifier field forced to 0.
+ string opName, int NumSrcArgs, bit HasMods = 1> { // NOTE(review): NumSrcArgs and HasMods are not referenced in the body below.
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+ let src0_modifiers = 0, // The 'let' covers only the two real defs, not the pseudo.
+ src1_modifiers = 0,
+ src2_modifiers = 0,
+ clamp = 0,
+ omod = 0 in {
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>;
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>;
+ }
}
multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm,
@@ -686,6 +970,19 @@ multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm,
def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
VOP3DisableFields<0, 0, HasMods>;
+
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ VOP3DisableFields<0, 0, HasMods>;
+}
+
+multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, // VOP3 pseudo plus SI real def only.
+ list<dag> pattern, string opName, bit HasMods = 1> {
+
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ VOP3DisableFields<0, 0, HasMods>; // HasSrc1 = 0 and HasSrc2 = 0, so src1/src2 and their modifiers are pinned to 0.
+ // No VI instruction. This class is for SI only.
}
multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
@@ -695,12 +992,28 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
- def _si : VOP3_Real_si <op.SI3,
- outs, ins, asm, opName>,
- VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>,
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ VOP3DisableFields<1, 0, HasMods>;
+
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ VOP3DisableFields<1, 0, HasMods>;
+}
+
+multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, // VOP3 pseudo plus SI real def only.
+ list<dag> pattern, string opName, string revOp,
+ bit HasMods = 1, bit UseFullOp = 0> { // NOTE(review): UseFullOp is not referenced in the body below.
+
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+ VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; // Second argument is 1 exactly when revOp equals opName.
+
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
VOP3DisableFields<1, 0, HasMods>; // HasSrc1 = 1, HasSrc2 = 0: src2 and its modifiers are pinned to 0.
+
+ // No VI instruction. This class is for SI only.
}
+// XXX - Is v_div_scale_{f32|f64} only available in vop3b without
+// option of implicit vcc use?
multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, string revOp,
bit HasMods = 1, bit UseFullOp = 0> {
@@ -711,13 +1024,27 @@ multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm,
// can write it into any SGPR. We currently don't use the carry out,
// so for now hardcode it to VCC as well.
let sdst = SIOperand.VCC, Defs = [VCC] in {
- def _si : VOP3b <op.SI3, outs, ins, asm, pattern>,
- VOP3DisableFields<1, 0, HasMods>,
- SIMCInstr<opName, SISubtarget.SI>,
- VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>;
+ def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>,
+ VOP3DisableFields<1, 0, HasMods>;
+
+ def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>,
+ VOP3DisableFields<1, 0, HasMods>;
} // End sdst = SIOperand.VCC, Defs = [VCC]
}
+multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm, // Pseudo + SI + VI defs using the VOP3b real classes.
+ list<dag> pattern, string opName, string revOp,
+ bit HasMods = 1, bit UseFullOp = 0> { // NOTE(review): revOp and UseFullOp are not referenced in the body below.
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+
+ def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>,
+ VOP3DisableFields<1, 1, HasMods>; // HasSrc1 = 1 and HasSrc2 = 1: neither src1 nor src2 is pinned to 0.
+
+ def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>,
+ VOP3DisableFields<1, 1, HasMods>;
+}
+
multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName,
bit HasMods, bit defExec> {
@@ -725,17 +1052,39 @@ multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
- VOP3DisableFields<1, 0, HasMods> {
+ VOP3DisableFields<1, 0, HasMods> {
+ let Defs = !if(defExec, [EXEC], []);
+ }
+
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ VOP3DisableFields<1, 0, HasMods> {
let Defs = !if(defExec, [EXEC], []);
}
}
+// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers.
+multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, // Pseudo + VOP2-encoded _si def + VOP3-encoded _vi def.
+ string asm, list<dag> pattern = []> {
+ let isPseudo = 1, isCodeGenOnly = 1 in {
+ def "" : VOPAnyCommon <outs, ins, "", pattern>, // Pseudo: empty asm string, carries the pattern.
+ SIMCInstr<opName, SISubtarget.NONE>; // Keyed by plain opName (no _e32/_e64 suffix) in all three defs.
+ }
+
+ def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>, // VOP2 opcode is taken from bits 5..0 of op.SI3.
+ SIMCInstr <opName, SISubtarget.SI>;
+
+ def _vi : VOP3Common <outs, ins, asm, []>,
+ VOP3e_vi <op.VI3>,
+ VOP3DisableFields <1, 0, 0>, // HasSrc1 = 1, HasSrc2 = 0, HasModifiers = 0.
+ SIMCInstr <opName, SISubtarget.VI>;
+}
+
multiclass VOP1_Helper <vop1 op, string opName, dag outs,
dag ins32, string asm32, list<dag> pat32,
dag ins64, string asm64, list<dag> pat64,
bit HasMods> {
- def _e32 : VOP1 <op.SI, outs, ins32, opName#asm32, pat32>, VOP<opName>;
+ defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>;
defm _e64 : VOP3_1_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, HasMods>;
}
@@ -752,17 +1101,24 @@ multiclass VOP1Inst <vop1 op, string opName, VOPProfile P,
P.HasModifiers
>;
-class VOP2_e32 <bits<6> op, string opName, dag outs, dag ins, string asm,
- list<dag> pattern, string revOp> :
- VOP2 <op, outs, ins, opName#asm, pattern>,
- VOP <opName>,
- VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
+multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, // SI-only one-source instruction: _e32 via VOP1SI_m, _e64 via VOP3SI_1_m.
+ SDPatternOperator node = null_frag> {
+
+ defm _e32 : VOP1SI_m <op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>; // The 32-bit form gets no pattern.
+
+ defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, // NOTE(review): no "_e64" infix in the asm here, unlike VOP1_Helper and VOP2InstSI; confirm this is intended.
+ !if(P.HasModifiers, // Pattern routes src0 through VOP3Mods0 only when the profile has modifiers.
+ [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+ i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0))]),
+ opName, P.HasModifiers>;
+}
multiclass VOP2_Helper <vop2 op, string opName, dag outs,
dag ins32, string asm32, list<dag> pat32,
dag ins64, string asm64, list<dag> pat64,
string revOp, bit HasMods> {
- def _e32 : VOP2_e32 <op.SI, opName, outs, ins32, asm32, pat32, revOp>;
+ defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>;
defm _e64 : VOP3_2_m <op,
outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods
@@ -784,12 +1140,27 @@ multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
revOp, P.HasModifiers
>;
+multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, // SI-only two-source instruction: _e32 via VOP2SI_m, _e64 via VOP3SI_2_m.
+ SDPatternOperator node = null_frag,
+ string revOp = opName> { // revOp defaults to the instruction's own name.
+ defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>; // The 32-bit form gets no pattern; VOP2SI_m prepends opName to the asm.
+
+ defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#"_e64"#P.Asm64,
+ !if(P.HasModifiers, // With modifiers: src0 via VOP3Mods0 (also binds clamp/omod), src1 via VOP3Mods.
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ opName, revOp, P.HasModifiers>;
+}
+
multiclass VOP2b_Helper <vop2 op, string opName, dag outs,
dag ins32, string asm32, list<dag> pat32,
dag ins64, string asm64, list<dag> pat64,
string revOp, bit HasMods> {
- def _e32 : VOP2_e32 <op.SI, opName, outs, ins32, asm32, pat32, revOp>;
+ defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>;
defm _e64 : VOP3b_2_m <op,
outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods
@@ -811,16 +1182,94 @@ multiclass VOP2bInst <vop2 op, string opName, VOPProfile P,
revOp, P.HasModifiers
>;
+// A VOP2 instruction that is VOP3-only on VI.
+multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs, // Combines an SI-only 32-bit form with a VOP3_2_m 64-bit form.
+ dag ins32, string asm32, list<dag> pat32,
+ dag ins64, string asm64, list<dag> pat64,
+ string revOp, bit HasMods> {
+ defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>; // VOP2SI_m defines no _vi record.
+
+ defm _e64 : VOP3_2_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName,
+ revOp, HasMods>;
+}
+
+multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, // Profile-driven front end for VOP2_VI3_Helper.
+ SDPatternOperator node = null_frag,
+ string revOp = opName> // revOp defaults to the instruction's own name.
+ : VOP2_VI3_Helper <
+ op, opName, P.Outs,
+ P.Ins32, P.Asm32, [], // The 32-bit form gets no pattern.
+ P.Ins64, P.Asm64,
+ !if(P.HasModifiers, // With modifiers: src0 via VOP3Mods0 (also binds clamp/omod), src1 via VOP3Mods.
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ revOp, P.HasModifiers
+>;
+
+multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> { // Pseudo + SI + VI defs built from the VOP_MADK profile's Outs/Ins/Asm.
+
+ def "" : VOP2_Pseudo <VOP_MADK.Outs, VOP_MADK.Ins, pattern, opName>;
+
+let isCodeGenOnly = 0 in {
+ def _si : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins,
+ !strconcat(opName, VOP_MADK.Asm), []>, // Asm string = opName followed by the profile's operand string.
+ SIMCInstr <opName#"_e32", SISubtarget.SI>,
+ VOP2_MADKe <op.SI>; // Uses the VOP2_MADKe encoding class with the SI opcode.
+
+ def _vi : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins,
+ !strconcat(opName, VOP_MADK.Asm), []>,
+ SIMCInstr <opName#"_e32", SISubtarget.VI>,
+ VOP2_MADKe <op.VI>; // Same encoding class with the VI opcode.
+} // End isCodeGenOnly = 0
+}
+
+class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : // VOPC pseudo instruction class.
+ VOPCCommon <ins, "", pattern>, // Note: 'outs' is accepted above but not forwarded to VOPCCommon.
+ VOP <opName>,
+ SIMCInstr<opName#"_e32", SISubtarget.NONE> { // Keyed by opName#"_e32" with SISubtarget.NONE.
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, // Pseudo + SI + VI defs for a 32-bit VOPC instruction.
+ string opName, bit DefExec> {
+ def "" : VOPC_Pseudo <outs, ins, pattern, opName>; // Only the pseudo receives the pattern.
+
+ def _si : VOPC<op.SI, ins, asm, []>, // 'outs' is not passed to VOPC.
+ SIMCInstr <opName#"_e32", SISubtarget.SI> {
+ let Defs = !if(DefExec, [EXEC], []); // Defs is [EXEC] when DefExec is set, otherwise empty.
+ }
+
+ def _vi : VOPC<op.VI, ins, asm, []>,
+ SIMCInstr <opName#"_e32", SISubtarget.VI> {
+ let Defs = !if(DefExec, [EXEC], []); // Same Defs selection as the _si def.
+ }
+}
+
multiclass VOPC_Helper <vopc op, string opName,
dag ins32, string asm32, list<dag> pat32,
dag out64, dag ins64, string asm64, list<dag> pat64,
bit HasMods, bit DefExec> {
- def _e32 : VOPC <op.SI, ins32, opName#asm32, pat32>, VOP <opName> {
- let Defs = !if(DefExec, [EXEC], []);
- }
+ defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>;
+
+ defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64,
+ opName, HasMods, DefExec>;
+}
+
+// Special case for class instructions which only have modifiers on
+// the 1st source operand.
+multiclass VOPC_Class_Helper <vopc op, string opName,
+ dag ins32, string asm32, list<dag> pat32,
+ dag out64, dag ins64, string asm64, list<dag> pat64,
+ bit HasMods, bit DefExec> {
+ defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>;
- defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64, opName,
- HasMods, DefExec>;
+ defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64,
+ opName, HasMods, DefExec>,
+ VOP3DisableModFields<1, 0, 0>;
}
multiclass VOPCInst <vopc op, string opName,
@@ -839,6 +1288,19 @@ multiclass VOPCInst <vopc op, string opName,
P.HasModifiers, DefExec
>;
+multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, // Profile-driven front end for VOPC_Class_Helper, matching AMDGPUfp_class.
+ bit DefExec = 0> : VOPC_Class_Helper <
+ op, opName,
+ P.Ins32, P.Asm32, [], // The 32-bit form gets no pattern.
+ (outs SReg_64:$dst), P.Ins64, P.Asm64, // The 64-bit form's output operand is SReg_64:$dst.
+ !if(P.HasModifiers, // With modifiers only src0 is wrapped (VOP3Mods0Clamp0OMod); src1 is matched directly.
+ [(set i1:$dst,
+ (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))],
+ [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
+ P.HasModifiers, DefExec
+>;
+
+
multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
VOPCInst <op, opName, VOP_F32_F32_F32, cond>;
@@ -873,6 +1335,18 @@ multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
op, outs, ins, opName#asm, pat, opName, NumSrcArgs, HasMods
>;
+multiclass VOPC_CLASS_F32 <vopc op, string opName> : // VOPCClassInst with the VOP_I1_F32_I32 profile and DefExec = 0.
+ VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>;
+
+multiclass VOPCX_CLASS_F32 <vopc op, string opName> : // VOPCClassInst with the VOP_I1_F32_I32 profile and DefExec = 1.
+ VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>;
+
+multiclass VOPC_CLASS_F64 <vopc op, string opName> : // VOPCClassInst with the VOP_I1_F64_I32 profile and DefExec = 0.
+ VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>;
+
+multiclass VOPCX_CLASS_F64 <vopc op, string opName> : // VOPCClassInst with the VOP_I1_F64_I32 profile and DefExec = 1.
+ VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>;
+
multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag> : VOP3_Helper <
op, opName, P.Outs, P.Ins64, P.Asm64,
@@ -901,9 +1375,31 @@ multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
P.NumSrcArgs, P.HasModifiers
>;
-multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterClass arc,
+// Special case for v_div_fmas_{f32|f64}, since it seems to be the
+// only VOP instruction that implicitly reads VCC.
+multiclass VOP3_VCC_Inst <vop3 op, string opName, // Three-source VOP3 whose pattern passes VCC to the node as an extra i1 operand.
+ VOPProfile P,
+ SDPatternOperator node = null_frag> : VOP3_Helper <
+ op, opName,
+ P.Outs,
+ (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, // Explicit operand list: all three sources have modifiers, plus clamp and omod.
+ InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1,
+ InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2,
+ ClampMod:$clamp,
+ omod:$omod),
+ " $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", // VCC does not appear in the operand list or asm string.
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)),
+ (i1 VCC)))], // Fourth node operand is the VCC register typed as i1.
+ 3, 1 // Presumably NumSrcArgs = 3 and HasMods = 1 (VOP3_Helper's signature is only partly visible here).
+>;
+
+multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc,
string opName, list<dag> pattern> :
- VOP3b_2_m <
+ VOP3b_3_m <
op, (outs vrc:$vdst, SReg_64:$sdst),
(ins InputModsNoDefault:$src0_modifiers, arc:$src0,
InputModsNoDefault:$src1_modifiers, arc:$src1,
@@ -917,7 +1413,7 @@ multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> :
VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> :
- VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>;
+ VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>;
class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
@@ -931,124 +1427,259 @@ class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
i32:$omod)>;
//===----------------------------------------------------------------------===//
+// Interpolation opcodes
+//===----------------------------------------------------------------------===//
+
+class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : // VINTRP pseudo instruction class.
+ VINTRPCommon <outs, ins, "", pattern>, // Empty asm string; the pattern is attached here.
+ SIMCInstr<opName, SISubtarget.NONE> { // Keyed by plain opName with SISubtarget.NONE.
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins, // Real VINTRP instruction for SI; op is 2 bits.
+ string asm> :
+ VINTRPCommon <outs, ins, asm, []>, // Asm string attached, empty pattern list.
+ VINTRPe <op>,
+ SIMCInstr<opName, SISubtarget.SI>;
+
+class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins, // Real VINTRP instruction for VI; differs from the SI class by using VINTRPe_vi.
+ string asm> :
+ VINTRPCommon <outs, ins, asm, []>, // Asm string attached, empty pattern list.
+ VINTRPe_vi <op>,
+ SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass VINTRP_m <bits <2> op, string opName, dag outs, dag ins, string asm, // Pseudo + SI + VI VINTRP defs sharing one 2-bit opcode.
+ string disableEncoding = "", string constraints = "",
+ list<dag> pattern = []> {
+ let DisableEncoding = disableEncoding, // Both settings apply to all three defs below.
+ Constraints = constraints in {
+ def "" : VINTRP_Pseudo <opName, outs, ins, pattern>;
+
+ def _si : VINTRP_Real_si <op, opName, outs, ins, asm>;
+
+ def _vi : VINTRP_Real_vi <op, opName, outs, ins, asm>; // Same op value as the _si def.
+ }
+}
+
+//===----------------------------------------------------------------------===//
// Vector I/O classes
//===----------------------------------------------------------------------===//
-class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> :
- DS <op, outs, ins, asm, pat> {
+class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : // DS pseudo instruction class.
+ DS <outs, ins, "", pattern>, // Empty asm string; the pattern is attached here.
+ SIMCInstr <opName, SISubtarget.NONE> { // Keyed by plain opName with SISubtarget.NONE.
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : // Real DS instruction for SI (DSe encoding class).
+ DS <outs, ins, asm, []>, // Asm string attached, empty pattern list.
+ DSe <op>,
+ SIMCInstr <opName, SISubtarget.SI>;
+
+class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : // Real DS instruction for VI (DSe_vi encoding class).
+ DS <outs, ins, asm, []>, // Asm string attached, empty pattern list.
+ DSe_vi <op>,
+ SIMCInstr <opName, SISubtarget.VI>;
+
+class DS_1A_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : // SI real DS class that derives offset0/offset1 from one 16-bit offset.
+ DS <outs, ins, asm, []>,
+ DSe <op>,
+ SIMCInstr <opName, SISubtarget.SI> {
+
+ // Single load interpret the 2 i8imm operands as a single i16 offset.
bits<16> offset;
+ let offset0 = offset{7-0}; // Low byte of the 16-bit offset.
+ let offset1 = offset{15-8}; // High byte of the 16-bit offset.
+}
+
+class DS_1A_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : // VI real DS class that derives offset0/offset1 from one 16-bit offset.
+ DS <outs, ins, asm, []>,
+ DSe_vi <op>,
+ SIMCInstr <opName, SISubtarget.VI> {
// Single load interpret the 2 i8imm operands as a single i16 offset.
+ bits<16> offset;
let offset0 = offset{7-0}; // Low byte of the 16-bit offset.
let offset1 = offset{15-8}; // High byte of the 16-bit offset.
+}
+
+multiclass DS_1A_Load_m <bits<8> op, string opName, dag outs, dag ins, string asm,
+ list<dag> pat> {
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ def "" : DS_Pseudo <opName, outs, ins, pat>;
- let hasSideEffects = 0;
+ let data0 = 0, data1 = 0 in {
+ def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
}
-class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+multiclass DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass>
+ : DS_1A_Load_m <
op,
+ asm,
(outs regClass:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, ds_offset:$offset),
- asm#" $vdst, $addr"#"$offset"#" [M0]",
- []> {
- let data0 = 0;
- let data1 = 0;
- let mayLoad = 1;
- let mayStore = 0;
+ (ins i1imm:$gds, VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0),
+ asm#" $vdst, $addr"#"$offset",
+ []>;
+
+multiclass DS_Load2_m <bits<8> op, string opName, dag outs, dag ins, string asm, // Pseudo + SI + VI defs built on DS_Real_* (separate offset0/offset1 fields).
+ list<dag> pat> {
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { // Flags apply to the pseudo and both real defs.
+ def "" : DS_Pseudo <opName, outs, ins, pat>;
+
+ let data0 = 0, data1 = 0 in { // Both data fields are pinned to 0 on the real defs.
+ def _si : DS_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
}
-class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+multiclass DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass>
+ : DS_Load2_m <
op,
+ asm,
(outs regClass:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1),
- asm#" $vdst, $addr"#"$offset0"#"$offset1 [M0]",
- []> {
- let data0 = 0;
- let data1 = 0;
- let mayLoad = 1;
- let mayStore = 0;
- let hasSideEffects = 0;
+ (ins i1imm:$gds, VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
+ M0Reg:$m0),
+ asm#" $vdst, $addr"#"$offset0"#"$offset1",
+ []>;
+
+multiclass DS_1A_Store_m <bits<8> op, string opName, dag outs, dag ins, // Pseudo + SI + VI defs built on DS_1A_Real_* (single 16-bit offset).
+ string asm, list<dag> pat> {
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { // Flags apply to the pseudo and both real defs.
+ def "" : DS_Pseudo <opName, outs, ins, pat>;
+
+ let data1 = 0, vdst = 0 in { // data1 and vdst are pinned to 0 on the real defs.
+ def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
}
-class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+multiclass DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass>
+ : DS_1A_Store_m <
op,
+ asm,
(outs),
- (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, ds_offset:$offset),
- asm#" $addr, $data0"#"$offset"#" [M0]",
- []> {
- let data1 = 0;
- let mayStore = 1;
- let mayLoad = 0;
- let vdst = 0;
+ (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, ds_offset:$offset, M0Reg:$m0),
+ asm#" $addr, $data0"#"$offset",
+ []>;
+
+multiclass DS_Store_m <bits<8> op, string opName, dag outs, dag ins, // Pseudo + SI + VI defs built on DS_Real_* (separate offset0/offset1 fields).
+ string asm, list<dag> pat> {
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { // Flags apply to the pseudo and both real defs.
+ def "" : DS_Pseudo <opName, outs, ins, pat>;
+
+ let vdst = 0 in { // Only vdst is pinned to 0; data1 is left to the operands.
+ def _si : DS_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
}
-class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+multiclass DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass>
+ : DS_Store_m <
op,
+ asm,
(outs),
- (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, regClass:$data1,
- ds_offset0:$offset0, ds_offset1:$offset1),
- asm#" $addr, $data0, $data1"#"$offset0"#"$offset1 [M0]",
- []> {
- let mayStore = 1;
- let mayLoad = 0;
- let hasSideEffects = 0;
- let vdst = 0;
-}
+ (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, regClass:$data1,
+ ds_offset0:$offset0, ds_offset1:$offset1, M0Reg:$m0),
+ asm#" $addr, $data0, $data1"#"$offset0"#"$offset1",
+ []>;
// 1 address, 1 data.
-class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A <
- op,
- (outs rc:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset),
- asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", []>,
- AtomicNoRet<noRetOp, 1> {
+multiclass DS_1A1D_RET_m <bits<8> op, string opName, dag outs, dag ins, // Pseudo + SI + VI defs marked as both loading and storing.
+ string asm, list<dag> pat, string noRetOp> {
+ let mayLoad = 1, mayStore = 1,
+ hasPostISelHook = 1 // Adjusted to no return version.
+ in {
+ def "" : DS_Pseudo <opName, outs, ins, pat>,
+ AtomicNoRet<noRetOp, 1>; // Only the pseudo carries the AtomicNoRet mixin (second argument 1).
+
+ let data1 = 0 in { // data1 is pinned to 0 on the real defs.
+ def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
+}
- let data1 = 0;
- let mayStore = 1;
- let mayLoad = 1;
+multiclass DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, // Front end for DS_1A1D_RET_m; 'asm' is also passed as the opName argument.
+ string noRetOp = ""> : DS_1A1D_RET_m <
+ op, asm,
+ (outs rc:$vdst),
+ (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), // M0 is an explicit input operand ($m0) that is not printed in the asm string.
+ asm#" $vdst, $addr, $data0"#"$offset", [], noRetOp>;
- let hasPostISelHook = 1; // Adjusted to no return version.
+// 1 address, 2 data.
+multiclass DS_1A2D_RET_m <bits<8> op, string opName, dag outs, dag ins, // Pseudo + SI + VI defs marked as both loading and storing.
+ string asm, list<dag> pat, string noRetOp> {
+ let mayLoad = 1, mayStore = 1,
+ hasPostISelHook = 1 // Adjusted to no return version.
+ in {
+ def "" : DS_Pseudo <opName, outs, ins, pat>,
+ AtomicNoRet<noRetOp, 1>; // Only the pseudo carries the AtomicNoRet mixin (second argument 1).
+
+ def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; // No data1/vdst fields are pinned here, unlike DS_1A1D_RET_m.
+ def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+ }
}
-// 1 address, 2 data.
-class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A <
- op,
+multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc,
+ string noRetOp = ""> : DS_1A2D_RET_m <
+ op, asm,
(outs rc:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset),
- asm#" $vdst, $addr, $data0, $data1"#"$offset"#" [M0]",
- []>,
- AtomicNoRet<noRetOp, 1> {
- let mayStore = 1;
- let mayLoad = 1;
- let hasPostISelHook = 1; // Adjusted to no return version.
-}
+ (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0),
+ asm#" $vdst, $addr, $data0, $data1"#"$offset",
+ [], noRetOp>;
// 1 address, 2 data.
-class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A <
- op,
- (outs),
- (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset),
- asm#" $addr, $data0, $data1"#"$offset"#" [M0]",
- []>,
- AtomicNoRet<noRetOp, 0> {
- let mayStore = 1;
- let mayLoad = 1;
+multiclass DS_1A2D_NORET_m <bits<8> op, string opName, dag outs, dag ins, // Pseudo + SI + VI defs marked as both loading and storing.
+ string asm, list<dag> pat, string noRetOp> {
+ let mayLoad = 1, mayStore = 1 in {
+ def "" : DS_Pseudo <opName, outs, ins, pat>,
+ AtomicNoRet<noRetOp, 0>; // Only the pseudo carries the AtomicNoRet mixin (second argument 0).
+
+ let vdst = 0 in { // vdst is pinned to 0 on the real defs.
+ def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
}
-// 1 address, 1 data.
-class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A <
- op,
+multiclass DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc,
+ string noRetOp = asm> : DS_1A2D_NORET_m <
+ op, asm,
(outs),
- (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset),
- asm#" $addr, $data0"#"$offset"#" [M0]",
- []>,
- AtomicNoRet<noRetOp, 0> {
+ (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0),
+ asm#" $addr, $data0, $data1"#"$offset",
+ [], noRetOp>;
- let data1 = 0;
- let mayStore = 1;
- let mayLoad = 1;
+// 1 address, 1 data.
+multiclass DS_1A1D_NORET_m <bits<8> op, string opName, dag outs, dag ins, // Pseudo + SI + VI defs marked as both loading and storing.
+ string asm, list<dag> pat, string noRetOp> {
+ let mayLoad = 1, mayStore = 1 in {
+ def "" : DS_Pseudo <opName, outs, ins, pat>,
+ AtomicNoRet<noRetOp, 0>; // Only the pseudo carries the AtomicNoRet mixin (second argument 0).
+
+ let data1 = 0, vdst = 0 in { // data1 and vdst are pinned to 0 on the real defs.
+ def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
}
+multiclass DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, // Front end for DS_1A1D_NORET_m; 'asm' is also passed as the opName argument.
+ string noRetOp = asm> : DS_1A1D_NORET_m < // noRetOp defaults to the instruction's own asm name.
+ op, asm,
+ (outs),
+ (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), // M0 is an explicit input operand ($m0) that is not printed in the asm string.
+ asm#" $addr, $data0"#"$offset",
+ [], noRetOp>;
+
//===----------------------------------------------------------------------===//
// MTBUF classes
//===----------------------------------------------------------------------===//
@@ -1057,6 +1688,7 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
MTBUF <outs, ins, "", pattern>,
SIMCInstr<opName, SISubtarget.NONE> {
let isPseudo = 1;
+ let isCodeGenOnly = 1;
}
class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins,
@@ -1065,6 +1697,11 @@ class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins,
MTBUFe <op>,
SIMCInstr<opName, SISubtarget.SI>;
+class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> : // Real MTBUF instruction for VI; op is 4 bits here (3 in MTBUF_Real_si).
+ MTBUF <outs, ins, asm, []>, // Asm string attached, empty pattern list.
+ MTBUFe_vi <op>,
+ SIMCInstr <opName, SISubtarget.VI>;
+
multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm,
list<dag> pattern> {
@@ -1072,6 +1709,8 @@ multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm,
def _si : MTBUF_Real_si <op, opName, outs, ins, asm>;
+ def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>;
+
}
let mayStore = 1, mayLoad = 0 in {
@@ -1080,8 +1719,8 @@ multiclass MTBUF_Store_Helper <bits<3> op, string opName,
RegisterClass regClass> : MTBUF_m <
op, opName, (outs),
(ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
- i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
- SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
+ i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr,
+ SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset),
opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
#" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []
>;
@@ -1094,43 +1733,124 @@ multiclass MTBUF_Load_Helper <bits<3> op, string opName,
RegisterClass regClass> : MTBUF_m <
op, opName, (outs regClass:$dst),
(ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
- i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc,
- i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
+ i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc,
+ i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset),
opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
#" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []
>;
} // mayLoad = 1, mayStore = 0
-class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
+//===----------------------------------------------------------------------===//
+// MUBUF classes
+//===----------------------------------------------------------------------===//
+class mubuf <bits<7> si, bits<7> vi = si> { // Opcode pair for MUBUF instructions; the VI value defaults to the SI value.
+ field bits<7> SI = si;
+ field bits<7> VI = vi;
+}
+
+class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
bit IsAddr64 = is_addr64;
string OpName = NAME # suffix;
}
-class MUBUFAtomicAddr64 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern>
- : MUBUF <op, outs, ins, asm, pattern> {
+class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : // MUBUF pseudo instruction class.
+ MUBUF <outs, ins, "", pattern>, // Empty asm string; the pattern is attached here.
+ SIMCInstr<opName, SISubtarget.NONE> { // Keyed by plain opName with SISubtarget.NONE.
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+
+ // dummy fields, so that we can use let statements around multiclasses
+ bits<1> offen;
+ bits<1> idxen;
+ bits<8> vaddr;
+ bits<1> glc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+}
+
+class MUBUF_Real_si <mubuf op, string opName, dag outs, dag ins, // Real MUBUF instruction for SI; takes the opcode pair and uses op.SI.
+ string asm> :
+ MUBUF <outs, ins, asm, []>, // Asm string attached, empty pattern list.
+ MUBUFe <op.SI>,
+ SIMCInstr<opName, SISubtarget.SI> {
+ let lds = 0; // The lds field is always 0 for this class.
+}
- let offen = 0;
- let idxen = 0;
- let addr64 = 1;
- let tfe = 0;
+class MUBUF_Real_vi <mubuf op, string opName, dag outs, dag ins, // Real MUBUF instruction for VI; takes the opcode pair and uses op.VI.
+ string asm> :
+ MUBUF <outs, ins, asm, []>, // Asm string attached, empty pattern list.
+ MUBUFe_vi <op.VI>,
+ SIMCInstr<opName, SISubtarget.VI> {
let lds = 0; // The lds field is always 0 for this class.
- let soffset = 128;
}
-class MUBUFAtomicOffset <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern>
- : MUBUF <op, outs, ins, asm, pattern> {
+multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm,
+ list<dag> pattern> { // expands to a pseudo (holding the pattern) plus _si and _vi real defs
+
+ def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
+ MUBUFAddr64Table <0>; // IsAddr64 = 0
- let offen = 0;
- let idxen = 0;
- let addr64 = 0;
- let tfe = 0;
+ let addr64 = 0 in { // addr64 is only set around the SI def, not the VI def
+ def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
+ }
+
+ def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>;
+}
+
+multiclass MUBUFAddr64_m <mubuf op, string opName, dag outs,
+ dag ins, string asm, list<dag> pattern> { // addr64 flavour: a pseudo plus an SI-only real def
+
+ def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
+ MUBUFAddr64Table <1>; // IsAddr64 = 1
+
+ let addr64 = 1 in {
+ def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
+ }
+
+ // There is no VI version. If the pseudo is selected, it should be lowered
+ // for VI appropriately.
+}
+
+class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : // takes a raw 7-bit opcode (not a mubuf pair) and keeps its pattern
+ MUBUF <outs, ins, asm, pattern>, MUBUFe <op> {
let lds = 0;
- let vaddr = 0;
}
-multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc,
+multiclass MUBUFAtomicOffset_m <mubuf op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern, bit is_return> { // offset-addressed atomic: a pseudo plus _si and _vi real defs
+
+ def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
+ MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>, // returning variants pass the suffix "_RTN" for OpName
+ AtomicNoRet<NAME#"_OFFSET", is_return>;
+
+ let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in {
+ let addr64 = 0 in { // addr64 is only set around the SI def
+ def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
+ }
+
+ def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>;
+ }
+}
+
+multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern, bit is_return> { // addr64-addressed atomic: a pseudo plus an SI-only real def
+
+ def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
+ MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, // returning variants pass the suffix "_RTN" for OpName
+ AtomicNoRet<NAME#"_ADDR64", is_return>;
+
+ let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in {
+ def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
+ }
+
+ // There is no VI version. If the pseudo is selected, it should be lowered
+ // for VI appropriately.
+}
+
+multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, // four variants per atomic: {no-return, _RTN} x {_ADDR64, _OFFSET}
ValueType vt, SDPatternOperator atomic> {
let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in {
@@ -1138,174 +1858,149 @@ multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc,
// No return variants
let glc = 0 in {
- def _ADDR64 : MUBUFAtomicAddr64 <
- op, (outs),
+ defm _ADDR64 : MUBUFAtomicAddr64_m <
+ op, name#"_addr64", (outs),
(ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
- mbuf_offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#"$slc", []
- >, MUBUFAddr64Table<1>, AtomicNoRet<NAME#"_ADDR64", 0>;
+ mbuf_offset:$offset, SCSrc_32:$soffset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0
+ >;
- def _OFFSET : MUBUFAtomicOffset <
- op, (outs),
+ defm _OFFSET : MUBUFAtomicOffset_m <
+ op, name#"_offset", (outs),
(ins rc:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
- SSrc_32:$soffset, slc:$slc),
- name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", []
- >, MUBUFAddr64Table<0>, AtomicNoRet<NAME#"_OFFSET", 0>;
+ SCSrc_32:$soffset, slc:$slc),
+ name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0
+ >;
} // glc = 0
// Variant that return values
let glc = 1, Constraints = "$vdata = $vdata_in",
DisableEncoding = "$vdata_in" in {
- def _RTN_ADDR64 : MUBUFAtomicAddr64 <
- op, (outs rc:$vdata),
+ defm _RTN_ADDR64 : MUBUFAtomicAddr64_m <
+ op, name#"_rtn_addr64", (outs rc:$vdata),
(ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr,
- mbuf_offset:$offset, slc:$slc),
- name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#" glc"#"$slc",
+ mbuf_offset:$offset, SCSrc_32:$soffset, slc:$slc), // SCSrc_32 like the other three variants; this one was left as SSrc_32
+ name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc",
[(set vt:$vdata,
- (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i16:$offset,
- i1:$slc), vt:$vdata_in))]
- >, MUBUFAddr64Table<1, "_RTN">, AtomicNoRet<NAME#"_ADDR64", 1>;
+ (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i1:$slc), vt:$vdata_in))], 1
+ >;
- def _RTN_OFFSET : MUBUFAtomicOffset <
- op, (outs rc:$vdata),
+ defm _RTN_OFFSET : MUBUFAtomicOffset_m <
+ op, name#"_rtn_offset", (outs rc:$vdata),
(ins rc:$vdata_in, SReg_128:$srsrc, mbuf_offset:$offset,
- SSrc_32:$soffset, slc:$slc),
+ SCSrc_32:$soffset, slc:$slc),
name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc",
[(set vt:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
- i1:$slc), vt:$vdata_in))]
- >, MUBUFAddr64Table<0, "_RTN">, AtomicNoRet<NAME#"_OFFSET", 1>;
+ i1:$slc), vt:$vdata_in))], 1
+ >;
} // glc = 1
} // mayStore = 1, mayLoad = 1, hasPostISelHook = 1
}
-multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
+multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, // one defm per addressing mode: _OFFSET, _OFFEN, _IDXEN, _BOTHEN, _ADDR64
ValueType load_vt = i32,
SDPatternOperator ld = null_frag> {
- let lds = 0, mayLoad = 1 in {
+ let mayLoad = 1, mayStore = 0 in {
+ let offen = 0, idxen = 0, vaddr = 0 in { // no $vaddr operand in this form; the field is pinned to 0
+ defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata),
+ (ins SReg_128:$srsrc,
+ mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc,
+ slc:$slc, tfe:$tfe),
+ name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+ [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
+ i32:$soffset, i16:$offset,
+ i1:$glc, i1:$slc, i1:$tfe)))]>;
+ }
- let addr64 = 0 in {
+ let offen = 1, idxen = 0 in { // 32-bit $vaddr, asm tagged "offen"; no selection pattern
+ defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VGPR_32:$vaddr,
+ SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc,
+ tfe:$tfe),
+ name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ }
+
+ let offen = 0, idxen = 1 in { // 32-bit $vaddr, asm tagged "idxen"; no selection pattern
+ defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VGPR_32:$vaddr,
+ mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc,
+ slc:$slc, tfe:$tfe),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ }
- let offen = 0, idxen = 0, vaddr = 0 in {
- def _OFFSET : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc,
- mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc,
- slc:$slc, tfe:$tfe),
- asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
- [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
- i32:$soffset, i16:$offset,
- i1:$glc, i1:$slc, i1:$tfe)))]>,
- MUBUFAddr64Table<0>;
- }
-
- let offen = 1, idxen = 0 in {
- def _OFFEN : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc, VReg_32:$vaddr,
- SSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc,
- tfe:$tfe),
- asm#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
- }
-
- let offen = 0, idxen = 1 in {
- def _IDXEN : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc, VReg_32:$vaddr,
- mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc,
- slc:$slc, tfe:$tfe),
- asm#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
- }
-
- let offen = 1, idxen = 1 in {
- def _BOTHEN : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc, VReg_64:$vaddr,
- SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
- asm#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>;
- }
+ let offen = 1, idxen = 1 in { // 64-bit $vaddr; this form now also takes $offset
+ defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_64:$vaddr,
+ SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
}
- let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in {
- def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
- asm#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
+ let offen = 0, idxen = 0, glc = 0, slc = 0, tfe = 0 in { // addr64 itself is set inside MUBUFAddr64_m; $soffset is now a real operand
+ defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_64:$vaddr,
+ SCSrc_32:$soffset, mbuf_offset:$offset),
+ name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset",
[(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
- i64:$vaddr, i16:$offset)))]>, MUBUFAddr64Table<1>;
+ i64:$vaddr, i32:$soffset,
+ i16:$offset)))]>;
}
}
}
-multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
+multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, // generic form plus _OFFSET, _OFFEN and _ADDR64 store variants
ValueType store_vt, SDPatternOperator st> {
-
- let addr64 = 0, lds = 0 in {
-
- def "" : MUBUF <
- op, (outs),
- (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
- mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc,
- tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"#
- "$glc"#"$slc"#"$tfe",
- []
- >;
+ let mayLoad = 0, mayStore = 1 in { // now wraps every variant, including _ADDR64
+ defm : MUBUF_m <op, name, (outs), // generic form: offen/idxen are explicit operands; no selection pattern
+ (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset,
+ mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc,
+ tfe:$tfe),
+ name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"#
+ "$glc"#"$slc"#"$tfe", []>;
let offen = 0, idxen = 0, vaddr = 0 in {
- def _OFFSET : MUBUF <
- op, (outs),
- (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
- SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
- [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc,
- i1:$tfe))]
- >, MUBUFAddr64Table<0>;
+ defm _OFFSET : MUBUF_m <op, name#"_offset",(outs),
+ (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
+ SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+ [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>;
} // offen = 0, idxen = 0, vaddr = 0
let offen = 1, idxen = 0 in {
- def _OFFEN : MUBUF <
- op, (outs),
- (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
- mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
- name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#
- "$glc"#"$slc"#"$tfe",
- []
- >;
+ defm _OFFEN : MUBUF_m <op, name#"_offen", (outs),
+ (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset,
+ mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#
+ "$glc"#"$slc"#"$tfe", []>;
} // end offen = 1, idxen = 0
- } // End addr64 = 0, lds = 0
-
- def _ADDR64 : MUBUF <
- op, (outs),
- (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
- name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
- [(st store_vt:$vdata,
- (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>, MUBUFAddr64Table<1>
- {
-
- let mayLoad = 0;
- let mayStore = 1;
-
- // Encoding
- let offen = 0;
- let idxen = 0;
- let glc = 0;
- let addr64 = 1;
- let lds = 0;
- let slc = 0;
- let tfe = 0;
- let soffset = 128; // ZERO
- }
+ let offen = 0, idxen = 0, glc = 0, slc = 0, tfe = 0 in { // addr64 itself is set inside MUBUFAddr64_m; $soffset is now a real operand
+ defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs),
+ (ins vdataClass:$vdata, SReg_128:$srsrc,
+ VReg_64:$vaddr, SCSrc_32:$soffset,
+ mbuf_offset:$offset),
+ name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset",
+ [(st store_vt:$vdata,
+ (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr,
+ i32:$soffset, i16:$offset))]>;
+ }
+ } // End mayLoad = 0, mayStore = 1
}
class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> :
- FLAT <op, (outs regClass:$data),
+ FLAT <op, (outs regClass:$vdst), // the loaded value is now named $vdst instead of $data
(ins VReg_64:$addr),
- asm#" $data, $addr, [M0, FLAT_SCRATCH]", []> {
+ asm#" $vdst, $addr, [M0, FLAT_SCRATCH]", []> {
let glc = 0;
let slc = 0;
let tfe = 0;
+ let data = 0; // no $data operand is left in outs/ins, so the field is fixed to 0
let mayLoad = 1;
}
@@ -1321,6 +2016,7 @@ class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
let glc = 0;
let slc = 0;
let tfe = 0;
+ let vdst = 0;
}
class MIMG_Mask <string op, int channels> {
@@ -1339,7 +2035,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm,
asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
#" $tfe, $lwe, $slc, $vaddr, $srsrc",
[]> {
- let SSAMP = 0;
+ let ssamp = 0;
let mayLoad = 1;
let mayStore = 0;
let hasPostISelHook = 1;
@@ -1348,7 +2044,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm,
multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
int channels> {
- def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_32>,
+ def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32>,
MIMG_Mask<asm#"_V1", channels>;
def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
MIMG_Mask<asm#"_V2", channels>;
@@ -1357,7 +2053,7 @@ multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
}
multiclass MIMG_NoSampler <bits<7> op, string asm> {
- defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VReg_32, 1>;
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
@@ -1365,7 +2061,7 @@ multiclass MIMG_NoSampler <bits<7> op, string asm> {
class MIMG_Sampler_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
- RegisterClass src_rc> : MIMG <
+ RegisterClass src_rc, int wqm> : MIMG <
op,
(outs dst_rc:$vdata),
(ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
@@ -1377,33 +2073,41 @@ class MIMG_Sampler_Helper <bits<7> op, string asm,
let mayLoad = 1;
let mayStore = 0;
let hasPostISelHook = 1;
+ let WQM = wqm;
}
multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
- int channels> {
- def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_32>,
+ int channels, int wqm> { // wqm is forwarded unchanged to every MIMG_Sampler_Helper below
+ def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm>,
MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64>,
+ def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>,
MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128>,
+ def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>,
MIMG_Mask<asm#"_V4", channels>;
- def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256>,
+ def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>,
MIMG_Mask<asm#"_V8", channels>;
- def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512>,
+ def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>,
MIMG_Mask<asm#"_V16", channels>;
}
multiclass MIMG_Sampler <bits<7> op, string asm> {
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VReg_32, 1>;
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2>;
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3>;
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>;
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 0>; // trailing 0 = wqm off; the WQM = 1 twin is MIMG_Sampler_WQM
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 0>;
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 0>;
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 0>;
+}
+
+multiclass MIMG_Sampler_WQM <bits<7> op, string asm> { // same four destination widths as MIMG_Sampler, but passes wqm = 1
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 1>;
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 1>;
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 1>;
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 1>;
}
class MIMG_Gather_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
- RegisterClass src_rc> : MIMG <
+ RegisterClass src_rc, int wqm> : MIMG <
op,
(outs dst_rc:$vdata),
(ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
@@ -1424,28 +2128,36 @@ class MIMG_Gather_Helper <bits<7> op, string asm,
// Therefore, disable all code which updates DMASK by setting these two:
let MIMG = 0;
let hasPostISelHook = 0;
+ let WQM = wqm;
}
multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
- int channels> {
- def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_32>,
+ int channels, int wqm> { // wqm is forwarded unchanged to every MIMG_Gather_Helper below
+ def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>,
MIMG_Mask<asm#"_V1", channels>;
- def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>,
+ def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>,
MIMG_Mask<asm#"_V2", channels>;
- def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128>,
+ def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>,
MIMG_Mask<asm#"_V4", channels>;
- def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256>,
+ def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>,
MIMG_Mask<asm#"_V8", channels>;
- def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512>,
+ def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>,
MIMG_Mask<asm#"_V16", channels>;
}
multiclass MIMG_Gather <bits<7> op, string asm> {
- defm _V1 : MIMG_Gather_Src_Helper<op, asm, VReg_32, 1>;
- defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>;
- defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>;
- defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>;
+ defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 0>; // trailing 0 = wqm off; the WQM = 1 twin is MIMG_Gather_WQM
+ defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 0>;
+ defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 0>;
+ defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 0>;
+}
+
+multiclass MIMG_Gather_WQM <bits<7> op, string asm> { // same four destination widths as MIMG_Gather, but passes wqm = 1
+ defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 1>;
+ defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 1>;
+ defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 1>;
+ defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 1>;
}
//===----------------------------------------------------------------------===//
@@ -1496,20 +2208,12 @@ def getCommuteOrig : InstrMapping {
let ValueCols = [["1"]];
}
-def isDS : InstrMapping {
- let FilterClass = "DS";
- let RowFields = ["Inst"];
- let ColFields = ["Size"];
- let KeyCol = ["8"];
- let ValueCols = [["8"]];
-}
-
-def getMCOpcode : InstrMapping {
+def getMCOpcodeGen : InstrMapping { // maps a SIMCInstr keyed on SISubtarget.NONE to the defs sharing its PseudoInstr
let FilterClass = "SIMCInstr";
let RowFields = ["PseudoInstr"];
let ColFields = ["Subtarget"];
let KeyCol = [!cast<string>(SISubtarget.NONE)];
- let ValueCols = [[!cast<string>(SISubtarget.SI)]];
+ let ValueCols = [[!cast<string>(SISubtarget.SI)],[!cast<string>(SISubtarget.VI)]]; // two value columns now: SI first, then VI
}
def getAddr64Inst : InstrMapping {
@@ -1539,3 +2243,5 @@ def getAtomicNoRetOp : InstrMapping {
}
include "SIInstructions.td"
+include "CIInstructions.td"
+include "VIInstructions.td"
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 90da7a9..4f72e99 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -26,11 +26,18 @@ def SendMsgImm : Operand<i32> {
let PrintMethod = "printSendMsg";
}
-def isSI : Predicate<"Subtarget.getGeneration() "
+def isGCN : Predicate<"Subtarget->getGeneration() "
">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
-
-def isCI : Predicate<"Subtarget.getGeneration() "
+def isSICI : Predicate< // exactly SOUTHERN_ISLANDS or SEA_ISLANDS
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
+>;
+def isCI : Predicate<"Subtarget->getGeneration() "
">= AMDGPUSubtarget::SEA_ISLANDS">;
+def isVI : Predicate < // VOLCANIC_ISLANDS or later
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS"
+>;
+
-def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">;
+def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; // uses -> like the generation predicates above
def SWaitMatchClass : AsmOperandClass {
@@ -43,7 +50,7 @@ def WAIT_FLAG : InstFlag<"printWaitFlag"> {
let ParserMatchClass = SWaitMatchClass;
}
-let SubtargetPredicate = isSI in {
+let SubtargetPredicate = isGCN in {
//===----------------------------------------------------------------------===//
// EXP Instructions
@@ -96,90 +103,99 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
//===----------------------------------------------------------------------===//
let isMoveImm = 1 in {
-def S_MOV_B32 : SOP1_32 <0x00000003, "s_mov_b32", []>;
-def S_MOV_B64 : SOP1_64 <0x00000004, "s_mov_b64", []>;
-def S_CMOV_B32 : SOP1_32 <0x00000005, "s_cmov_b32", []>;
-def S_CMOV_B64 : SOP1_64 <0x00000006, "s_cmov_b64", []>;
+ let isReMaterializable = 1 in { // only the unconditional moves are marked rematerializable
+ defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>; // NOTE(review): sop1<a, b> presumably pairs the SI and VI opcodes - confirm against the sop1 class
+ defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>;
+ } // End isReMaterializable = 1
+
+ let Uses = [SCC] in {
+ defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>;
+ defm S_CMOV_B64 : SOP1_64 <sop1<0x06, 0x03>, "s_cmov_b64", []>;
+ } // End Uses = [SCC]
} // End isMoveImm = 1
-def S_NOT_B32 : SOP1_32 <0x00000007, "s_not_b32",
- [(set i32:$dst, (not i32:$src0))]
->;
+let Defs = [SCC] in { // s_not_* and s_wqm_* are now declared as writing SCC
+ defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32",
+ [(set i32:$dst, (not i32:$src0))]
+ >;
-def S_NOT_B64 : SOP1_64 <0x00000008, "s_not_b64",
- [(set i64:$dst, (not i64:$src0))]
->;
-def S_WQM_B32 : SOP1_32 <0x00000009, "s_wqm_b32", []>;
-def S_WQM_B64 : SOP1_64 <0x0000000a, "s_wqm_b64", []>;
-def S_BREV_B32 : SOP1_32 <0x0000000b, "s_brev_b32",
+ defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64",
+ [(set i64:$dst, (not i64:$src0))]
+ >;
+ defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>;
+ defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>;
+} // End Defs = [SCC]
+
+
+defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",
[(set i32:$dst, (AMDGPUbrev i32:$src0))]
>;
-def S_BREV_B64 : SOP1_64 <0x0000000c, "s_brev_b64", []>;
+defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;
-////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "s_bcnt0_i32_b32", []>;
-////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "s_bcnt0_i32_b64", []>;
-def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "s_bcnt1_i32_b32",
- [(set i32:$dst, (ctpop i32:$src0))]
->;
-def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "s_bcnt1_i32_b64", []>;
+let Defs = [SCC] in { // all four bit-count defs are declared as writing SCC
+ defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>; // the bcnt0 forms were commented out before this patch
+ defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>;
+ defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32",
+ [(set i32:$dst, (ctpop i32:$src0))]
+ >;
+ defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>;
+} // End Defs = [SCC]
-////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "s_ff0_i32_b32", []>;
-////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "s_ff0_i32_b64", []>;
-def S_FF1_I32_B32 : SOP1_32 <0x00000013, "s_ff1_i32_b32",
+defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>;
+defm S_FF0_I32_B64 : SOP1_32_64 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>;
+defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32",
[(set i32:$dst, (cttz_zero_undef i32:$src0))]
>;
-////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "s_ff1_i32_b64", []>;
+defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>;
-def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "s_flbit_i32_b32",
+defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32",
[(set i32:$dst, (ctlz_zero_undef i32:$src0))]
>;
-//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "s_flbit_i32_b64", []>;
-def S_FLBIT_I32 : SOP1_32 <0x00000017, "s_flbit_i32", []>;
-//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "s_flbit_i32_i64", []>;
-def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "s_sext_i32_i8",
+defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
+defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", []>;
+defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>;
+defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8",
[(set i32:$dst, (sext_inreg i32:$src0, i8))]
>;
-def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "s_sext_i32_i16",
+defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16",
[(set i32:$dst, (sext_inreg i32:$src0, i16))]
>;
-////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "s_bitset0_b32", []>;
-////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "s_bitset0_b64", []>;
-////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "s_bitset1_b32", []>;
-////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "s_bitset1_b64", []>;
-def S_GETPC_B64 : SOP1 <
- 0x0000001f, (outs SReg_64:$dst), (ins), "s_getpc_b64 $dst", []
-> {
- let SSRC0 = 0;
-}
-def S_SETPC_B64 : SOP1_64 <0x00000020, "s_setpc_b64", []>;
-def S_SWAPPC_B64 : SOP1_64 <0x00000021, "s_swappc_b64", []>;
-def S_RFE_B64 : SOP1_64 <0x00000022, "s_rfe_b64", []>;
-
-let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in {
-
-def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "s_and_saveexec_b64", []>;
-def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "s_or_saveexec_b64", []>;
-def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "s_xor_saveexec_b64", []>;
-def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "s_andn2_saveexec_b64", []>;
-def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "s_orn2_saveexec_b64", []>;
-def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "s_nand_saveexec_b64", []>;
-def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "s_nor_saveexec_b64", []>;
-def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "s_xnor_saveexec_b64", []>;
-
-} // End hasSideEffects = 1
-
-def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "s_quadmask_b32", []>;
-def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "s_quadmask_b64", []>;
-def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "s_movrels_b32", []>;
-def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "s_movrels_b64", []>;
-def S_MOVRELD_B32 : SOP1_32 <0x00000030, "s_movreld_b32", []>;
-def S_MOVRELD_B64 : SOP1_64 <0x00000031, "s_movreld_b64", []>;
-//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "s_cbranch_join", []>;
-def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "s_mov_regrd_b32", []>;
-def S_ABS_I32 : SOP1_32 <0x00000034, "s_abs_i32", []>;
-def S_MOV_FED_B32 : SOP1_32 <0x00000035, "s_mov_fed_b32", []>;
+defm S_BITSET0_B32 : SOP1_32 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>;
+defm S_BITSET0_B64 : SOP1_64 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>;
+defm S_BITSET1_B32 : SOP1_32 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>;
+defm S_BITSET1_B64 : SOP1_64 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>;
+defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>;
+defm S_SETPC_B64 : SOP1_64 <sop1<0x20, 0x1d>, "s_setpc_b64", []>;
+defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>;
+defm S_RFE_B64 : SOP1_64 <sop1<0x22, 0x1f>, "s_rfe_b64", []>;
+
+let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { // SCC is added to Defs; previously only EXEC was listed
+
+defm S_AND_SAVEEXEC_B64 : SOP1_64 <sop1<0x24, 0x20>, "s_and_saveexec_b64", []>; // first value skips 0x23 (0x22 -> 0x24), second stays consecutive (0x1f -> 0x20)
+defm S_OR_SAVEEXEC_B64 : SOP1_64 <sop1<0x25, 0x21>, "s_or_saveexec_b64", []>;
+defm S_XOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x26, 0x22>, "s_xor_saveexec_b64", []>;
+defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x27, 0x23>, "s_andn2_saveexec_b64", []>;
+defm S_ORN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x28, 0x24>, "s_orn2_saveexec_b64", []>;
+defm S_NAND_SAVEEXEC_B64 : SOP1_64 <sop1<0x29, 0x25>, "s_nand_saveexec_b64", []>;
+defm S_NOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2a, 0x26>, "s_nor_saveexec_b64", []>;
+defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []>;
+
+} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC]
+
+defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>;
+defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>;
+defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>;
+defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>;
+defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>;
+defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>;
+defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>; // previously commented out; now defined through SOP1_1
+defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>;
+let Defs = [SCC] in { // s_abs_i32 is the only def in this run declared as writing SCC
+ defm S_ABS_I32 : SOP1_32 <sop1<0x34, 0x30>, "s_abs_i32", []>;
+} // End Defs = [SCC]
+defm S_MOV_FED_B32 : SOP1_32 <sop1<0x35, 0x31>, "s_mov_fed_b32", []>;
//===----------------------------------------------------------------------===//
// SOP2 Instructions
@@ -187,119 +203,132 @@ def S_MOV_FED_B32 : SOP1_32 <0x00000035, "s_mov_fed_b32", []>;
let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
-def S_ADD_U32 : SOP2_32 <0x00000000, "s_add_u32", []>;
-def S_ADD_I32 : SOP2_32 <0x00000002, "s_add_i32",
+defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>; // single-argument sop2<>: only one opcode value is given for these defs
+defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32",
[(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
>;
} // End isCommutable = 1
-def S_SUB_U32 : SOP2_32 <0x00000001, "s_sub_u32", []>;
-def S_SUB_I32 : SOP2_32 <0x00000003, "s_sub_i32",
+defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>;
+defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32",
[(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
>;
let Uses = [SCC] in { // Carry in comes from SCC
let isCommutable = 1 in {
-def S_ADDC_U32 : SOP2_32 <0x00000004, "s_addc_u32",
+defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32",
[(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
} // End isCommutable = 1
-def S_SUBB_U32 : SOP2_32 <0x00000005, "s_subb_u32",
+defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32",
[(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
} // End Uses = [SCC]
-} // End Defs = [SCC]
-def S_MIN_I32 : SOP2_32 <0x00000006, "s_min_i32",
+defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32", // min/max now sit inside Defs = [SCC]: the closing brace moved below S_MAX_U32
[(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
>;
-def S_MIN_U32 : SOP2_32 <0x00000007, "s_min_u32",
+defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32",
[(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
>;
-def S_MAX_I32 : SOP2_32 <0x00000008, "s_max_i32",
+defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32",
[(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
>;
-def S_MAX_U32 : SOP2_32 <0x00000009, "s_max_u32",
+defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32",
[(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
>;
+} // End Defs = [SCC]
-def S_CSELECT_B32 : SOP2_SELECT_32 <
- 0x0000000a, "s_cselect_b32",
- []
->;
+defm S_CSELECT_B32 : SOP2_SELECT_32 <sop2<0x0a>, "s_cselect_b32", []>; // NOTE(review): not wrapped in Uses = [SCC] like the B64 form; presumably SOP2_SELECT_32 models the SCC read itself - confirm
-def S_CSELECT_B64 : SOP2_64 <0x0000000b, "s_cselect_b64", []>;
+let Uses = [SCC] in {
+ defm S_CSELECT_B64 : SOP2_64 <sop2<0x0b>, "s_cselect_b64", []>;
+} // End Uses = [SCC]
-def S_AND_B32 : SOP2_32 <0x0000000e, "s_and_b32",
+let Defs = [SCC] in { // all bitwise SOP2 defs below are declared as writing SCC
+defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32", // from here on sop2<> takes two values, the second being two lower than the first
[(set i32:$dst, (and i32:$src0, i32:$src1))]
>;
-def S_AND_B64 : SOP2_64 <0x0000000f, "s_and_b64",
+defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64",
[(set i64:$dst, (and i64:$src0, i64:$src1))]
>;
-def S_OR_B32 : SOP2_32 <0x00000010, "s_or_b32",
+defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32",
[(set i32:$dst, (or i32:$src0, i32:$src1))]
>;
-def S_OR_B64 : SOP2_64 <0x00000011, "s_or_b64",
+defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64",
[(set i64:$dst, (or i64:$src0, i64:$src1))]
>;
-def S_XOR_B32 : SOP2_32 <0x00000012, "s_xor_b32",
+defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32",
[(set i32:$dst, (xor i32:$src0, i32:$src1))]
>;
-def S_XOR_B64 : SOP2_64 <0x00000013, "s_xor_b64",
+defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64",
[(set i64:$dst, (xor i64:$src0, i64:$src1))]
>;
-def S_ANDN2_B32 : SOP2_32 <0x00000014, "s_andn2_b32", []>;
-def S_ANDN2_B64 : SOP2_64 <0x00000015, "s_andn2_b64", []>;
-def S_ORN2_B32 : SOP2_32 <0x00000016, "s_orn2_b32", []>;
-def S_ORN2_B64 : SOP2_64 <0x00000017, "s_orn2_b64", []>;
-def S_NAND_B32 : SOP2_32 <0x00000018, "s_nand_b32", []>;
-def S_NAND_B64 : SOP2_64 <0x00000019, "s_nand_b64", []>;
-def S_NOR_B32 : SOP2_32 <0x0000001a, "s_nor_b32", []>;
-def S_NOR_B64 : SOP2_64 <0x0000001b, "s_nor_b64", []>;
-def S_XNOR_B32 : SOP2_32 <0x0000001c, "s_xnor_b32", []>;
-def S_XNOR_B64 : SOP2_64 <0x0000001d, "s_xnor_b64", []>;
+defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>;
+defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>;
+defm S_ORN2_B32 : SOP2_32 <sop2<0x16, 0x14>, "s_orn2_b32", []>;
+defm S_ORN2_B64 : SOP2_64 <sop2<0x17, 0x15>, "s_orn2_b64", []>;
+defm S_NAND_B32 : SOP2_32 <sop2<0x18, 0x16>, "s_nand_b32", []>;
+defm S_NAND_B64 : SOP2_64 <sop2<0x19, 0x17>, "s_nand_b64", []>;
+defm S_NOR_B32 : SOP2_32 <sop2<0x1a, 0x18>, "s_nor_b32", []>;
+defm S_NOR_B64 : SOP2_64 <sop2<0x1b, 0x19>, "s_nor_b64", []>;
+defm S_XNOR_B32 : SOP2_32 <sop2<0x1c, 0x1a>, "s_xnor_b32", []>;
+defm S_XNOR_B64 : SOP2_64 <sop2<0x1d, 0x1b>, "s_xnor_b64", []>;
+} // End Defs = [SCC]
// Use added complexity so these patterns are preferred to the VALU patterns.
let AddedComplexity = 1 in {
+let Defs = [SCC] in {
-def S_LSHL_B32 : SOP2_32 <0x0000001e, "s_lshl_b32",
+defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32",
[(set i32:$dst, (shl i32:$src0, i32:$src1))]
>;
-def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "s_lshl_b64",
+defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64",
[(set i64:$dst, (shl i64:$src0, i32:$src1))]
>;
-def S_LSHR_B32 : SOP2_32 <0x00000020, "s_lshr_b32",
+defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32",
[(set i32:$dst, (srl i32:$src0, i32:$src1))]
>;
-def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "s_lshr_b64",
+defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64",
[(set i64:$dst, (srl i64:$src0, i32:$src1))]
>;
-def S_ASHR_I32 : SOP2_32 <0x00000022, "s_ashr_i32",
+defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32",
[(set i32:$dst, (sra i32:$src0, i32:$src1))]
>;
-def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "s_ashr_i64",
+defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64",
[(set i64:$dst, (sra i64:$src0, i32:$src1))]
>;
+} // End Defs = [SCC]
-
-def S_BFM_B32 : SOP2_32 <0x00000024, "s_bfm_b32", []>;
-def S_BFM_B64 : SOP2_64 <0x00000025, "s_bfm_b64", []>;
-def S_MUL_I32 : SOP2_32 <0x00000026, "s_mul_i32",
+defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32", []>;
+defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>;
+defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32",
[(set i32:$dst, (mul i32:$src0, i32:$src1))]
>;
} // End AddedComplexity = 1
-def S_BFE_U32 : SOP2_32 <0x00000027, "s_bfe_u32", []>;
-def S_BFE_I32 : SOP2_32 <0x00000028, "s_bfe_i32", []>;
-def S_BFE_U64 : SOP2_64 <0x00000029, "s_bfe_u64", []>;
-def S_BFE_I64 : SOP2_64_32 <0x0000002a, "s_bfe_i64", []>;
-//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "s_cbranch_g_fork", []>;
-def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "s_absdiff_i32", []>;
+let Defs = [SCC] in {
+defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>;
+defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>;
+defm S_BFE_U64 : SOP2_64 <sop2<0x29, 0x27>, "s_bfe_u64", []>;
+defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>;
+} // End Defs = [SCC]
+
+let sdst = 0 in { // No result operand ((outs) is empty); sdst is pinned to 0.
+defm S_CBRANCH_G_FORK : SOP2_m <
+ sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs),
+ (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", []
+>;
+} // End sdst = 0
+
+let Defs = [SCC] in {
+defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>;
+} // End Defs = [SCC]
//===----------------------------------------------------------------------===//
// SOPC Instructions
@@ -328,9 +357,13 @@ def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">;
//===----------------------------------------------------------------------===//
let isReMaterializable = 1 in {
-def S_MOVK_I32 : SOPK_32 <0x00000000, "s_movk_i32", []>;
+defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>;
} // End isReMaterializable = 1
-def S_CMOVK_I32 : SOPK_32 <0x00000002, "s_cmovk_i32", []>;
+let Uses = [SCC] in {
+ defm S_CMOVK_I32 : SOPK_32 <sopk<0x02, 0x01>, "s_cmovk_i32", []>;
+} // End Uses = [SCC]
+
+let isCompare = 1 in {
/*
This instruction is disabled for now until we can figure out how to teach
@@ -344,38 +377,36 @@ SCC = S_CMPK_EQ_I32 SGPR0, imm
VCC = COPY SCC
VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
-def S_CMPK_EQ_I32 : SOPK <
- 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1),
- "s_cmpk_eq_i32",
+defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32",
[(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
>;
*/
-let isCompare = 1, Defs = [SCC] in {
-def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "s_cmpk_lg_i32", []>;
-def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "s_cmpk_gt_i32", []>;
-def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "s_cmpk_ge_i32", []>;
-def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "s_cmpk_lt_i32", []>;
-def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "s_cmpk_le_i32", []>;
-def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "s_cmpk_eq_u32", []>;
-def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "s_cmpk_lg_u32", []>;
-def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "s_cmpk_gt_u32", []>;
-def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "s_cmpk_ge_u32", []>;
-def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "s_cmpk_lt_u32", []>;
-def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "s_cmpk_le_u32", []>;
-} // End isCompare = 1, Defs = [SCC]
-
-let Defs = [SCC], isCommutable = 1 in {
- def S_ADDK_I32 : SOPK_32 <0x0000000f, "s_addk_i32", []>;
- def S_MULK_I32 : SOPK_32 <0x00000010, "s_mulk_i32", []>;
+defm S_CMPK_LG_I32 : SOPK_SCC <sopk<0x04, 0x03>, "s_cmpk_lg_i32", []>;
+defm S_CMPK_GT_I32 : SOPK_SCC <sopk<0x05, 0x04>, "s_cmpk_gt_i32", []>;
+defm S_CMPK_GE_I32 : SOPK_SCC <sopk<0x06, 0x05>, "s_cmpk_ge_i32", []>;
+defm S_CMPK_LT_I32 : SOPK_SCC <sopk<0x07, 0x06>, "s_cmpk_lt_i32", []>;
+defm S_CMPK_LE_I32 : SOPK_SCC <sopk<0x08, 0x07>, "s_cmpk_le_i32", []>;
+defm S_CMPK_EQ_U32 : SOPK_SCC <sopk<0x09, 0x08>, "s_cmpk_eq_u32", []>;
+defm S_CMPK_LG_U32 : SOPK_SCC <sopk<0x0a, 0x09>, "s_cmpk_lg_u32", []>;
+defm S_CMPK_GT_U32 : SOPK_SCC <sopk<0x0b, 0x0a>, "s_cmpk_gt_u32", []>;
+defm S_CMPK_GE_U32 : SOPK_SCC <sopk<0x0c, 0x0b>, "s_cmpk_ge_u32", []>;
+defm S_CMPK_LT_U32 : SOPK_SCC <sopk<0x0d, 0x0c>, "s_cmpk_lt_u32", []>;
+defm S_CMPK_LE_U32 : SOPK_SCC <sopk<0x0e, 0x0d>, "s_cmpk_le_u32", []>;
+} // End isCompare = 1
+
+let isCommutable = 1 in {
+ let Defs = [SCC] in { // isCommutable = 1 is inherited from the enclosing let
+ defm S_ADDK_I32 : SOPK_32 <sopk<0x0f, 0x0e>, "s_addk_i32", []>;
+ } // End Defs = [SCC]
+ defm S_MULK_I32 : SOPK_32 <sopk<0x10, 0x0f>, "s_mulk_i32", []>;
} // End isCommutable = 1
-//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "s_cbranch_i_fork", []>;
-def S_GETREG_B32 : SOPK_32 <0x00000012, "s_getreg_b32", []>;
-def S_SETREG_B32 : SOPK_32 <0x00000013, "s_setreg_b32", []>;
-def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "s_getreg_regrd_b32", []>;
-//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "s_setreg_imm32_b32", []>;
-//def EXP : EXP_ <0x00000000, "exp", []>;
+//defm S_CBRANCH_I_FORK : SOPK_ <sopk<0x11, 0x10>, "s_cbranch_i_fork", []>;
+defm S_GETREG_B32 : SOPK_32 <sopk<0x12, 0x11>, "s_getreg_b32", []>;
+defm S_SETREG_B32 : SOPK_32 <sopk<0x13, 0x12>, "s_setreg_b32", []>;
+defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>;
+//defm S_SETREG_IMM32_B32 : SOPK_32 <sopk<0x15, 0x14>, "s_setreg_imm32_b32", []>;
//===----------------------------------------------------------------------===//
// SOPP Instructions
@@ -476,82 +507,84 @@ def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
let isCompare = 1 in {
-defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0>, "v_cmp_f_f32">;
-defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1>, "v_cmp_lt_f32", COND_OLT>;
-defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2>, "v_cmp_eq_f32", COND_OEQ>;
-defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3>, "v_cmp_le_f32", COND_OLE>;
-defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4>, "v_cmp_gt_f32", COND_OGT>;
-defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5>, "v_cmp_lg_f32">;
-defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6>, "v_cmp_ge_f32", COND_OGE>;
-defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7>, "v_cmp_o_f32", COND_O>;
-defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8>, "v_cmp_u_f32", COND_UO>;
-defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9>, "v_cmp_nge_f32">;
-defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa>, "v_cmp_nlg_f32">;
-defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb>, "v_cmp_ngt_f32">;
-defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc>, "v_cmp_nle_f32">;
-defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd>, "v_cmp_neq_f32", COND_UNE>;
-defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe>, "v_cmp_nlt_f32">;
-defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf>, "v_cmp_tru_f32">;
+defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0, 0x40>, "v_cmp_f_f32">;
+defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT>;
+defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2, 0x42>, "v_cmp_eq_f32", COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE>;
+defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4, 0x44>, "v_cmp_gt_f32", COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5, 0x45>, "v_cmp_lg_f32", COND_ONE>;
+defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6, 0x46>, "v_cmp_ge_f32", COND_OGE>;
+defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7, 0x47>, "v_cmp_o_f32", COND_O>;
+defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8, 0x48>, "v_cmp_u_f32", COND_UO>;
+defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT>;
+defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa, 0x4a>, "v_cmp_nlg_f32", COND_UEQ>;
+defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE>;
+defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc, 0x4c>, "v_cmp_nle_f32", COND_UGT>;
+defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd, 0x4d>, "v_cmp_neq_f32", COND_UNE>;
+defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe, 0x4e>, "v_cmp_nlt_f32", COND_UGE>;
+defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf, 0x4f>, "v_cmp_tru_f32">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10>, "v_cmpx_f_f32">;
-defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11>, "v_cmpx_lt_f32">;
-defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12>, "v_cmpx_eq_f32">;
-defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13>, "v_cmpx_le_f32">;
-defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14>, "v_cmpx_gt_f32">;
-defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15>, "v_cmpx_lg_f32">;
-defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16>, "v_cmpx_ge_f32">;
-defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17>, "v_cmpx_o_f32">;
-defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18>, "v_cmpx_u_f32">;
-defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19>, "v_cmpx_nge_f32">;
-defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a>, "v_cmpx_nlg_f32">;
-defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b>, "v_cmpx_ngt_f32">;
-defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c>, "v_cmpx_nle_f32">;
-defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d>, "v_cmpx_neq_f32">;
-defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e>, "v_cmpx_nlt_f32">;
-defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f>, "v_cmpx_tru_f32">;
+defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10, 0x50>, "v_cmpx_f_f32">;
+defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32">;
+defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12, 0x52>, "v_cmpx_eq_f32">;
+defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32">;
+defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14, 0x54>, "v_cmpx_gt_f32">;
+defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15, 0x55>, "v_cmpx_lg_f32">;
+defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16, 0x56>, "v_cmpx_ge_f32">;
+defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17, 0x57>, "v_cmpx_o_f32">;
+defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18, 0x58>, "v_cmpx_u_f32">;
+defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19, 0x59>, "v_cmpx_nge_f32">;
+defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a, 0x5a>, "v_cmpx_nlg_f32">;
+defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b, 0x5b>, "v_cmpx_ngt_f32">;
+defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c, 0x5c>, "v_cmpx_nle_f32">;
+defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d, 0x5d>, "v_cmpx_neq_f32">;
+defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e, 0x5e>, "v_cmpx_nlt_f32">;
+defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f, 0x5f>, "v_cmpx_tru_f32">;
} // End hasSideEffects = 1
-defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20>, "v_cmp_f_f64">;
-defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21>, "v_cmp_lt_f64", COND_OLT>;
-defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22>, "v_cmp_eq_f64", COND_OEQ>;
-defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23>, "v_cmp_le_f64", COND_OLE>;
-defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24>, "v_cmp_gt_f64", COND_OGT>;
-defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25>, "v_cmp_lg_f64">;
-defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26>, "v_cmp_ge_f64", COND_OGE>;
-defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27>, "v_cmp_o_f64", COND_O>;
-defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28>, "v_cmp_u_f64", COND_UO>;
-defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29>, "v_cmp_nge_f64">;
-defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a>, "v_cmp_nlg_f64">;
-defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b>, "v_cmp_ngt_f64">;
-defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c>, "v_cmp_nle_f64">;
-defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d>, "v_cmp_neq_f64", COND_UNE>;
-defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e>, "v_cmp_nlt_f64">;
-defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f>, "v_cmp_tru_f64">;
+defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20, 0x60>, "v_cmp_f_f64">;
+defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT>;
+defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22, 0x62>, "v_cmp_eq_f64", COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE>;
+defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24, 0x64>, "v_cmp_gt_f64", COND_OGT>;
+defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25, 0x65>, "v_cmp_lg_f64", COND_ONE>;
+defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26, 0x66>, "v_cmp_ge_f64", COND_OGE>;
+defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27, 0x67>, "v_cmp_o_f64", COND_O>;
+defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28, 0x68>, "v_cmp_u_f64", COND_UO>;
+defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT>;
+defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a, 0x6a>, "v_cmp_nlg_f64", COND_UEQ>;
+defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE>;
+defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c, 0x6c>, "v_cmp_nle_f64", COND_UGT>;
+defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d, 0x6d>, "v_cmp_neq_f64", COND_UNE>;
+defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e, 0x6e>, "v_cmp_nlt_f64", COND_UGE>;
+defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f, 0x6f>, "v_cmp_tru_f64">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30>, "v_cmpx_f_f64">;
-defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31>, "v_cmpx_lt_f64">;
-defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32>, "v_cmpx_eq_f64">;
-defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33>, "v_cmpx_le_f64">;
-defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34>, "v_cmpx_gt_f64">;
-defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35>, "v_cmpx_lg_f64">;
-defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36>, "v_cmpx_ge_f64">;
-defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37>, "v_cmpx_o_f64">;
-defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38>, "v_cmpx_u_f64">;
-defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39>, "v_cmpx_nge_f64">;
-defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a>, "v_cmpx_nlg_f64">;
-defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b>, "v_cmpx_ngt_f64">;
-defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c>, "v_cmpx_nle_f64">;
-defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d>, "v_cmpx_neq_f64">;
-defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e>, "v_cmpx_nlt_f64">;
-defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f>, "v_cmpx_tru_f64">;
+defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30, 0x70>, "v_cmpx_f_f64">;
+defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64">;
+defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32, 0x72>, "v_cmpx_eq_f64">;
+defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64">;
+defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34, 0x74>, "v_cmpx_gt_f64">;
+defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35, 0x75>, "v_cmpx_lg_f64">;
+defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36, 0x76>, "v_cmpx_ge_f64">;
+defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37, 0x77>, "v_cmpx_o_f64">;
+defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38, 0x78>, "v_cmpx_u_f64">;
+defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64">;
+defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a, 0x7a>, "v_cmpx_nlg_f64">;
+defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64">;
+defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c, 0x7c>, "v_cmpx_nle_f64">;
+defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d, 0x7d>, "v_cmpx_neq_f64">;
+defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e, 0x7e>, "v_cmpx_nlt_f64">;
+defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f, 0x7f>, "v_cmpx_tru_f64">;
} // End hasSideEffects = 1
+let SubtargetPredicate = isSICI in {
+
defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">;
defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32">;
defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">;
@@ -628,104 +661,106 @@ defm V_CMPSX_TRU_F64 : VOPC_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">;
} // End hasSideEffects = 1, Defs = [EXEC]
-defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80>, "v_cmp_f_i32">;
-defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81>, "v_cmp_lt_i32", COND_SLT>;
-defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82>, "v_cmp_eq_i32", COND_EQ>;
-defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83>, "v_cmp_le_i32", COND_SLE>;
-defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84>, "v_cmp_gt_i32", COND_SGT>;
-defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85>, "v_cmp_ne_i32", COND_NE>;
-defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86>, "v_cmp_ge_i32", COND_SGE>;
-defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87>, "v_cmp_t_i32">;
+} // End SubtargetPredicate = isSICI
+
+defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80, 0xc0>, "v_cmp_f_i32">;
+defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT>;
+defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82, 0xc2>, "v_cmp_eq_i32", COND_EQ>;
+defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE>;
+defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84, 0xc4>, "v_cmp_gt_i32", COND_SGT>;
+defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85, 0xc5>, "v_cmp_ne_i32", COND_NE>;
+defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86, 0xc6>, "v_cmp_ge_i32", COND_SGE>;
+defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87, 0xc7>, "v_cmp_t_i32">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90>, "v_cmpx_f_i32">;
-defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91>, "v_cmpx_lt_i32">;
-defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92>, "v_cmpx_eq_i32">;
-defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93>, "v_cmpx_le_i32">;
-defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94>, "v_cmpx_gt_i32">;
-defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95>, "v_cmpx_ne_i32">;
-defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96>, "v_cmpx_ge_i32">;
-defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97>, "v_cmpx_t_i32">;
+defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90, 0xd0>, "v_cmpx_f_i32">;
+defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32">;
+defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92, 0xd2>, "v_cmpx_eq_i32">;
+defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32">;
+defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94, 0xd4>, "v_cmpx_gt_i32">;
+defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95, 0xd5>, "v_cmpx_ne_i32">;
+defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96, 0xd6>, "v_cmpx_ge_i32">;
+defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97, 0xd7>, "v_cmpx_t_i32">;
} // End hasSideEffects = 1
-defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0>, "v_cmp_f_i64">;
-defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1>, "v_cmp_lt_i64", COND_SLT>;
-defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2>, "v_cmp_eq_i64", COND_EQ>;
-defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3>, "v_cmp_le_i64", COND_SLE>;
-defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4>, "v_cmp_gt_i64", COND_SGT>;
-defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5>, "v_cmp_ne_i64", COND_NE>;
-defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6>, "v_cmp_ge_i64", COND_SGE>;
-defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7>, "v_cmp_t_i64">;
+defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0, 0xe0>, "v_cmp_f_i64">;
+defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT>;
+defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2, 0xe2>, "v_cmp_eq_i64", COND_EQ>;
+defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE>;
+defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4, 0xe4>, "v_cmp_gt_i64", COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5, 0xe5>, "v_cmp_ne_i64", COND_NE>;
+defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6, 0xe6>, "v_cmp_ge_i64", COND_SGE>;
+defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7, 0xe7>, "v_cmp_t_i64">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0>, "v_cmpx_f_i64">;
-defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1>, "v_cmpx_lt_i64">;
-defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2>, "v_cmpx_eq_i64">;
-defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3>, "v_cmpx_le_i64">;
-defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4>, "v_cmpx_gt_i64">;
-defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5>, "v_cmpx_ne_i64">;
-defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6>, "v_cmpx_ge_i64">;
-defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7>, "v_cmpx_t_i64">;
+defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0, 0xf0>, "v_cmpx_f_i64">;
+defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64">;
+defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2, 0xf2>, "v_cmpx_eq_i64">;
+defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64">;
+defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4, 0xf4>, "v_cmpx_gt_i64">;
+defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5, 0xf5>, "v_cmpx_ne_i64">;
+defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6, 0xf6>, "v_cmpx_ge_i64">;
+defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7, 0xf7>, "v_cmpx_t_i64">;
} // End hasSideEffects = 1
-defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0>, "v_cmp_f_u32">;
-defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1>, "v_cmp_lt_u32", COND_ULT>;
-defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2>, "v_cmp_eq_u32", COND_EQ>;
-defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3>, "v_cmp_le_u32", COND_ULE>;
-defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4>, "v_cmp_gt_u32", COND_UGT>;
-defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5>, "v_cmp_ne_u32", COND_NE>;
-defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6>, "v_cmp_ge_u32", COND_UGE>;
-defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7>, "v_cmp_t_u32">;
+defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0, 0xc8>, "v_cmp_f_u32">;
+defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT>;
+defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2, 0xca>, "v_cmp_eq_u32", COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE>;
+defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4, 0xcc>, "v_cmp_gt_u32", COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5, 0xcd>, "v_cmp_ne_u32", COND_NE>;
+defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6, 0xce>, "v_cmp_ge_u32", COND_UGE>;
+defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7, 0xcf>, "v_cmp_t_u32">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0>, "v_cmpx_f_u32">;
-defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1>, "v_cmpx_lt_u32">;
-defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2>, "v_cmpx_eq_u32">;
-defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3>, "v_cmpx_le_u32">;
-defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4>, "v_cmpx_gt_u32">;
-defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5>, "v_cmpx_ne_u32">;
-defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6>, "v_cmpx_ge_u32">;
-defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7>, "v_cmpx_t_u32">;
+defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0, 0xd8>, "v_cmpx_f_u32">;
+defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32">;
+defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2, 0xda>, "v_cmpx_eq_u32">;
+defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32">;
+defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4, 0xdc>, "v_cmpx_gt_u32">;
+defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5, 0xdd>, "v_cmpx_ne_u32">;
+defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6, 0xde>, "v_cmpx_ge_u32">;
+defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7, 0xdf>, "v_cmpx_t_u32">;
} // End hasSideEffects = 1
-defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0>, "v_cmp_f_u64">;
-defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1>, "v_cmp_lt_u64", COND_ULT>;
-defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2>, "v_cmp_eq_u64", COND_EQ>;
-defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3>, "v_cmp_le_u64", COND_ULE>;
-defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4>, "v_cmp_gt_u64", COND_UGT>;
-defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5>, "v_cmp_ne_u64", COND_NE>;
-defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6>, "v_cmp_ge_u64", COND_UGE>;
-defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7>, "v_cmp_t_u64">;
+defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0, 0xe8>, "v_cmp_f_u64">;
+defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT>;
+defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2, 0xea>, "v_cmp_eq_u64", COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE>;
+defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4, 0xec>, "v_cmp_gt_u64", COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5, 0xed>, "v_cmp_ne_u64", COND_NE>;
+defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6, 0xee>, "v_cmp_ge_u64", COND_UGE>;
+defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7, 0xef>, "v_cmp_t_u64">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0>, "v_cmpx_f_u64">;
-defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1>, "v_cmpx_lt_u64">;
-defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2>, "v_cmpx_eq_u64">;
-defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3>, "v_cmpx_le_u64">;
-defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4>, "v_cmpx_gt_u64">;
-defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5>, "v_cmpx_ne_u64">;
-defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6>, "v_cmpx_ge_u64">;
-defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7>, "v_cmpx_t_u64">;
+defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0, 0xf8>, "v_cmpx_f_u64">;
+defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64">;
+defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2, 0xfa>, "v_cmpx_eq_u64">;
+defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64">;
+defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4, 0xfc>, "v_cmpx_gt_u64">;
+defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5, 0xfd>, "v_cmpx_ne_u64">;
+defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6, 0xfe>, "v_cmpx_ge_u64">;
+defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7, 0xff>, "v_cmpx_t_u64">;
} // End hasSideEffects = 1
-defm V_CMP_CLASS_F32 : VOPC_F32 <vopc<0x88>, "v_cmp_class_f32">;
+defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <vopc<0x88, 0x10>, "v_cmp_class_f32">;
let hasSideEffects = 1 in {
-defm V_CMPX_CLASS_F32 : VOPCX_F32 <vopc<0x98>, "v_cmpx_class_f32">;
+defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <vopc<0x98, 0x11>, "v_cmpx_class_f32">;
} // End hasSideEffects = 1
-defm V_CMP_CLASS_F64 : VOPC_F64 <vopc<0xa8>, "v_cmp_class_f64">;
+defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <vopc<0xa8, 0x12>, "v_cmp_class_f64">;
let hasSideEffects = 1 in {
-defm V_CMPX_CLASS_F64 : VOPCX_F64 <vopc<0xb8>, "v_cmpx_class_f64">;
+defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">;
} // End hasSideEffects = 1
} // End isCompare = 1
@@ -735,88 +770,88 @@ defm V_CMPX_CLASS_F64 : VOPCX_F64 <vopc<0xb8>, "v_cmpx_class_f64">;
//===----------------------------------------------------------------------===//
-def DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VReg_32>;
-def DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VReg_32>;
-def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VReg_32>;
-def DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VReg_32>;
-def DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VReg_32>;
-def DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VReg_32>;
-def DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VReg_32>;
-def DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VReg_32>;
-def DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VReg_32>;
-def DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VReg_32>;
-def DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VReg_32>;
-def DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VReg_32>;
-def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VReg_32>;
-def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VReg_32>;
-def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VReg_32>;
-def DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VReg_32>;
-def DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VReg_32>;
-
-def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VReg_32, "ds_add_u32">;
-def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VReg_32, "ds_sub_u32">;
-def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VReg_32, "ds_rsub_u32">;
-def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VReg_32, "ds_inc_u32">;
-def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VReg_32, "ds_dec_u32">;
-def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VReg_32, "ds_min_i32">;
-def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VReg_32, "ds_max_i32">;
-def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VReg_32, "ds_min_u32">;
-def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VReg_32, "ds_max_u32">;
-def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VReg_32, "ds_and_b32">;
-def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VReg_32, "ds_or_b32">;
-def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VReg_32, "ds_xor_b32">;
-def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VReg_32, "ds_mskor_b32">;
-def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VReg_32>;
-//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VReg_32, "ds_wrxchg2_b32">;
-//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VReg_32, "ds_wrxchg2st64_b32">;
-def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VReg_32, "ds_cmpst_b32">;
-def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VReg_32, "ds_cmpst_f32">;
-def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VReg_32, "ds_min_f32">;
-def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VReg_32, "ds_max_f32">;
+defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>;
+defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>;
+defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>;
+defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>;
+defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>;
+defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>;
+defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>;
+defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>;
+defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>;
+defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>;
+defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>;
+defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>;
+defm DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VGPR_32>;
+defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>;
+defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>;
+defm DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VGPR_32>;
+defm DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VGPR_32>;
+
+defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">;
+defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">;
+defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">;
+defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">;
+defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">;
+defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">;
+defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">;
+defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">;
+defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">;
+defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">;
+defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">;
+defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">;
+defm DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
+defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>;
+//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2_b32">;
+//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2st64_rtn_b32", VGPR_32, "ds_wrxchg2st64_b32">;
+defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
+defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
+defm DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
+defm DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
let SubtargetPredicate = isCI in {
-def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VReg_32, "ds_wrap_f32">;
+defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">;
} // End isCI
-def DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>;
-def DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>;
-def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>;
-def DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>;
-def DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>;
-def DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>;
-def DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>;
-def DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>;
-def DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>;
-def DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>;
-def DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>;
-def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
-def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
-def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
-def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
-def DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>;
-def DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>;
-
-def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">;
-def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
-def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
-def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
-def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
-def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">;
-def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">;
-def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">;
-def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">;
-def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">;
-def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">;
-def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
-def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
-def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">;
+defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>;
+defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>;
+defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>;
+defm DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>;
+defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>;
+defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>;
+defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>;
+defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>;
+defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>;
+defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>;
+defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>;
+defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
+defm DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
+defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
+defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
+defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>;
+defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>;
+
+defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">;
+defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
+defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
+defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
+defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
+defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">;
+defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">;
+defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">;
+defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">;
+defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">;
+defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">;
+defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
+defm DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
+defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">;
//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2_b64">;
//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2st64_b64">;
-def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
-def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
-def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_f64", VReg_64, "ds_min_f64">;
-def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_f64", VReg_64, "ds_max_f64">;
+defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
+defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
+defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">;
+defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">;
//let SubtargetPredicate = isCI in {
// DS_CONDXCHG32_RTN_B64
@@ -825,139 +860,140 @@ def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_f64", VReg_64, "ds_max_f64">;
// TODO: _SRC2_* forms
-def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VReg_32>;
-def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "ds_write_b8", VReg_32>;
-def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "ds_write_b16", VReg_32>;
-def DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "ds_write_b64", VReg_64>;
+defm DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VGPR_32>;
+defm DS_WRITE_B8 : DS_Store_Helper <0x0000001e, "ds_write_b8", VGPR_32>;
+defm DS_WRITE_B16 : DS_Store_Helper <0x0000001f, "ds_write_b16", VGPR_32>;
+defm DS_WRITE_B64 : DS_Store_Helper <0x0000004d, "ds_write_b64", VReg_64>;
-def DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VReg_32>;
-def DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VReg_32>;
-def DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VReg_32>;
-def DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VReg_32>;
-def DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VReg_32>;
-def DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>;
+defm DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VGPR_32>;
+defm DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VGPR_32>;
+defm DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VGPR_32>;
+defm DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VGPR_32>;
+defm DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VGPR_32>;
+defm DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>;
// 2 forms.
-def DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VReg_32>;
-def DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VReg_32>;
-def DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>;
-def DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>;
+defm DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VGPR_32>;
+defm DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VGPR_32>;
+defm DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>;
+defm DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>;
-def DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>;
-def DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>;
-def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "ds_read2_b64", VReg_128>;
-def DS_READ2ST64_B64 : DS_Load2_Helper <0x00000076, "ds_read2st64_b64", VReg_128>;
+defm DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>;
+defm DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>;
+defm DS_READ2_B64 : DS_Load2_Helper <0x00000077, "ds_read2_b64", VReg_128>; // DS_READ_B64 is 0x76; read2 follows it, as 0x37 follows 0x36 for B32
+defm DS_READ2ST64_B64 : DS_Load2_Helper <0x00000078, "ds_read2st64_b64", VReg_128>; // must not reuse 0x76 (DS_READ_B64)
//===----------------------------------------------------------------------===//
// MUBUF Instructions
//===----------------------------------------------------------------------===//
-//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "buffer_load_format_x", []>;
-//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "buffer_load_format_xy", []>;
-//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "buffer_load_format_xyz", []>;
-defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "buffer_load_format_xyzw", VReg_128>;
-//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "buffer_store_format_x", []>;
-//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "buffer_store_format_xy", []>;
-//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "buffer_store_format_xyz", []>;
-//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "buffer_store_format_xyzw", []>;
+//def BUFFER_LOAD_FORMAT_X : MUBUF_ <mubuf<0x00>, "buffer_load_format_x", []>;
+//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <mubuf<0x01>, "buffer_load_format_xy", []>;
+//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <mubuf<0x02>, "buffer_load_format_xyz", []>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <mubuf<0x03>, "buffer_load_format_xyzw", VReg_128>;
+//def BUFFER_STORE_FORMAT_X : MUBUF_ <mubuf<0x04>, "buffer_store_format_x", []>;
+//def BUFFER_STORE_FORMAT_XY : MUBUF_ <mubuf<0x05>, "buffer_store_format_xy", []>;
+//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <mubuf<0x06>, "buffer_store_format_xyz", []>;
+//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <mubuf<0x07>, "buffer_store_format_xyzw", []>;
defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
- 0x00000008, "buffer_load_ubyte", VReg_32, i32, az_extloadi8_global
+ mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global
>;
defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
- 0x00000009, "buffer_load_sbyte", VReg_32, i32, sextloadi8_global
+ mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global
>;
defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
- 0x0000000a, "buffer_load_ushort", VReg_32, i32, az_extloadi16_global
+ mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global
>;
defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
- 0x0000000b, "buffer_load_sshort", VReg_32, i32, sextloadi16_global
+ mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
>;
defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
- 0x0000000c, "buffer_load_dword", VReg_32, i32, global_load
+ mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
- 0x0000000d, "buffer_load_dwordx2", VReg_64, v2i32, global_load
+ mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
- 0x0000000e, "buffer_load_dwordx4", VReg_128, v4i32, global_load
+ mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load
>;
defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
- 0x00000018, "buffer_store_byte", VReg_32, i32, truncstorei8_global
+ mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global
>;
defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
- 0x0000001a, "buffer_store_short", VReg_32, i32, truncstorei16_global
+ mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global
>;
defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
- 0x0000001c, "buffer_store_dword", VReg_32, i32, global_store
+ mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store
>;
defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
- 0x0000001d, "buffer_store_dwordx2", VReg_64, v2i32, global_store
+ mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store
>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
- 0x0000001e, "buffer_store_dwordx4", VReg_128, v4i32, global_store
+ mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store
>;
-//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "buffer_atomic_swap", []>;
+
defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
- 0x00000030, "buffer_atomic_swap", VReg_32, i32, atomic_swap_global
+ mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
>;
-//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "buffer_atomic_cmpswap", []>;
+//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", []>;
defm BUFFER_ATOMIC_ADD : MUBUF_Atomic <
- 0x00000032, "buffer_atomic_add", VReg_32, i32, atomic_add_global
+ mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global
>;
defm BUFFER_ATOMIC_SUB : MUBUF_Atomic <
- 0x00000033, "buffer_atomic_sub", VReg_32, i32, atomic_sub_global
+ mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
>;
-//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "buffer_atomic_rsub", []>;
+//def BUFFER_ATOMIC_RSUB : MUBUF_ <mubuf<0x34>, "buffer_atomic_rsub", []>; // isn't on CI & VI
defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic <
- 0x00000035, "buffer_atomic_smin", VReg_32, i32, atomic_min_global
+ mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
>;
defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic <
- 0x00000036, "buffer_atomic_umin", VReg_32, i32, atomic_umin_global
+ mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
>;
defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic <
- 0x00000037, "buffer_atomic_smax", VReg_32, i32, atomic_max_global
+ mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
>;
defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic <
- 0x00000038, "buffer_atomic_umax", VReg_32, i32, atomic_umax_global
+ mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
>;
defm BUFFER_ATOMIC_AND : MUBUF_Atomic <
- 0x00000039, "buffer_atomic_and", VReg_32, i32, atomic_and_global
+ mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global
>;
defm BUFFER_ATOMIC_OR : MUBUF_Atomic <
- 0x0000003a, "buffer_atomic_or", VReg_32, i32, atomic_or_global
+ mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global
>;
defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
- 0x0000003b, "buffer_atomic_xor", VReg_32, i32, atomic_xor_global
->;
-//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "buffer_atomic_inc", []>;
-//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "buffer_atomic_dec", []>;
-//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "buffer_atomic_fcmpswap", []>;
-//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "buffer_atomic_fmin", []>;
-//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "buffer_atomic_fmax", []>;
-//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "buffer_atomic_swap_x2", []>;
-//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "buffer_atomic_cmpswap_x2", []>;
-//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "buffer_atomic_add_x2", []>;
-//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "buffer_atomic_sub_x2", []>;
-//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "buffer_atomic_rsub_x2", []>;
-//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "buffer_atomic_smin_x2", []>;
-//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "buffer_atomic_umin_x2", []>;
-//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "buffer_atomic_smax_x2", []>;
-//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "buffer_atomic_umax_x2", []>;
-//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "buffer_atomic_and_x2", []>;
-//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "buffer_atomic_or_x2", []>;
-//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "buffer_atomic_xor_x2", []>;
-//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "buffer_atomic_inc_x2", []>;
-//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "buffer_atomic_dec_x2", []>;
-//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "buffer_atomic_fcmpswap_x2", []>;
-//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "buffer_atomic_fmin_x2", []>;
-//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "buffer_atomic_fmax_x2", []>;
-//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "buffer_wbinvl1_sc", []>;
-//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "buffer_wbinvl1", []>;
+ mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
+>;
+//def BUFFER_ATOMIC_INC : MUBUF_ <mubuf<0x3c, 0x4b>, "buffer_atomic_inc", []>;
+//def BUFFER_ATOMIC_DEC : MUBUF_ <mubuf<0x3d, 0x4c>, "buffer_atomic_dec", []>;
+//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
+//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>;
+//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>;
+//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>;
+//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>;
+//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
+//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", []>;
+//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", []>;
+//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", []>;
+//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", []>;
+//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <mubuf<0x59, 0x68>, "buffer_atomic_and_x2", []>;
+//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", []>;
+//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", []>;
+//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", []>;
+//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", []>;
+//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI
+//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI
+//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI
+//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>;
//===----------------------------------------------------------------------===//
// MTBUF Instructions
@@ -967,7 +1003,7 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>;
//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>;
defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VReg_32>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>;
defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>;
defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>;
defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>;
@@ -1004,63 +1040,63 @@ defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>;
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>;
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>;
-defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "image_sample">;
-defm IMAGE_SAMPLE_CL : MIMG_Sampler <0x00000021, "image_sample_cl">;
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">;
-defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "image_sample_b">;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler <0x00000026, "image_sample_b_cl">;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">;
defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">;
-defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "image_sample_c">;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler <0x00000029, "image_sample_c_cl">;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">;
defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "image_sample_c_b">;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler <0x0000002e, "image_sample_c_b_cl">;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">;
defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
-defm IMAGE_SAMPLE_O : MIMG_Sampler <0x00000030, "image_sample_o">;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler <0x00000031, "image_sample_cl_o">;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">;
defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">;
defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler <0x00000035, "image_sample_b_o">;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler <0x00000036, "image_sample_b_cl_o">;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">;
defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler <0x00000038, "image_sample_c_o">;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler <0x00000039, "image_sample_c_cl_o">;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">;
defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler <0x0000003d, "image_sample_c_b_o">;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler <0x0000003e, "image_sample_c_b_cl_o">;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">;
defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
-defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "image_gather4">;
-defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "image_gather4_cl">;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">;
defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">;
-defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "image_gather4_b">;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "image_gather4_b_cl">;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">;
defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">;
-defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "image_gather4_c">;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "image_gather4_c_cl">;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">;
defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
-defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "image_gather4_c_b">;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "image_gather4_c_b_cl">;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">;
defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
-defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "image_gather4_o">;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "image_gather4_cl_o">;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">;
defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">;
-defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "image_gather4_b_o">;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather_WQM <0x00000056, "image_gather4_b_cl_o">; // same multiclass as every other *_B_CL sample/gather variant
defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
-defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "image_gather4_c_o">;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "image_gather4_c_cl_o">;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">;
defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "image_gather4_c_b_o">;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "image_gather4_c_b_cl_o">;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
-defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "image_get_lod">;
+defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
@@ -1077,25 +1113,25 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o"
//===----------------------------------------------------------------------===//
let Predicates = [HasFlatAddressSpace] in {
-def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "flat_load_ubyte", VReg_32>;
-def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "flat_load_sbyte", VReg_32>;
-def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "flat_load_ushort", VReg_32>;
-def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "flat_load_sshort", VReg_32>;
-def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "flat_load_dword", VReg_32>;
+def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "flat_load_ubyte", VGPR_32>;
+def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "flat_load_sbyte", VGPR_32>;
+def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "flat_load_ushort", VGPR_32>;
+def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "flat_load_sshort", VGPR_32>;
+def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "flat_load_dword", VGPR_32>;
def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "flat_load_dwordx2", VReg_64>;
def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "flat_load_dwordx4", VReg_128>;
def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "flat_load_dwordx3", VReg_96>;
def FLAT_STORE_BYTE : FLAT_Store_Helper <
- 0x00000018, "flat_store_byte", VReg_32
+ 0x00000018, "flat_store_byte", VGPR_32
>;
def FLAT_STORE_SHORT : FLAT_Store_Helper <
- 0x0000001a, "flat_store_short", VReg_32
+ 0x0000001a, "flat_store_short", VGPR_32
>;
def FLAT_STORE_DWORD : FLAT_Store_Helper <
- 0x0000001c, "flat_store_dword", VReg_32
+ 0x0000001c, "flat_store_dword", VGPR_32
>;
def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
@@ -1150,7 +1186,9 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
// VOP1 Instructions
//===----------------------------------------------------------------------===//
-//def V_NOP : VOP1_ <0x00000000, "v_nop", []>;
+let vdst = 0, src0 = 0 in {
+defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">;
+}
let isMoveImm = 1 in {
defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>;
@@ -1158,16 +1196,20 @@ defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>;
let Uses = [EXEC] in {
+// FIXME: Specify SchedRW for READFIRSTLANE_B32
+
def V_READFIRSTLANE_B32 : VOP1 <
0x00000002,
(outs SReg_32:$vdst),
- (ins VReg_32:$src0),
+ (ins VGPR_32:$src0),
"v_readfirstlane_b32 $vdst, $src0",
[]
>;
}
+let SchedRW = [WriteQuarterRate32] in {
+
defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64",
VOP_I32_F64, fp_to_sint
>;
@@ -1193,9 +1235,11 @@ defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32",
defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16",
VOP_F32_I32, f16_to_fp
>;
-//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "v_cvt_rpi_i32_f32", []>;
-//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "v_cvt_flr_i32_f32", []>;
-//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "v_cvt_off_f32_i4", []>;
+defm V_CVT_RPI_I32_F32 : VOP1Inst <vop1<0xc>, "v_cvt_rpi_i32_f32",
+ VOP_I32_F32, cvt_rpi_i32_f32>;
+defm V_CVT_FLR_I32_F32 : VOP1Inst <vop1<0xd>, "v_cvt_flr_i32_f32",
+ VOP_I32_F32, cvt_flr_i32_f32>;
+defm V_CVT_OFF_F32_I4 : VOP1Inst <vop1<0x0e>, "v_cvt_off_f32_i4", VOP_F32_I32>;
defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64",
VOP_F32_F64, fround
>;
@@ -1221,493 +1265,580 @@ defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32",
VOP_F64_I32, uint_to_fp
>;
-defm V_FRACT_F32 : VOP1Inst <vop1<0x20>, "v_fract_f32",
+} // let SchedRW = [WriteQuarterRate32]
+
+defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32",
VOP_F32_F32, AMDGPUfract
>;
-defm V_TRUNC_F32 : VOP1Inst <vop1<0x21>, "v_trunc_f32",
+defm V_TRUNC_F32 : VOP1Inst <vop1<0x21, 0x1c>, "v_trunc_f32",
VOP_F32_F32, ftrunc
>;
-defm V_CEIL_F32 : VOP1Inst <vop1<0x22>, "v_ceil_f32",
+defm V_CEIL_F32 : VOP1Inst <vop1<0x22, 0x1d>, "v_ceil_f32",
VOP_F32_F32, fceil
>;
-defm V_RNDNE_F32 : VOP1Inst <vop1<0x23>, "v_rndne_f32",
+defm V_RNDNE_F32 : VOP1Inst <vop1<0x23, 0x1e>, "v_rndne_f32",
VOP_F32_F32, frint
>;
-defm V_FLOOR_F32 : VOP1Inst <vop1<0x24>, "v_floor_f32",
+defm V_FLOOR_F32 : VOP1Inst <vop1<0x24, 0x1f>, "v_floor_f32",
VOP_F32_F32, ffloor
>;
-defm V_EXP_F32 : VOP1Inst <vop1<0x25>, "v_exp_f32",
+defm V_EXP_F32 : VOP1Inst <vop1<0x25, 0x20>, "v_exp_f32",
VOP_F32_F32, fexp2
>;
-defm V_LOG_CLAMP_F32 : VOP1Inst <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>;
-defm V_LOG_F32 : VOP1Inst <vop1<0x27>, "v_log_f32",
+
+let SchedRW = [WriteQuarterRate32] in {
+
+defm V_LOG_F32 : VOP1Inst <vop1<0x27, 0x21>, "v_log_f32",
VOP_F32_F32, flog2
>;
-
-defm V_RCP_CLAMP_F32 : VOP1Inst <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>;
-defm V_RCP_LEGACY_F32 : VOP1Inst <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>;
-defm V_RCP_F32 : VOP1Inst <vop1<0x2a>, "v_rcp_f32",
+defm V_RCP_F32 : VOP1Inst <vop1<0x2a, 0x22>, "v_rcp_f32",
VOP_F32_F32, AMDGPUrcp
>;
-defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b>, "v_rcp_iflag_f32", VOP_F32_F32>;
-defm V_RSQ_CLAMP_F32 : VOP1Inst <vop1<0x2c>, "v_rsq_clamp_f32",
- VOP_F32_F32, AMDGPUrsq_clamped
+defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b, 0x23>, "v_rcp_iflag_f32",
+ VOP_F32_F32
>;
-defm V_RSQ_LEGACY_F32 : VOP1Inst <vop1<0x2d>, "v_rsq_legacy_f32",
- VOP_F32_F32, AMDGPUrsq_legacy
->;
-defm V_RSQ_F32 : VOP1Inst <vop1<0x2e>, "v_rsq_f32",
+defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32",
VOP_F32_F32, AMDGPUrsq
>;
-defm V_RCP_F64 : VOP1Inst <vop1<0x2f>, "v_rcp_f64",
+
+} // let SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+
+defm V_RCP_F64 : VOP1Inst <vop1<0x2f, 0x25>, "v_rcp_f64",
VOP_F64_F64, AMDGPUrcp
>;
-defm V_RCP_CLAMP_F64 : VOP1Inst <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>;
-defm V_RSQ_F64 : VOP1Inst <vop1<0x31>, "v_rsq_f64",
+defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64",
VOP_F64_F64, AMDGPUrsq
>;
-defm V_RSQ_CLAMP_F64 : VOP1Inst <vop1<0x32>, "v_rsq_clamp_f64",
- VOP_F64_F64, AMDGPUrsq_clamped
->;
-defm V_SQRT_F32 : VOP1Inst <vop1<0x33>, "v_sqrt_f32",
+
+} // let SchedRW = [WriteDouble]
+
+defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32",
VOP_F32_F32, fsqrt
>;
-defm V_SQRT_F64 : VOP1Inst <vop1<0x34>, "v_sqrt_f64",
+
+let SchedRW = [WriteDouble] in {
+
+defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64",
VOP_F64_F64, fsqrt
>;
-defm V_SIN_F32 : VOP1Inst <vop1<0x35>, "v_sin_f32",
+
+} // let SchedRW = [WriteDouble]
+
+defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32",
VOP_F32_F32, AMDGPUsin
>;
-defm V_COS_F32 : VOP1Inst <vop1<0x36>, "v_cos_f32",
+defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32",
VOP_F32_F32, AMDGPUcos
>;
-defm V_NOT_B32 : VOP1Inst <vop1<0x37>, "v_not_b32", VOP_I32_I32>;
-defm V_BFREV_B32 : VOP1Inst <vop1<0x38>, "v_bfrev_b32", VOP_I32_I32>;
-defm V_FFBH_U32 : VOP1Inst <vop1<0x39>, "v_ffbh_u32", VOP_I32_I32>;
-defm V_FFBL_B32 : VOP1Inst <vop1<0x3a>, "v_ffbl_b32", VOP_I32_I32>;
-defm V_FFBH_I32 : VOP1Inst <vop1<0x3b>, "v_ffbh_i32", VOP_I32_I32>;
-//defm V_FREXP_EXP_I32_F64 : VOPInst <0x0000003c, "v_frexp_exp_i32_f64", VOP_I32_F32>;
-defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d>, "v_frexp_mant_f64", VOP_F64_F64>;
-defm V_FRACT_F64 : VOP1Inst <vop1<0x3e>, "v_fract_f64", VOP_F64_F64>;
-//defm V_FREXP_EXP_I32_F32 : VOPInst <0x0000003f, "v_frexp_exp_i32_f32", VOP_I32_F32>;
-defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40>, "v_frexp_mant_f32", VOP_F32_F32>;
-//def V_CLREXCP : VOP1_ <0x00000041, "v_clrexcp", []>;
-defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42>, "v_movreld_b32", VOP_I32_I32>;
-defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43>, "v_movrels_b32", VOP_I32_I32>;
-defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44>, "v_movrelsd_b32", VOP_I32_I32>;
+defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>;
+defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>;
+defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>;
+defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>;
+defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>;
+defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64",
+ VOP_I32_F64
+>;
+defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64",
+ VOP_F64_F64
+>;
+defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>;
+defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32",
+ VOP_I32_F32
+>;
+defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32",
+ VOP_F32_F32
+>;
+let vdst = 0, src0 = 0 in {
+defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [],
+ "v_clrexcp"
+>;
+}
+defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>;
+defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>;
+defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>;
+
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
+let SchedRW = [WriteQuarterRate32] in {
+
+defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>;
+defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>;
+defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>;
+defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32",
+ VOP_F32_F32, AMDGPUrsq_clamped
+>;
+defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32",
+ VOP_F32_F32, AMDGPUrsq_legacy
+>;
+
+} // End let SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+
+defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>;
+defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64",
+ VOP_F64_F64, AMDGPUrsq_clamped
+>;
+
+} // End SchedRW = [WriteDouble]
+} // End SubtargetPredicate = isSICI
//===----------------------------------------------------------------------===//
// VINTRP Instructions
//===----------------------------------------------------------------------===//
-def V_INTERP_P1_F32 : VINTRP <
- 0x00000000,
- (outs VReg_32:$dst),
- (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+// FIXME: Specify SchedRW for VINTRP instructions.
+defm V_INTERP_P1_F32 : VINTRP_m <
+ 0x00000000, "v_interp_p1_f32",
+ (outs VGPR_32:$dst),
+ (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
"v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [$m0]",
- []> {
- let DisableEncoding = "$m0";
-}
+ "$m0">;
-def V_INTERP_P2_F32 : VINTRP <
- 0x00000001,
- (outs VReg_32:$dst),
- (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+defm V_INTERP_P2_F32 : VINTRP_m <
+ 0x00000001, "v_interp_p2_f32",
+ (outs VGPR_32:$dst),
+ (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
"v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]",
- []> {
-
- let Constraints = "$src0 = $dst";
- let DisableEncoding = "$src0,$m0";
+ "$src0,$m0",
+ "$src0 = $dst">;
-}
-
-def V_INTERP_MOV_F32 : VINTRP <
- 0x00000002,
- (outs VReg_32:$dst),
+defm V_INTERP_MOV_F32 : VINTRP_m <
+ 0x00000002, "v_interp_mov_f32",
+ (outs VGPR_32:$dst),
(ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
"v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [$m0]",
- []> {
- let DisableEncoding = "$m0";
-}
+ "$m0">;
//===----------------------------------------------------------------------===//
// VOP2 Instructions
//===----------------------------------------------------------------------===//
-def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
- (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc),
- "v_cndmask_b32_e32 $dst, $src0, $src1, [$vcc]",
- []
->{
- let DisableEncoding = "$vcc";
-}
-
-def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
+defm V_CNDMASK_B32_e64 : VOP3_m_nomods <vop3<0x100>, (outs VGPR_32:$dst),
(ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2),
"v_cndmask_b32_e64 $dst, $src0, $src1, $src2",
- [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))]
-> {
- let src0_modifiers = 0;
- let src1_modifiers = 0;
- let src2_modifiers = 0;
-}
-
-def V_READLANE_B32 : VOP2 <
- 0x00000001,
- (outs SReg_32:$vdst),
- (ins VReg_32:$src0, SSrc_32:$vsrc1),
- "v_readlane_b32 $vdst, $src0, $vsrc1",
- []
+ [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))],
+ "v_cndmask_b32_e64", 3
>;
-def V_WRITELANE_B32 : VOP2 <
- 0x00000002,
- (outs VReg_32:$vdst),
- (ins SReg_32:$src0, SSrc_32:$vsrc1),
- "v_writelane_b32 $vdst, $src0, $vsrc1",
- []
->;
let isCommutable = 1 in {
-defm V_ADD_F32 : VOP2Inst <vop2<0x3>, "v_add_f32",
+defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32",
VOP_F32_F32_F32, fadd
>;
-defm V_SUB_F32 : VOP2Inst <vop2<0x4>, "v_sub_f32", VOP_F32_F32_F32, fsub>;
-defm V_SUBREV_F32 : VOP2Inst <vop2<0x5>, "v_subrev_f32",
+defm V_SUB_F32 : VOP2Inst <vop2<0x4, 0x2>, "v_sub_f32", VOP_F32_F32_F32, fsub>;
+defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32",
VOP_F32_F32_F32, null_frag, "v_sub_f32"
>;
} // End isCommutable = 1
let isCommutable = 1 in {
-defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32",
- VOP_F32_F32_F32
->;
-
-defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7>, "v_mul_legacy_f32",
+defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32",
VOP_F32_F32_F32, int_AMDGPU_mul
>;
-defm V_MUL_F32 : VOP2Inst <vop2<0x8>, "v_mul_f32",
+defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32",
VOP_F32_F32_F32, fmul
>;
-defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9>, "v_mul_i32_i24",
+defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9, 0x6>, "v_mul_i32_i24",
VOP_I32_I32_I32, AMDGPUmul_i24
>;
-//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "v_mul_hi_i32_i24", []>;
-defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb>, "v_mul_u32_u24",
- VOP_I32_I32_I32, AMDGPUmul_u24
->;
-//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "v_mul_hi_u32_u24", []>;
-
-defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32",
- VOP_F32_F32_F32, AMDGPUfmin_legacy
+defm V_MUL_HI_I32_I24 : VOP2Inst <vop2<0xa,0x7>, "v_mul_hi_i32_i24",
+ VOP_I32_I32_I32
>;
-defm V_MAX_LEGACY_F32 : VOP2Inst <vop2<0xe>, "v_max_legacy_f32",
- VOP_F32_F32_F32, AMDGPUfmax_legacy
+defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb, 0x8>, "v_mul_u32_u24",
+ VOP_I32_I32_I32, AMDGPUmul_u24
>;
-defm V_MIN_F32 : VOP2Inst <vop2<0xf>, "v_min_f32", VOP_F32_F32_F32, fminnum>;
-defm V_MAX_F32 : VOP2Inst <vop2<0x10>, "v_max_f32", VOP_F32_F32_F32, fmaxnum>;
-defm V_MIN_I32 : VOP2Inst <vop2<0x11>, "v_min_i32", VOP_I32_I32_I32, AMDGPUsmin>;
-defm V_MAX_I32 : VOP2Inst <vop2<0x12>, "v_max_i32", VOP_I32_I32_I32, AMDGPUsmax>;
-defm V_MIN_U32 : VOP2Inst <vop2<0x13>, "v_min_u32", VOP_I32_I32_I32, AMDGPUumin>;
-defm V_MAX_U32 : VOP2Inst <vop2<0x14>, "v_max_u32", VOP_I32_I32_I32, AMDGPUumax>;
+defm V_MUL_HI_U32_U24 : VOP2Inst <vop2<0xc,0x9>, "v_mul_hi_u32_u24",
+ VOP_I32_I32_I32
+>;
-defm V_LSHR_B32 : VOP2Inst <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32, srl>;
+defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32,
+ fminnum>;
+defm V_MAX_F32 : VOP2Inst <vop2<0x10, 0xb>, "v_max_f32", VOP_F32_F32_F32,
+ fmaxnum>;
+defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32>;
+defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32>;
+defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32>;
+defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32>;
defm V_LSHRREV_B32 : VOP2Inst <
- vop2<0x16>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32"
+ vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag,
+ "v_lshr_b32"
>;
-defm V_ASHR_I32 : VOP2Inst <vop2<0x17>, "v_ashr_i32",
- VOP_I32_I32_I32, sra
->;
defm V_ASHRREV_I32 : VOP2Inst <
- vop2<0x18>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32"
+ vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag,
+ "v_ashr_i32"
>;
-let hasPostISelHook = 1 in {
-
-defm V_LSHL_B32 : VOP2Inst <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32, shl>;
-
-}
defm V_LSHLREV_B32 : VOP2Inst <
- vop2<0x1a>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32"
+ vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag,
+ "v_lshl_b32"
>;
-defm V_AND_B32 : VOP2Inst <vop2<0x1b>, "v_and_b32",
- VOP_I32_I32_I32, and>;
-defm V_OR_B32 : VOP2Inst <vop2<0x1c>, "v_or_b32",
- VOP_I32_I32_I32, or
->;
-defm V_XOR_B32 : VOP2Inst <vop2<0x1d>, "v_xor_b32",
- VOP_I32_I32_I32, xor
->;
-
-} // End isCommutable = 1
-
-defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32",
- VOP_I32_I32_I32, AMDGPUbfm>;
+defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>;
+defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>;
+defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>;
-let isCommutable = 1 in {
-defm V_MAC_F32 : VOP2Inst <vop2<0x1f>, "v_mac_f32", VOP_F32_F32_F32>;
+defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>;
} // End isCommutable = 1
-defm V_MADMK_F32 : VOP2Inst <vop2<0x20>, "v_madmk_f32", VOP_F32_F32_F32>;
+defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">;
let isCommutable = 1 in {
-defm V_MADAK_F32 : VOP2Inst <vop2<0x21>, "v_madak_f32", VOP_F32_F32_F32>;
+defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">;
} // End isCommutable = 1
-
-defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
-defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32",
-
- VOP_I32_I32_I32
->;
-defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32",
- VOP_I32_I32_I32
->;
-
let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
-defm V_ADD_I32 : VOP2bInst <vop2<0x25>, "v_add_i32",
+
+// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI,
+// but the VI instructions behave the same as the SI versions.
+defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32",
VOP_I32_I32_I32, add
>;
-defm V_SUB_I32 : VOP2bInst <vop2<0x26>, "v_sub_i32",
- VOP_I32_I32_I32, sub
->;
-defm V_SUBREV_I32 : VOP2bInst <vop2<0x27>, "v_subrev_i32",
+defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>;
+
+defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32",
VOP_I32_I32_I32, null_frag, "v_sub_i32"
>;
let Uses = [VCC] in { // Carry-in comes from VCC
-defm V_ADDC_U32 : VOP2bInst <vop2<0x28>, "v_addc_u32",
- VOP_I32_I32_I32_VCC, adde
+defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32",
+ VOP_I32_I32_I32_VCC
>;
-defm V_SUBB_U32 : VOP2bInst <vop2<0x29>, "v_subb_u32",
- VOP_I32_I32_I32_VCC, sube
+defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32",
+ VOP_I32_I32_I32_VCC
>;
-defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a>, "v_subbrev_u32",
+defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32",
VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32"
>;
} // End Uses = [VCC]
} // End isCommutable = 1, Defs = [VCC]
-defm V_LDEXP_F32 : VOP2Inst <vop2<0x2b>, "v_ldexp_f32",
+defm V_READLANE_B32 : VOP2SI_3VI_m <
+ vop3 <0x001, 0x289>,
+ "v_readlane_b32",
+ (outs SReg_32:$vdst),
+ (ins VGPR_32:$src0, SCSrc_32:$src1),
+ "v_readlane_b32 $vdst, $src0, $src1"
+>;
+
+defm V_WRITELANE_B32 : VOP2SI_3VI_m <
+ vop3 <0x002, 0x28a>,
+ "v_writelane_b32",
+ (outs VGPR_32:$vdst),
+ (ins SReg_32:$src0, SCSrc_32:$src1),
+ "v_writelane_b32 $vdst, $src0, $src1"
+>;
+
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
+defm V_MIN_LEGACY_F32 : VOP2InstSI <vop2<0xd>, "v_min_legacy_f32",
+ VOP_F32_F32_F32, AMDGPUfmin_legacy
+>;
+defm V_MAX_LEGACY_F32 : VOP2InstSI <vop2<0xe>, "v_max_legacy_f32",
+ VOP_F32_F32_F32, AMDGPUfmax_legacy
+>;
+
+let isCommutable = 1 in {
+defm V_LSHR_B32 : VOP2InstSI <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32>;
+defm V_ASHR_I32 : VOP2InstSI <vop2<0x17>, "v_ashr_i32", VOP_I32_I32_I32>;
+defm V_LSHL_B32 : VOP2InstSI <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32>;
+} // End isCommutable = 1
+} // End let SubtargetPredicate = isSICI
+
+let isCommutable = 1 in {
+defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst <vop23<0x6, 0x28e>, "v_mac_legacy_f32",
+ VOP_F32_F32_F32
+>;
+} // End isCommutable = 1
+
+defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", VOP_I32_I32_I32,
+ AMDGPUbfm
+>;
+defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32",
+ VOP_I32_I32_I32
+>;
+defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32",
+ VOP_I32_I32_I32
+>;
+defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32",
+ VOP_I32_I32_I32
+>;
+defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32",
VOP_F32_F32_I32, AMDGPUldexp
>;
-////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "v_cvt_pkaccum_u8_f32", []>;
-////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "v_cvt_pknorm_i16_f32", []>;
-////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "v_cvt_pknorm_u16_f32", []>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <vop2<0x2f>, "v_cvt_pkrtz_f16_f32",
- VOP_I32_F32_F32, int_SI_packf16
+
+
+defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst <vop23<0x2c, 0x1f0>, "v_cvt_pkaccum_u8_f32",
+ VOP_I32_F32_I32>; // TODO: set "Uses = dst"
+
+defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst <vop23<0x2d, 0x294>, "v_cvt_pknorm_i16_f32",
+ VOP_I32_F32_F32
+>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst <vop23<0x2e, 0x295>, "v_cvt_pknorm_u16_f32",
+ VOP_I32_F32_F32
+>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 0x296>, "v_cvt_pkrtz_f16_f32",
+ VOP_I32_F32_F32, int_SI_packf16
+>;
+defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst <vop23<0x30, 0x297>, "v_cvt_pk_u16_u32",
+ VOP_I32_I32_I32
+>;
+defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst <vop23<0x31, 0x298>, "v_cvt_pk_i16_i32",
+ VOP_I32_I32_I32
>;
-////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "v_cvt_pk_u16_u32", []>;
-////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "v_cvt_pk_i16_i32", []>;
//===----------------------------------------------------------------------===//
// VOP3 Instructions
//===----------------------------------------------------------------------===//
let isCommutable = 1 in {
-defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140>, "v_mad_legacy_f32",
+defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140, 0x1c0>, "v_mad_legacy_f32",
VOP_F32_F32_F32_F32
>;
-defm V_MAD_F32 : VOP3Inst <vop3<0x141>, "v_mad_f32",
+defm V_MAD_F32 : VOP3Inst <vop3<0x141, 0x1c1>, "v_mad_f32",
VOP_F32_F32_F32_F32, fmad
>;
-defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142>, "v_mad_i32_i24",
+defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142, 0x1c2>, "v_mad_i32_i24",
VOP_I32_I32_I32_I32, AMDGPUmad_i24
>;
-defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143>, "v_mad_u32_u24",
+defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24",
VOP_I32_I32_I32_I32, AMDGPUmad_u24
>;
} // End isCommutable = 1
-defm V_CUBEID_F32 : VOP3Inst <vop3<0x144>, "v_cubeid_f32",
+defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32",
VOP_F32_F32_F32_F32
>;
-defm V_CUBESC_F32 : VOP3Inst <vop3<0x145>, "v_cubesc_f32",
+defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32",
VOP_F32_F32_F32_F32
>;
-defm V_CUBETC_F32 : VOP3Inst <vop3<0x146>, "v_cubetc_f32",
+defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32",
VOP_F32_F32_F32_F32
>;
-defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147>, "v_cubema_f32",
+defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32",
VOP_F32_F32_F32_F32
>;
-defm V_BFE_U32 : VOP3Inst <vop3<0x148>, "v_bfe_u32",
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32",
VOP_I32_I32_I32_I32, AMDGPUbfe_u32
>;
-defm V_BFE_I32 : VOP3Inst <vop3<0x149>, "v_bfe_i32",
+defm V_BFE_I32 : VOP3Inst <vop3<0x149, 0x1c9>, "v_bfe_i32",
VOP_I32_I32_I32_I32, AMDGPUbfe_i32
>;
-defm V_BFI_B32 : VOP3Inst <vop3<0x14a>, "v_bfi_b32",
+}
+
+defm V_BFI_B32 : VOP3Inst <vop3<0x14a, 0x1ca>, "v_bfi_b32",
VOP_I32_I32_I32_I32, AMDGPUbfi
>;
let isCommutable = 1 in {
-defm V_FMA_F32 : VOP3Inst <vop3<0x14b>, "v_fma_f32",
+defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32",
VOP_F32_F32_F32_F32, fma
>;
-defm V_FMA_F64 : VOP3Inst <vop3<0x14c>, "v_fma_f64",
+defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 0x1cc>, "v_fma_f64",
VOP_F64_F64_F64_F64, fma
>;
} // End isCommutable = 1
//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>;
-defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "v_alignbit_b32",
+defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e, 0x1ce>, "v_alignbit_b32",
VOP_I32_I32_I32_I32
>;
-defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f>, "v_alignbyte_b32",
+defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32",
VOP_I32_I32_I32_I32
>;
-defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
- VOP_F32_F32_F32_F32>;
-defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "v_min3_f32",
+
+defm V_MIN3_F32 : VOP3Inst <vop3<0x151, 0x1d0>, "v_min3_f32",
VOP_F32_F32_F32_F32, AMDGPUfmin3>;
-defm V_MIN3_I32 : VOP3Inst <vop3<0x152>, "v_min3_i32",
+defm V_MIN3_I32 : VOP3Inst <vop3<0x152, 0x1d1>, "v_min3_i32",
VOP_I32_I32_I32_I32, AMDGPUsmin3
>;
-defm V_MIN3_U32 : VOP3Inst <vop3<0x153>, "v_min3_u32",
+defm V_MIN3_U32 : VOP3Inst <vop3<0x153, 0x1d2>, "v_min3_u32",
VOP_I32_I32_I32_I32, AMDGPUumin3
>;
-defm V_MAX3_F32 : VOP3Inst <vop3<0x154>, "v_max3_f32",
+defm V_MAX3_F32 : VOP3Inst <vop3<0x154, 0x1d3>, "v_max3_f32",
VOP_F32_F32_F32_F32, AMDGPUfmax3
>;
-defm V_MAX3_I32 : VOP3Inst <vop3<0x155>, "v_max3_i32",
+defm V_MAX3_I32 : VOP3Inst <vop3<0x155, 0x1d4>, "v_max3_i32",
VOP_I32_I32_I32_I32, AMDGPUsmax3
>;
-defm V_MAX3_U32 : VOP3Inst <vop3<0x156>, "v_max3_u32",
+defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32",
VOP_I32_I32_I32_I32, AMDGPUumax3
>;
-//def V_MED3_F32 : VOP3_MED3 <0x00000157, "v_med3_f32", []>;
-//def V_MED3_I32 : VOP3_MED3 <0x00000158, "v_med3_i32", []>;
-//def V_MED3_U32 : VOP3_MED3 <0x00000159, "v_med3_u32", []>;
+defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32",
+ VOP_F32_F32_F32_F32
+>;
+defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32",
+ VOP_I32_I32_I32_I32
+>;
+defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32",
+ VOP_I32_I32_I32_I32
+>;
+
//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>;
//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>;
//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>;
-defm V_SAD_U32 : VOP3Inst <vop3<0x15d>, "v_sad_u32",
+defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32",
VOP_I32_I32_I32_I32
>;
////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>;
defm V_DIV_FIXUP_F32 : VOP3Inst <
- vop3<0x15f>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
+ vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
>;
+
+let SchedRW = [WriteDouble] in {
+
defm V_DIV_FIXUP_F64 : VOP3Inst <
- vop3<0x160>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
+ vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
>;
-defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
- VOP_I64_I64_I32, shl
->;
-defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64",
- VOP_I64_I64_I32, srl
->;
-defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
- VOP_I64_I64_I32, sra
->;
+} // let SchedRW = [WriteDouble]
+let SchedRW = [WriteDouble] in {
let isCommutable = 1 in {
-defm V_ADD_F64 : VOP3Inst <vop3<0x164>, "v_add_f64",
+defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64",
VOP_F64_F64_F64, fadd
>;
-defm V_MUL_F64 : VOP3Inst <vop3<0x165>, "v_mul_f64",
+defm V_MUL_F64 : VOP3Inst <vop3<0x165, 0x281>, "v_mul_f64",
VOP_F64_F64_F64, fmul
>;
-defm V_MIN_F64 : VOP3Inst <vop3<0x166>, "v_min_f64",
+defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64",
VOP_F64_F64_F64, fminnum
>;
-defm V_MAX_F64 : VOP3Inst <vop3<0x167>, "v_max_f64",
+defm V_MAX_F64 : VOP3Inst <vop3<0x167, 0x283>, "v_max_f64",
VOP_F64_F64_F64, fmaxnum
>;
} // isCommutable = 1
-defm V_LDEXP_F64 : VOP3Inst <vop3<0x168>, "v_ldexp_f64",
+defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64",
VOP_F64_F64_I32, AMDGPUldexp
>;
-let isCommutable = 1 in {
+} // let SchedRW = [WriteDouble]
+
+let isCommutable = 1, SchedRW = [WriteQuarterRate32] in {
-defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169>, "v_mul_lo_u32",
+defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32",
VOP_I32_I32_I32
>;
-defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a>, "v_mul_hi_u32",
+defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32",
VOP_I32_I32_I32
>;
-defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b>, "v_mul_lo_i32",
+
+defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32",
VOP_I32_I32_I32
>;
-defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c>, "v_mul_hi_i32",
+defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32",
VOP_I32_I32_I32
>;
-} // isCommutable = 1
+} // isCommutable = 1, SchedRW = [WriteQuarterRate32]
-defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d>, "v_div_scale_f32", []>;
+let SchedRW = [WriteFloatFMA, WriteSALU] in {
+defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>;
+}
+let SchedRW = [WriteDouble, WriteSALU] in {
// Double precision division pre-scale.
-defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "v_div_scale_f64", []>;
+defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>;
+} // let SchedRW = [WriteDouble, WriteSALU]
-let isCommutable = 1 in {
-defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "v_div_fmas_f32",
+let isCommutable = 1, Uses = [VCC] in {
+
+// v_div_fmas_f32:
+// result = src0 * src1 + src2
+// if (vcc)
+// result *= 2^32
+//
+defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32",
VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
>;
-defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "v_div_fmas_f64",
+
+let SchedRW = [WriteDouble] in {
+// v_div_fmas_f64:
+// result = src0 * src1 + src2
+// if (vcc)
+// result *= 2^64
+//
+defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64",
VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
>;
+
+} // End SchedRW = [WriteDouble]
} // End isCommutable = 1
//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>;
+let SchedRW = [WriteDouble] in {
defm V_TRIG_PREOP_F64 : VOP3Inst <
- vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
+ vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
>;
-//===----------------------------------------------------------------------===//
-// Pseudo Instructions
-//===----------------------------------------------------------------------===//
+} // let SchedRW = [WriteDouble]
-let isCodeGenOnly = 1, isPseudo = 1 in {
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
-def V_MOV_I1 : InstSI <
- (outs VReg_1:$dst),
- (ins i1imm:$src),
- "", [(set i1:$dst, (imm:$src))]
->;
+defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", VOP_I64_I64_I32>;
+defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", VOP_I64_I64_I32>;
+defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", VOP_I64_I64_I32>;
-def V_AND_I1 : InstSI <
- (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
- [(set i1:$dst, (and i1:$src0, i1:$src1))]
->;
+defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
+ VOP_F32_F32_F32_F32>;
-def V_OR_I1 : InstSI <
- (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
- [(set i1:$dst, (or i1:$src0, i1:$src1))]
->;
+} // End SubtargetPredicate = isSICI
+
+let SubtargetPredicate = isVI in {
-def V_XOR_I1 : InstSI <
- (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
- [(set i1:$dst, (xor i1:$src0, i1:$src1))]
+defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64",
+ VOP_I64_I32_I64
+>;
+defm V_LSHRREV_B64 : VOP3Inst <vop3<0, 0x290>, "v_lshrrev_b64",
+ VOP_I64_I32_I64
+>;
+defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64",
+ VOP_I64_I32_I64
>;
+} // End SubtargetPredicate = isVI
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+let isCodeGenOnly = 1, isPseudo = 1 in {
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
+// pass to enable folding of inline immediates.
+def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>;
+} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0
+
let hasSideEffects = 1 in {
def SGPR_USE : InstSI <(outs),(ins), "", []>;
}
@@ -1785,12 +1916,12 @@ def SI_KILL : InstSI <
let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
-//defm SI_ : RegisterLoadStore <VReg_32, FRAMEri, ADDRIndirect>;
+//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>;
let UseNamedOperandTable = 1 in {
def SI_RegisterLoad : InstSI <
- (outs VReg_32:$dst, SReg_64:$temp),
+ (outs VGPR_32:$dst, SReg_64:$temp),
(ins FRAMEri32:$addr, i32imm:$chan),
"", []
> {
@@ -1800,7 +1931,7 @@ def SI_RegisterLoad : InstSI <
class SIRegStore<dag outs> : InstSI <
outs,
- (ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan),
+ (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan),
"", []
> {
let isRegisterStore = 1;
@@ -1816,7 +1947,7 @@ def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
} // End UseNamedOperandTable = 1
def SI_INDIRECT_SRC : InstSI <
- (outs VReg_32:$dst, SReg_64:$temp),
+ (outs VGPR_32:$dst, SReg_64:$temp),
(ins unknown:$src, VSrc_32:$idx, i32imm:$off),
"si_indirect_src $dst, $temp, $src, $idx, $off",
[]
@@ -1824,14 +1955,14 @@ def SI_INDIRECT_SRC : InstSI <
class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
(outs rc:$dst, SReg_64:$temp),
- (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val),
+ (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val),
"si_indirect_dst $dst, $temp, $src, $idx, $off, $val",
[]
> {
let Constraints = "$src = $dst";
}
-def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VReg_32>;
+def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
@@ -1839,31 +1970,22 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
-let usesCustomInserter = 1 in {
-
-def V_SUB_F64 : InstSI <
- (outs VReg_64:$dst),
- (ins VReg_64:$src0, VReg_64:$src1),
- "v_sub_f64 $dst, $src0, $src1",
- [(set f64:$dst, (fsub f64:$src0, f64:$src1))]
->;
-
-} // end usesCustomInserter
-
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
- def _SAVE : InstSI <
- (outs),
- (ins sgpr_class:$src, i32imm:$frame_idx),
- "", []
- >;
-
- def _RESTORE : InstSI <
- (outs sgpr_class:$dst),
- (ins i32imm:$frame_idx),
- "", []
- >;
-
+ let UseNamedOperandTable = 1 in {
+ def _SAVE : InstSI <
+ (outs),
+ (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
+ SReg_32:$scratch_offset),
+ "", []
+ >;
+
+ def _RESTORE : InstSI <
+ (outs sgpr_class:$dst),
+ (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
+ "", []
+ >;
+ } // End UseNamedOperandTable = 1
}
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
@@ -1873,20 +1995,23 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
- def _SAVE : InstSI <
- (outs),
- (ins vgpr_class:$src, i32imm:$frame_idx),
- "", []
- >;
-
- def _RESTORE : InstSI <
- (outs vgpr_class:$dst),
- (ins i32imm:$frame_idx),
- "", []
- >;
+ let UseNamedOperandTable = 1 in {
+ def _SAVE : InstSI <
+ (outs),
+ (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
+ SReg_32:$scratch_offset),
+ "", []
+ >;
+
+ def _RESTORE : InstSI <
+ (outs vgpr_class:$dst),
+ (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
+ "", []
+ >;
+ } // End UseNamedOperandTable = 1
}
-defm SI_SPILL_V32 : SI_SPILL_VGPR <VReg_32>;
+defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
@@ -1905,9 +2030,9 @@ def SI_CONSTDATA_PTR : InstSI <
} // end IsCodeGenOnly, isPseudo
-} // end SubtargetPredicate = SI
+} // end SubtargetPredicate = isGCN
-let Predicates = [isSI] in {
+let Predicates = [isGCN] in {
def : Pat<
(int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2),
@@ -1941,7 +2066,7 @@ def : Pat <
multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
- // 1. Offset as 8bit DWORD immediate
+ // 1. SI-CI: Offset as 8bit DWORD immediate
def : Pat <
(constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))),
(vt (Instr_IMM $sbase, (as_dword_i32imm $offset)))
@@ -1960,6 +2085,28 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
>;
}
+multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
+
+ // 1. VI: Offset as 20bit immediate in bytes
+ def : Pat <
+ (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))),
+ (vt (Instr_IMM $sbase, (as_i32imm $offset)))
+ >;
+
+ // 2. Offset loaded in a 32bit SGPR
+ def : Pat <
+ (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
+ (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
+ >;
+
+ // 3. No offset at all
+ def : Pat <
+ (constant_load i64:$sbase),
+ (vt (Instr_IMM $sbase, 0))
+ >;
+}
+
+let Predicates = [isSICI] in {
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
@@ -1967,6 +2114,19 @@ defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
+} // End Predicates = [isSICI]
+
+let Predicates = [isVI] in {
+defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
+} // End Predicates = [isVI]
+
+let Predicates = [isSICI] in {
// 1. Offset as 8bit DWORD immediate
def : Pat <
@@ -1974,14 +2134,14 @@ def : Pat <
(S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
>;
+} // End Predicates = [isSICI]
+
// 2. Offset loaded in an 32bit SGPR
def : Pat <
(SIload_constant v4i32:$sbase, imm:$offset),
(S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
>;
-} // Predicates = [isSI] in {
-
//===----------------------------------------------------------------------===//
// SOP1 Patterns
//===----------------------------------------------------------------------===//
@@ -2004,8 +2164,6 @@ def : Pat <
(S_ADD_U32 $src0, $src1)
>;
-let Predicates = [isSI] in {
-
//===----------------------------------------------------------------------===//
// SOPP Patterns
//===----------------------------------------------------------------------===//
@@ -2020,9 +2178,13 @@ def : Pat <
//===----------------------------------------------------------------------===//
let Predicates = [UnsafeFPMath] in {
-def : RcpPat<V_RCP_F64_e32, f64>;
-defm : RsqPat<V_RSQ_F64_e32, f64>;
-defm : RsqPat<V_RSQ_F32_e32, f32>;
+
+//def : RcpPat<V_RCP_F64_e32, f64>;
+//defm : RsqPat<V_RSQ_F64_e32, f64>;
+//defm : RsqPat<V_RSQ_F32_e32, f32>;
+
+def : RsqPat<V_RSQ_F32_e32, f32>;
+def : RsqPat<V_RSQ_F64_e32, f64>;
}
//===----------------------------------------------------------------------===//
@@ -2369,10 +2531,10 @@ foreach Index = 0-15 in {
}
def : BitConvert <i32, f32, SReg_32>;
-def : BitConvert <i32, f32, VReg_32>;
+def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, SReg_32>;
-def : BitConvert <f32, i32, VReg_32>;
+def : BitConvert <f32, i32, VGPR_32>;
def : BitConvert <i64, f64, VReg_64>;
@@ -2475,7 +2637,7 @@ def : Pat <
def : Pat <
(SGPRImm<(f32 fpimm)>:$imm),
- (S_MOV_B32 fpimm:$imm)
+ (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
@@ -2485,7 +2647,7 @@ def : Pat <
def : Pat <
(f32 fpimm:$imm),
- (V_MOV_B32_e32 fpimm:$imm)
+ (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
@@ -2493,21 +2655,38 @@ def : Pat <
(S_MOV_B64 InlineImm<i64>:$imm)
>;
+// XXX - Should this use a s_cmp to set SCC?
+
+// Set to sign-extended 64-bit value (true = -1, false = 0)
+def : Pat <
+ (i1 imm:$imm),
+ (S_MOV_B64 (i64 (as_i64imm $imm)))
+>;
+
+def : Pat <
+ (f64 InlineFPImm<f64>:$imm),
+ (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm)))
+>;
+
/********** ===================== **********/
/********** Interpolation Paterns **********/
/********** ===================== **********/
+// The value of $params is constant throughout the entire kernel.
+// We need to use S_MOV_B32 $params, because CSE ignores copies, so
+// without it we end up with a lot of redundant moves.
+
def : Pat <
(int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params),
- (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, $params)
+ (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, (S_MOV_B32 $params))
>;
def : Pat <
- (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, v2i32:$ij),
+ (int_SI_fs_interp imm:$attr_chan, imm:$attr, i32:$params, v2i32:$ij),
(V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0),
- imm:$attr_chan, imm:$attr, i32:$params),
+ imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)),
(EXTRACT_SUBREG $ij, sub1),
- imm:$attr_chan, imm:$attr, $params)
+ imm:$attr_chan, imm:$attr, (S_MOV_B32 $params))
>;
/********** ================== **********/
@@ -2522,13 +2701,6 @@ def : Pat <
(V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1))
>;
-def : Pat<
- (fdiv f64:$src0, f64:$src1),
- (V_MUL_F64 0 /* src0_modifiers */, $src0,
- 0 /* src1_modifiers */, (V_RCP_F64_e32 $src1),
- 0 /* clamp */, 0 /* omod */)
->;
-
def : Pat <
(int_AMDGPU_cube v4f32:$src),
(REG_SEQUENCE VReg_128,
@@ -2579,7 +2751,7 @@ def : Pat <
def : Pat <
(int_SI_tid),
- (V_MBCNT_HI_U32_B32_e32 0xffffffff,
+ (V_MBCNT_HI_U32_B32_e64 0xffffffff,
(V_MBCNT_LO_U32_B32_e64 0xffffffff, 0))
>;
@@ -2600,9 +2772,6 @@ def : Pat <
(V_MUL_HI_I32 $src0, $src1)
>;
-def : Vop3ModPat<V_MAD_F32, VOP_F32_F32_F32_F32, AMDGPUmad>;
-
-
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
@@ -2612,7 +2781,7 @@ def : ROTRPattern <V_ALIGNBIT_B32>;
class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
(vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
- (inst (i1 0), $ptr, (as_i16imm $offset))
+ (inst (i1 0), $ptr, (as_i16imm $offset), (S_MOV_B32 -1))
>;
def : DSReadPat <DS_READ_I8, i32, sextloadi8_local>;
@@ -2630,12 +2799,12 @@ def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>;
def : Pat <
(v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
i8:$offset1))),
- (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1)
+ (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1, (S_MOV_B32 -1))
>;
class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
(frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
- (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+ (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1))
>;
def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
@@ -2651,12 +2820,13 @@ def : Pat <
(local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
i8:$offset1)),
(DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0),
- (EXTRACT_SUBREG $value, sub1), $offset0, $offset1)
+ (EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
+ (S_MOV_B32 -1))
>;
class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
- (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+ (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1))
>;
// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
@@ -2672,13 +2842,13 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
class DSAtomicIncRetPat<DS inst, ValueType vt,
Instruction LoadImm, PatFrag frag> : Pat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
- (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset))
+ (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (S_MOV_B32 -1))
>;
class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
- (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset))
+ (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset), (S_MOV_B32 -1))
>;
@@ -2728,11 +2898,12 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
PatFrag constant_ld> {
def : Pat <
- (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))),
- (Instr_ADDR64 $srsrc, $vaddr, $offset)
+ (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))),
+ (Instr_ADDR64 $srsrc, $vaddr, $soffset, $offset)
>;
}
+let Predicates = [isSICI] in {
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
@@ -2740,6 +2911,7 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
+} // End Predicates = [isSICI]
class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
(vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
@@ -2785,9 +2957,9 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
def : Pat <
(vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
- imm, 1, 1, imm:$glc, imm:$slc,
+ imm:$offset, 1, 1, imm:$glc, imm:$slc,
imm:$tfe)),
- (bothen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
+ (bothen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
(as_i1imm $tfe))
>;
}
@@ -2817,11 +2989,13 @@ class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
(Instr $value, $srsrc, $vaddr, $offset)
>;
+let Predicates = [isSICI] in {
def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>;
+} // End Predicates = [isSICI]
*/
@@ -2848,20 +3022,6 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
let SubtargetPredicate = isCI in {
-// Sea island new arithmetic instructinos
-defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64",
- VOP_F64_F64, ftrunc
->;
-defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64",
- VOP_F64_F64, fceil
->;
-defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64",
- VOP_F64_F64, ffloor
->;
-defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64",
- VOP_F64_F64, frint
->;
-
defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8",
VOP_I32_I32_I32
>;
@@ -2890,8 +3050,6 @@ defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
// S_CBRANCH_CDBGSYS_OR_USER
// S_CBRANCH_CDBGSYS_AND_USER
// S_DCACHE_INV_VOL
-// V_EXP_LEGACY_F32
-// V_LOG_LEGACY_F32
// DS_NOP
// DS_GWS_SEMA_RELEASE_ALL
// DS_WRAP_RTN_B32
@@ -2904,7 +3062,7 @@ defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
// BUFFER_LOAD_DWORDX3
// BUFFER_STORE_DWORDX3
-} // End iSCI
+} // End isCI
//===----------------------------------------------------------------------===//
// Flat Patterns
@@ -3038,6 +3196,27 @@ def : Pat <
(V_CNDMASK_B32_e64 0, -1, $src), sub1)
>;
+// If we need to perform a logical operation on i1 values, we need to
+// use vector comparisons since there is only one SCC register. Vector
+// comparisons still write to a pair of SGPRs, so treat these as
+// 64-bit comparisons. When legalizing SGPR copies, instructions
+// resulting in the copies from SCC to these instructions will be
+// moved to the VALU.
+def : Pat <
+ (i1 (and i1:$src0, i1:$src1)),
+ (S_AND_B64 $src0, $src1)
+>;
+
+def : Pat <
+ (i1 (or i1:$src0, i1:$src1)),
+ (S_OR_B64 $src0, $src1)
+>;
+
+def : Pat <
+ (i1 (xor i1:$src0, i1:$src1)),
+ (S_XOR_B64 $src0, $src1)
+>;
+
def : Pat <
(f32 (sint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
@@ -3050,7 +3229,7 @@ def : Pat <
def : Pat <
(f64 (sint_to_fp i1:$src)),
- (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+ (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
>;
def : Pat <
@@ -3073,16 +3252,27 @@ def : Pat <
>;
def : Pat <
+ (i1 (trunc i64:$a)),
+ (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1),
+ (EXTRACT_SUBREG $a, sub0)), 1)
+>;
+
+def : Pat <
(i32 (bswap i32:$a)),
(V_BFI_B32 (S_MOV_B32 0x00ff00ff),
(V_ALIGNBIT_B32 $a, $a, 24),
(V_ALIGNBIT_B32 $a, $a, 8))
>;
+def : Pat <
+ (f32 (select i1:$src2, f32:$src1, f32:$src0)),
+ (V_CNDMASK_B32_e64 $src0, $src1, $src2)
+>;
+
//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
-} // End isSI predicate
+} // End isGCN predicate
diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp
index 4140196..46630d0 100644
--- a/lib/Target/R600/SILoadStoreOptimizer.cpp
+++ b/lib/Target/R600/SILoadStoreOptimizer.cpp
@@ -55,7 +55,6 @@ namespace {
class SILoadStoreOptimizer : public MachineFunctionPass {
private:
- const TargetMachine *TM;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
@@ -86,20 +85,11 @@ private:
public:
static char ID;
- SILoadStoreOptimizer() :
- MachineFunctionPass(ID),
- TM(nullptr),
- TII(nullptr),
- TRI(nullptr),
- MRI(nullptr),
- LIS(nullptr) {
+ SILoadStoreOptimizer()
+ : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
+ LIS(nullptr) {}
- }
-
- SILoadStoreOptimizer(const TargetMachine &TM_) :
- MachineFunctionPass(ID),
- TM(&TM_),
- TII(static_cast<const SIInstrInfo*>(TM->getSubtargetImpl()->getInstrInfo())) {
+ SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
}
@@ -222,6 +212,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
// Be careful, since the addresses could be subregisters themselves in weird
// cases, like vectors of pointers.
const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+ const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
unsigned DestReg1
@@ -262,6 +253,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
.addOperand(*AddrReg) // addr
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
+ .addOperand(*M0Reg) // M0
.addMemOperand(*I->memoperands_begin())
.addMemOperand(*Paired->memoperands_begin());
@@ -280,6 +272,18 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
LIS->shrinkToUses(&AddrRegLI);
+ LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg());
+ LIS->shrinkToUses(&M0RegLI);
+
+ // Currently m0 is treated as a register class with one member instead of an
+ // implicit physical register. We are using the virtual register for the first
+ // one, but we still need to update the live range of the now unused second m0
+ // virtual register to avoid verifier errors.
+ const MachineOperand *PairedM0Reg
+ = TII->getNamedOperand(*Paired, AMDGPU::OpName::m0);
+ LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg());
+ LIS->shrinkToUses(&PairedM0RegLI);
+
LIS->getInterval(DestReg); // Create new LI
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
@@ -295,6 +299,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
// sure we preserve the subregister index and any register flags set on them.
const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+ const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
const MachineOperand *Data1
= TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
@@ -333,11 +338,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
.addOperand(*Data1) // data1
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
+ .addOperand(*M0Reg) // m0
.addMemOperand(*I->memoperands_begin())
.addMemOperand(*Paired->memoperands_begin());
// XXX - How do we express subregisters here?
- unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
+ unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(),
+ M0Reg->getReg()};
LIS->RemoveMachineInstrFromMaps(I);
LIS->RemoveMachineInstrFromMaps(Paired);
@@ -397,9 +404,9 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
}
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
- const TargetSubtargetInfo *STM = MF.getTarget().getSubtargetImpl();
- TRI = static_cast<const SIRegisterInfo*>(STM->getRegisterInfo());
- TII = static_cast<const SIInstrInfo*>(STM->getInstrInfo());
+ const TargetSubtargetInfo &STM = MF.getSubtarget();
+ TRI = static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
+ TII = static_cast<const SIInstrInfo *>(STM.getInstrInfo());
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 9702565..2e08c9f 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -88,7 +88,6 @@ private:
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
- void InitM0ForLDS(MachineBasicBlock::iterator MI);
void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
void IndirectSrc(MachineInstr &MI);
void IndirectDst(MachineInstr &MI);
@@ -309,10 +308,9 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
#endif
// Clear this thread from the exec mask if the operand is negative
- if ((Op.isImm() || Op.isFPImm())) {
+ if ((Op.isImm())) {
// Constant operand: Set exec mask to 0 or do nothing
- if (Op.isImm() ? (Op.getImm() & 0x80000000) :
- Op.getFPImm()->isNegative()) {
+ if (Op.getImm() & 0x80000000) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
.addImm(0);
}
@@ -325,14 +323,6 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
MI.eraseFromParent();
}
-/// The m0 register stores the maximum allowable address for LDS reads and
-/// writes. Its value must be at least the size in bytes of LDS allocated by
-/// the shader. For simplicity, we set it to the maximum possible value.
-void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
- AMDGPU::M0).addImm(0xffffffff);
-}
-
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
MachineBasicBlock &MBB = *MI.getParent();
@@ -349,7 +339,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
} else {
assert(AMDGPU::SReg_64RegClass.contains(Save));
- assert(AMDGPU::VReg_32RegClass.contains(Idx));
+ assert(AMDGPU::VGPR_32RegClass.contains(Idx));
// Save the EXEC mask
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
@@ -391,12 +381,6 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
.addReg(Save);
}
- // FIXME: Are there any values other than the LDS address clamp that need to
- // be stored in the m0 register and may be live for more than a few
- // instructions? If so, we should save the m0 register at the beginning
- // of this function and restore it here.
- // FIXME: Add support for LDS direct loads.
- InitM0ForLDS(&MI);
MI.eraseFromParent();
}
@@ -450,7 +434,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
bool HaveKill = false;
- bool NeedM0 = false;
bool NeedWQM = false;
bool NeedFlat = false;
unsigned Depth = 0;
@@ -464,16 +447,12 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (TII->isDS(MI.getOpcode())) {
- NeedM0 = true;
+ if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode()))
NeedWQM = true;
- }
// Flat uses m0 in case it needs to access LDS.
- if (TII->isFLAT(MI.getOpcode())) {
- NeedM0 = true;
+ if (TII->isFLAT(MI.getOpcode()))
NeedFlat = true;
- }
switch (MI.getOpcode()) {
default: break;
@@ -534,23 +513,10 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_INDIRECT_DST_V16:
IndirectDst(MI);
break;
-
- case AMDGPU::V_INTERP_P1_F32:
- case AMDGPU::V_INTERP_P2_F32:
- case AMDGPU::V_INTERP_MOV_F32:
- NeedWQM = true;
- break;
}
}
}
- if (NeedM0) {
- MachineBasicBlock &MBB = MF.front();
- // Initialize M0 to a value that won't cause LDS access to be discarded
- // due to offset clamping
- InitM0ForLDS(MBB.getFirstNonPHI());
- }
-
if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
MachineBasicBlock &MBB = MF.front();
BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp
index 65b892c..67421e2 100644
--- a/lib/Target/R600/SILowerI1Copies.cpp
+++ b/lib/Target/R600/SILowerI1Copies.cpp
@@ -85,30 +85,6 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
- continue;
- }
-
- if (MI.getOpcode() == AMDGPU::V_AND_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32));
- continue;
- }
-
- if (MI.getOpcode() == AMDGPU::V_OR_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32));
- continue;
- }
-
- if (MI.getOpcode() == AMDGPU::V_XOR_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e32));
- continue;
- }
-
if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
unsigned Reg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
@@ -117,39 +93,59 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- if (MI.getOpcode() != AMDGPU::COPY ||
- !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
- !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
+ if (MI.getOpcode() != AMDGPU::COPY)
continue;
+ const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Src = MI.getOperand(1);
+
+ if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
+ !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+ continue;
- const TargetRegisterClass *DstRC =
- MRI.getRegClass(MI.getOperand(0).getReg());
- const TargetRegisterClass *SrcRC =
- MRI.getRegClass(MI.getOperand(1).getReg());
+ const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
if (DstRC == &AMDGPU::VReg_1RegClass &&
TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64))
- .addOperand(MI.getOperand(0))
- .addImm(0)
- .addImm(-1)
- .addOperand(MI.getOperand(1));
+ I1Defs.push_back(Dst.getReg());
+ DebugLoc DL = MI.getDebugLoc();
+
+ MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
+ if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
+ if (DefInst->getOperand(1).isImm()) {
+ I1Defs.push_back(Dst.getReg());
+
+ int64_t Val = DefInst->getOperand(1).getImm();
+ assert(Val == 0 || Val == -1);
+
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
+ .addOperand(Dst)
+ .addImm(Val);
+ MI.eraseFromParent();
+ continue;
+ }
+ }
+
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
+ .addOperand(Dst)
+ .addImm(0)
+ .addImm(-1)
+ .addOperand(Src);
MI.eraseFromParent();
} else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
SrcRC == &AMDGPU::VReg_1RegClass) {
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addImm(0);
+ .addOperand(Dst)
+ .addOperand(Src)
+ .addImm(0);
MI.eraseFromParent();
}
}
}
for (unsigned Reg : I1Defs)
- MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass);
+ MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
return false;
}
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index d58f31d..587ea63 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -29,6 +29,7 @@ void SIMachineFunctionInfo::anchor() {}
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
TIDReg(AMDGPU::NoRegister),
+ HasSpilledVGPRs(false),
PSInputAddr(0),
NumUserSGPRs(0),
LDSWaveSpillSize(0) { }
@@ -38,8 +39,8 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
unsigned FrameIndex,
unsigned SubIdx) {
const MachineFrameInfo *FrameInfo = MF->getFrameInfo();
- const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(
- MF->getTarget().getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
+ const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
+ MF->getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
MachineRegisterInfo &MRI = MF->getRegInfo();
int64_t Offset = FrameInfo->getObjectOffset(FrameIndex);
Offset += SubIdx * 4;
@@ -50,7 +51,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
struct SpilledReg Spill;
if (!LaneVGPRs.count(LaneVGPRIdx)) {
- unsigned LaneVGPR = TRI->findUnusedVGPR(MRI);
+ unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
MRI.setPhysRegUsed(LaneVGPR);
@@ -69,7 +70,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
const MachineFunction &MF) const {
- const AMDGPUSubtarget &ST = MF.getTarget().getSubtarget<AMDGPUSubtarget>();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
// FIXME: We should get this information from kernel attributes if it
// is available.
return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize();
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 6bb8f9d..667da4c 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -29,6 +29,7 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
void anchor() override;
unsigned TIDReg;
+ bool HasSpilledVGPRs;
public:
@@ -49,9 +50,12 @@ public:
unsigned NumUserSGPRs;
std::map<unsigned, unsigned> LaneVGPRs;
unsigned LDSWaveSpillSize;
+ unsigned ScratchOffsetReg;
bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
unsigned getTIDReg() const { return TIDReg; };
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+ bool hasSpilledVGPRs() const { return HasSpilledVGPRs; }
+ void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; }
unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
};
diff --git a/lib/Target/R600/SIPrepareScratchRegs.cpp b/lib/Target/R600/SIPrepareScratchRegs.cpp
new file mode 100644
index 0000000..0a57a5b
--- /dev/null
+++ b/lib/Target/R600/SIPrepareScratchRegs.cpp
@@ -0,0 +1,208 @@
+//===-- SIPrepareScratchRegs.cpp - Prepare VGPR spill scratch regs --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This pass loads scratch pointer and scratch offset into a register or a
+/// frame index which can be used anywhere in the program. These values will
+/// be used for spilling VGPRs.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+
+using namespace llvm;
+
+namespace {
+
+class SIPrepareScratchRegs : public MachineFunctionPass {
+
+private:
+ static char ID;
+
+public:
+ SIPrepareScratchRegs() : MachineFunctionPass(ID) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "SI prepare scratch registers";
+ }
+
+};
+
+} // End anonymous namespace
+
+char SIPrepareScratchRegs::ID = 0;
+
+FunctionPass *llvm::createSIPrepareScratchRegs() {
+ return new SIPrepareScratchRegs();
+}
+
+bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+ MachineBasicBlock *Entry = MF.begin();
+ MachineBasicBlock::iterator I = Entry->begin();
+ DebugLoc DL = I->getDebugLoc();
+
+ // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to
+ // run this pass.
+ if (!MFI->hasSpilledVGPRs())
+ return false;
+
+ unsigned ScratchPtrPreloadReg =
+ TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
+ unsigned ScratchOffsetPreloadReg =
+ TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
+ if (!Entry->isLiveIn(ScratchPtrPreloadReg))
+ Entry->addLiveIn(ScratchPtrPreloadReg);
+
+ if (!Entry->isLiveIn(ScratchOffsetPreloadReg))
+ Entry->addLiveIn(ScratchOffsetPreloadReg);
+
+ // Load the scratch offset.
+ unsigned ScratchOffsetReg =
+ TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass);
+ int ScratchOffsetFI = -1;
+
+ if (ScratchOffsetReg != AMDGPU::NoRegister) {
+ // Found an SGPR to use
+ MRI.setPhysRegUsed(ScratchOffsetReg);
+ BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
+ .addReg(ScratchOffsetPreloadReg);
+ } else {
+ // No SGPR is available, we must spill.
+ ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4);
+ BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE))
+ .addReg(ScratchOffsetPreloadReg)
+ .addFrameIndex(ScratchOffsetFI)
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+ .addReg(AMDGPU::SGPR0, RegState::Undef);
+ }
+
+
+ // Now that we have the scratch pointer and offset values, we need to
+ // add them to all the SI_SPILL_V* instructions.
+
+ RegScavenger RS;
+ unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4);
+ RS.addScavengingFrameIndex(ScratchRsrcFI);
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+ // Add the scratch offset reg as a live-in so that the register scavenger
+ // doesn't re-use it.
+ if (!MBB.isLiveIn(ScratchOffsetReg) &&
+ ScratchOffsetReg != AMDGPU::NoRegister)
+ MBB.addLiveIn(ScratchOffsetReg);
+ RS.enterBasicBlock(&MBB);
+
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ MachineInstr &MI = *I;
+ RS.forward(I);
+ DebugLoc DL = MI.getDebugLoc();
+ switch(MI.getOpcode()) {
+ default: break;
+ case AMDGPU::SI_SPILL_V512_SAVE:
+ case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V128_SAVE:
+ case AMDGPU::SI_SPILL_V96_SAVE:
+ case AMDGPU::SI_SPILL_V64_SAVE:
+ case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V32_RESTORE:
+ case AMDGPU::SI_SPILL_V64_RESTORE:
+ case AMDGPU::SI_SPILL_V128_RESTORE:
+ case AMDGPU::SI_SPILL_V256_RESTORE:
+ case AMDGPU::SI_SPILL_V512_RESTORE:
+
+ // Scratch resource
+ unsigned ScratchRsrcReg =
+ RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0);
+
+ uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+ 0xffffffff; // Size
+
+ unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
+ .addImm(Rsrc & 0xffffffff)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
+ .addImm(Rsrc >> 32)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ // Scratch Offset
+ if (ScratchOffsetReg == AMDGPU::NoRegister) {
+ ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE),
+ ScratchOffsetReg)
+ .addFrameIndex(ScratchOffsetFI)
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+ .addReg(AMDGPU::SGPR0, RegState::Undef);
+ } else if (!MBB.isLiveIn(ScratchOffsetReg)) {
+ MBB.addLiveIn(ScratchOffsetReg);
+ }
+
+ if (ScratchRsrcReg == AMDGPU::NoRegister ||
+ ScratchOffsetReg == AMDGPU::NoRegister) {
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ Ctx.emitError("ran out of SGPRs for spilling VGPRs");
+ ScratchRsrcReg = AMDGPU::SGPR0;
+ ScratchOffsetReg = AMDGPU::SGPR0;
+ }
+ MI.getOperand(2).setReg(ScratchRsrcReg);
+ MI.getOperand(2).setIsKill(true);
+ MI.getOperand(2).setIsUndef(false);
+ MI.getOperand(3).setReg(ScratchOffsetReg);
+ MI.getOperand(3).setIsUndef(false);
+ MI.getOperand(3).setIsKill(false);
+ MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true));
+ MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true));
+ MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true));
+ MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true));
+
+ break;
+ }
+ }
+ }
+ return true;
+}
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index cffea12..9224e14 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -40,6 +40,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
Reserved.set(AMDGPU::FLAT_SCR);
+ Reserved.set(AMDGPU::FLAT_SCR_LO);
+ Reserved.set(AMDGPU::FLAT_SCR_HI);
// Reserve some VGPRs to use as temp registers in case we have to spill VGPRs
Reserved.set(AMDGPU::VGPR255);
@@ -48,9 +50,32 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
-unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
- MachineFunction &MF) const {
- return RC->getNumRegs();
+unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
+
+ // FIXME: We should adjust the max number of waves based on LDS size.
+ unsigned SGPRLimit = getNumSGPRsAllowed(ST.getMaxWavesPerCU());
+ unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU());
+
+ for (regclass_iterator I = regclass_begin(), E = regclass_end();
+ I != E; ++I) {
+
+ unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1);
+ unsigned Limit;
+
+ if (isSGPRClass(*I)) {
+ Limit = SGPRLimit / NumSubRegs;
+ } else {
+ Limit = VGPRLimit / NumSubRegs;
+ }
+
+ const int *Sets = getRegClassPressureSets(*I);
+ assert(Sets);
+ for (unsigned i = 0; Sets[i] != -1; ++i) {
+ if (Sets[i] == (int)Idx)
+ return Limit;
+ }
+ }
+ return 256;
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
@@ -92,6 +117,60 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
}
}
+void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
+ unsigned LoadStoreOp,
+ unsigned Value,
+ unsigned ScratchRsrcReg,
+ unsigned ScratchOffset,
+ int64_t Offset,
+ RegScavenger *RS) const {
+
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+ MachineBasicBlock *MBB = MI->getParent();
+ const MachineFunction *MF = MI->getParent()->getParent();
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ DebugLoc DL = MI->getDebugLoc();
+ bool IsLoad = TII->get(LoadStoreOp).mayLoad();
+
+ bool RanOutOfSGPRs = false;
+ unsigned SOffset = ScratchOffset;
+
+ unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ unsigned Size = NumSubRegs * 4;
+
+ if (!isUInt<12>(Offset + Size)) {
+ SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
+ if (SOffset == AMDGPU::NoRegister) {
+ RanOutOfSGPRs = true;
+ SOffset = AMDGPU::SGPR0;
+ }
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
+ .addReg(ScratchOffset)
+ .addImm(Offset);
+ Offset = 0;
+ }
+
+ if (RanOutOfSGPRs)
+ Ctx.emitError("Ran out of SGPRs for spilling VGPRS");
+
+ for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
+ unsigned SubReg = NumSubRegs > 1 ?
+ getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
+ Value;
+ bool IsKill = (i == e - 1);
+
+ BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
+ .addReg(SubReg, getDefRegState(IsLoad))
+ .addReg(ScratchRsrcReg, getKillRegState(IsKill))
+ .addImm(Offset)
+ .addReg(SOffset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addReg(Value, RegState::Implicit | getDefRegState(IsLoad));
+ }
+}
+
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
@@ -125,7 +204,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
Ctx.emitError("Ran out of VGPRs for spilling SGPR");
}
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill.VGPR)
.addReg(SubReg)
.addImm(Spill.Lane);
@@ -154,13 +235,15 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
Ctx.emitError("Ran out of VGPRs for spilling SGPR");
}
- if (isM0) {
+ if (isM0)
SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
- }
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ SubReg)
.addReg(Spill.VGPR)
- .addImm(Spill.Lane);
+ .addImm(Spill.Lane)
+ .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
if (isM0) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addReg(SubReg);
@@ -177,71 +260,25 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V64_SAVE:
- case AMDGPU::SI_SPILL_V32_SAVE: {
- unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
- unsigned SrcReg = MI->getOperand(0).getReg();
- int64_t Offset = FrameInfo->getObjectOffset(Index);
- unsigned Size = NumSubRegs * 4;
- unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-
- for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
- unsigned SubReg = NumSubRegs > 1 ?
- getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) :
- SrcReg;
- Offset += (i * 4);
- MFI->LDSWaveSpillSize = std::max((unsigned)Offset + 4, (unsigned)MFI->LDSWaveSpillSize);
-
- unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg,
- Offset, Size);
-
- if (AddrReg == AMDGPU::NoRegister) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
- Ctx.emitError("Ran out of VGPRs for spilling VGPRS");
- AddrReg = AMDGPU::VGPR0;
- }
-
- // Store the value in LDS
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_WRITE_B32))
- .addImm(0) // gds
- .addReg(AddrReg, RegState::Kill) // addr
- .addReg(SubReg) // data0
- .addImm(0); // offset
- }
-
+ case AMDGPU::SI_SPILL_V32_SAVE:
+ buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
+ FrameInfo->getObjectOffset(Index), RS);
MI->eraseFromParent();
break;
- }
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_V64_RESTORE:
+ case AMDGPU::SI_SPILL_V96_RESTORE:
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE: {
- unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
- unsigned DstReg = MI->getOperand(0).getReg();
- int64_t Offset = FrameInfo->getObjectOffset(Index);
- unsigned Size = NumSubRegs * 4;
- unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-
- // FIXME: We could use DS_READ_B64 here to optimize for larger registers.
- for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
- unsigned SubReg = NumSubRegs > 1 ?
- getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) :
- DstReg;
-
- Offset += (i * 4);
- unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg,
- Offset, Size);
- if (AddrReg == AMDGPU::NoRegister) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
- Ctx.emitError("Ran out of VGPRs for spilling VGPRs");
- AddrReg = AMDGPU::VGPR0;
- }
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_READ_B32), SubReg)
- .addImm(0) // gds
- .addReg(AddrReg, RegState::Kill) // addr
- .addImm(0); //offset
- }
+ buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+ TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
+ FrameInfo->getObjectOffset(Index), RS);
MI->eraseFromParent();
break;
}
@@ -250,11 +287,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int64_t Offset = FrameInfo->getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
- unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj);
+ unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj);
BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
- FIOp.ChangeToRegister(TmpReg, false);
+ FIOp.ChangeToRegister(TmpReg, false, false, true);
}
}
}
@@ -264,7 +301,7 @@ const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
MVT VT) const {
switch(VT.SimpleTy) {
default:
- case MVT::i32: return &AMDGPU::VReg_32RegClass;
+ case MVT::i32: return &AMDGPU::VGPR_32RegClass;
}
}
@@ -276,7 +313,7 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
assert(!TargetRegisterInfo::isVirtualRegister(Reg));
static const TargetRegisterClass *BaseClasses[] = {
- &AMDGPU::VReg_32RegClass,
+ &AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
&AMDGPU::VReg_64RegClass,
&AMDGPU::SReg_64RegClass,
@@ -297,7 +334,7 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
}
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
- return getCommonSubClass(&AMDGPU::VReg_32RegClass, RC) ||
+ return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) ||
getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
@@ -312,7 +349,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
} else if (SRC == &AMDGPU::SCCRegRegClass) {
return &AMDGPU::VCCRegRegClass;
} else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
- return &AMDGPU::VReg_32RegClass;
+ return &AMDGPU::VGPR_32RegClass;
} else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
return &AMDGPU::VReg_64RegClass;
} else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
@@ -388,40 +425,17 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
return SubRC->getRegister(Index + Channel);
}
-bool SIRegisterInfo::regClassCanUseLiteralConstant(int RCID) const {
- switch (RCID) {
- default: return false;
- case AMDGPU::SSrc_32RegClassID:
- case AMDGPU::SSrc_64RegClassID:
- case AMDGPU::VSrc_32RegClassID:
- case AMDGPU::VSrc_64RegClassID:
- return true;
- }
-}
-
-bool SIRegisterInfo::regClassCanUseLiteralConstant(
- const TargetRegisterClass *RC) const {
- return regClassCanUseLiteralConstant(RC->getID());
+bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
+ return OpType == AMDGPU::OPERAND_REG_IMM32;
}
-bool SIRegisterInfo::regClassCanUseInlineConstant(int RCID) const {
- if (regClassCanUseLiteralConstant(RCID))
+bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
+ if (opCanUseLiteralConstant(OpType))
return true;
- switch (RCID) {
- default: return false;
- case AMDGPU::VCSrc_32RegClassID:
- case AMDGPU::VCSrc_64RegClassID:
- return true;
- }
-}
-
-bool SIRegisterInfo::regClassCanUseInlineConstant(
- const TargetRegisterClass *RC) const {
- return regClassCanUseInlineConstant(RC->getID());
+ return OpType == AMDGPU::OPERAND_REG_INLINE_C;
}
-
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const {
@@ -434,6 +448,8 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
case SIRegisterInfo::TGID_Z:
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
+ if (MFI->getShaderType() != ShaderType::COMPUTE)
+ return MFI->ScratchOffsetReg;
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
case SIRegisterInfo::SCRATCH_PTR:
return AMDGPU::SGPR2_SGPR3;
@@ -452,9 +468,8 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
/// \brief Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
// AMDGPU::NoRegister.
-unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const {
-
- const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass;
+unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC) const {
for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
I != E; ++I) {
@@ -464,3 +479,29 @@ unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const {
return AMDGPU::NoRegister;
}
+unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
+ switch(WaveCount) {
+ case 10: return 24;
+ case 9: return 28;
+ case 8: return 32;
+ case 7: return 36;
+ case 6: return 40;
+ case 5: return 48;
+ case 4: return 64;
+ case 3: return 84;
+ case 2: return 128;
+ default: return 256;
+ }
+}
+
+unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const {
+ switch(WaveCount) {
+ case 10: return 48;
+ case 9: return 56;
+ case 8: return 64;
+ case 7: return 72;
+ case 6: return 80;
+ case 5: return 96;
+ default: return 103;
+ }
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index c7e54db..d908ffd 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -17,6 +17,7 @@
#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
+#include "llvm/Support/Debug.h"
namespace llvm {
@@ -26,8 +27,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
BitVector getReservedRegs(const MachineFunction &MF) const override;
- unsigned getRegPressureLimit(const TargetRegisterClass *RC,
- MachineFunction &MF) const override;
+ unsigned getRegPressureSetLimit(unsigned Idx) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
@@ -42,7 +42,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
unsigned getHWRegIndex(unsigned Reg) const override;
/// \brief Return the 'base' register class for this register.
- /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc.
+ /// e.g. SGPR0 => SReg_32, VGPR0 => VGPR_32, SGPR0_SGPR1 => SReg_64, etc.
const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
/// \returns true if this class contains only SGPR registers
@@ -80,22 +80,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC,
unsigned Channel) const;
- /// \returns True if operands defined with this register class can accept
+ /// \returns True if operands defined with this operand type can accept
/// a literal constant (i.e. any 32-bit immediate).
- bool regClassCanUseLiteralConstant(int RCID) const;
+ bool opCanUseLiteralConstant(unsigned OpType) const;
- /// \returns True if operands defined with this register class can accept
- /// a literal constant (i.e. any 32-bit immediate).
- bool regClassCanUseLiteralConstant(const TargetRegisterClass *RC) const;
-
- /// \returns True if operands defined with this register class can accept
+ /// \returns True if operands defined with this operand type can accept
/// an inline constant. i.e. An integer value in the range (-16, 64) or
/// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
- bool regClassCanUseInlineConstant(int RCID) const;
-
- /// \returns True if operands defined with this register class can accept
- /// a literal constant. i.e. A value in the range (-16, 64).
- bool regClassCanUseInlineConstant(const TargetRegisterClass *RC) const;
+ bool opCanUseInlineConstant(unsigned OpType) const;
enum PreloadedValue {
TGID_X,
@@ -113,7 +105,22 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
unsigned getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const;
- unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) const;
+ /// \brief Give the maximum number of VGPRs per wave that still allows
+ /// \p WaveCount concurrent waves.
+ unsigned getNumVGPRsAllowed(unsigned WaveCount) const;
+
+ /// \brief Give the maximum number of SGPRs per wave that still allows
+ /// \p WaveCount concurrent waves.
+ unsigned getNumSGPRsAllowed(unsigned WaveCount) const;
+
+ unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC) const;
+
+private:
+ void buildScratchLoadStore(MachineBasicBlock::iterator MI,
+ unsigned LoadStoreOp, unsigned Value,
+ unsigned ScratchRsrcReg, unsigned ScratchOffset,
+ int64_t Offset, RegScavenger *RS) const;
};
} // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 45c2b41..8b25e95 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -21,7 +21,7 @@ def VCC_LO : SIReg<"vcc_lo", 106>;
def VCC_HI : SIReg<"vcc_hi", 107>;
// VCC for 64-bit instructions
-def VCC : RegisterWithSubRegs<"VCC", [VCC_LO, VCC_HI]> {
+def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 106;
@@ -36,14 +36,14 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> {
let HWEncoding = 126;
}
-def SCC : SIReg<"SCC", 253>;
-def M0 : SIReg <"M0", 124>;
+def SCC : SIReg<"scc", 253>;
+def M0 : SIReg <"m0", 124>;
def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes.
def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes.
// Pair to indicate location of scratch space for flat accesses.
-def FLAT_SCR : RegisterWithSubRegs <"FLAT_SCR", [FLAT_SCR_LO, FLAT_SCR_HI]> {
+def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 104;
@@ -184,9 +184,9 @@ def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
(add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
>;
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>;
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>;
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64,
(add SGPR_64, VCCReg, EXECReg, FLAT_SCR)
>;
@@ -197,8 +197,6 @@ def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256
def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
// Register class for all vector registers (VGPRs + Interploation Registers)
-def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
-
def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
@@ -211,31 +209,53 @@ def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256
def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
-def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>;
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
+ let Size = 32;
+}
+
+class RegImmOperand <RegisterClass rc> : RegisterOperand<rc> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_REG_IMM32";
+}
+
+class RegInlineOperand <RegisterClass rc> : RegisterOperand<rc> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_REG_INLINE_C";
+}
//===----------------------------------------------------------------------===//
// SSrc_* Operands with an SGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
-def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
+def SSrc_32 : RegImmOperand<SReg_32>;
-def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>;
+def SSrc_64 : RegImmOperand<SReg_64>;
+
+//===----------------------------------------------------------------------===//
+// SCSrc_* Operands with an SGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+def SCSrc_32 : RegInlineOperand<SReg_32>;
//===----------------------------------------------------------------------===//
// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
-def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
+def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>;
+
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+
+def VSrc_32 : RegImmOperand<VS_32>;
-def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+def VSrc_64 : RegImmOperand<VS_64>;
//===----------------------------------------------------------------------===//
// VCSrc_* Operands with an SGPR, VGPR or an inline constant
//===----------------------------------------------------------------------===//
-def VCSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
+def VCSrc_32 : RegInlineOperand<VS_32>;
-def VCSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+def VCSrc_64 : RegInlineOperand<VS_64>;
//===----------------------------------------------------------------------===//
// SGPR and VGPR register classes
diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td
index 28b65b8..9b1f676 100644
--- a/lib/Target/R600/SISchedule.td
+++ b/lib/Target/R600/SISchedule.td
@@ -7,9 +7,85 @@
//
//===----------------------------------------------------------------------===//
//
-// TODO: This is just a place holder for now.
+// MachineModel definitions for Southern Islands (SI)
//
//===----------------------------------------------------------------------===//
+def WriteBranch : SchedWrite;
+def WriteExport : SchedWrite;
+def WriteLDS : SchedWrite;
+def WriteSALU : SchedWrite;
+def WriteSMEM : SchedWrite;
+def WriteVMEM : SchedWrite;
-def SI_Itin : ProcessorItineraries <[], [], []>;
+// Vector ALU instructions
+def Write32Bit : SchedWrite;
+def WriteQuarterRate32 : SchedWrite;
+
+def WriteFloatFMA : SchedWrite;
+
+def WriteDouble : SchedWrite;
+def WriteDoubleAdd : SchedWrite;
+
+def SIFullSpeedModel : SchedMachineModel;
+def SIQuarterSpeedModel : SchedMachineModel;
+
+// BufferSize = 0 means the processors are in-order.
+let BufferSize = 0 in {
+
+// XXX: Are the resource counts correct?
+def HWBranch : ProcResource<1>;
+def HWExport : ProcResource<7>; // Taken from S_WAITCNT
+def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT
+def HWSALU : ProcResource<1>;
+def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT
+def HWVALU : ProcResource<1>;
+
+}
+
+class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
+ int latency> : WriteRes<write, resources> {
+ let Latency = latency;
+}
+
+class HWVALUWriteRes<SchedWrite write, int latency> :
+ HWWriteRes<write, [HWVALU], latency>;
+
+
+// The latency numbers are taken from AMD Accelerated Parallel Processing
+// guide. They may not be accurate.
+
+// The latency values are 1 / (operations / cycle) / 4.
+multiclass SICommonWriteRes {
+
+ def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ???
+ def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ???
+ def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64
+ def : HWWriteRes<WriteSALU, [HWSALU], 1>;
+ def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
+ def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
+
+ def : HWVALUWriteRes<Write32Bit, 1>;
+ def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+}
+
+
+let SchedModel = SIFullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 1>;
+def : HWVALUWriteRes<WriteDouble, 4>;
+def : HWVALUWriteRes<WriteDoubleAdd, 2>;
+
+} // End SchedModel = SIFullSpeedModel
+
+let SchedModel = SIQuarterSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 16>;
+def : HWVALUWriteRes<WriteDouble, 16>;
+def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+
+} // End SchedModel = SIQuarterSpeedModel
diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp
index 45e83f5..97bbd78 100644
--- a/lib/Target/R600/SIShrinkInstructions.cpp
+++ b/lib/Target/R600/SIShrinkInstructions.cpp
@@ -10,6 +10,7 @@
//
#include "AMDGPU.h"
+#include "AMDGPUMCInstLower.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
@@ -126,37 +127,32 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
TII->isVOPC(MI.getOpcode()));
const SIRegisterInfo &TRI = TII->getRegisterInfo();
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
// Only one literal constant is allowed per instruction, so if src0 is a
// literal constant then we can't do any folding.
- if ((Src0->isImm() || Src0->isFPImm()) && TII->isLiteralConstant(*Src0))
+ if (Src0.isImm() &&
+ TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
return;
-
// Literal constants and SGPRs can only be used in Src0, so if Src0 is an
// SGPR, we cannot commute the instruction, so we can't fold any literal
// constants.
- if (Src0->isReg() && !isVGPR(Src0, TRI, MRI))
+ if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI))
return;
// Try to fold Src0
- if (Src0->isReg()) {
- unsigned Reg = Src0->getReg();
+ if (Src0.isReg()) {
+ unsigned Reg = Src0.getReg();
MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
if (Def && Def->isMoveImmediate()) {
MachineOperand &MovSrc = Def->getOperand(1);
bool ConstantFolded = false;
if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
- Src0->ChangeToImmediate(MovSrc.getImm());
+ Src0.ChangeToImmediate(MovSrc.getImm());
ConstantFolded = true;
- } else if (MovSrc.isFPImm()) {
- const ConstantFP *CFP = MovSrc.getFPImm();
- if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle) {
- Src0->ChangeToFPImmediate(CFP);
- ConstantFolded = true;
- }
}
if (ConstantFolded) {
if (MRI.use_empty(Reg))
@@ -193,13 +189,12 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
const MachineOperand &Src = MI.getOperand(1);
- // TODO: Handle FPImm?
if (Src.isImm()) {
- if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src)) {
+ if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
- continue;
- }
}
+
+ continue;
}
if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
@@ -213,13 +208,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
-
- // Op32 could be -1 here if we started with an instruction that had a
+ // getVOPe32 could be -1 here if we started with an instruction that had
// a 32-bit encoding and then commuted it to an instruction that did not.
- if (Op32 == -1)
+ if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;
+ int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
+
if (TII->isVOPC(Op32)) {
unsigned DstReg = MI.getOperand(0).getReg();
if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
index 9318dc1..27bbf4f 100644
--- a/lib/Target/R600/SITypeRewriter.cpp
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -61,8 +61,7 @@ bool SITypeRewriter::doInitialization(Module &M) {
}
bool SITypeRewriter::runOnFunction(Function &F) {
- AttributeSet Set = F.getAttributes();
- Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, "ShaderType");
+ Attribute A = F.getFnAttribute("ShaderType");
unsigned ShaderType = ShaderType::COMPUTE;
if (A.isStringAttribute()) {
diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
index f437564..d723d6e 100644
--- a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
@@ -16,11 +16,15 @@
using namespace llvm;
-/// \brief The target for the AMDGPU backend
+/// \brief The target which supports all AMD GPUs. This will eventually
+/// be deprecated and there will be a R600 target and a GCN target.
Target llvm::TheAMDGPUTarget;
+/// \brief The target for GCN GPUs
+Target llvm::TheGCNTarget;
/// \brief Extern function to initialize the targets for the AMDGPU backend
extern "C" void LLVMInitializeR600TargetInfo() {
RegisterTarget<Triple::r600, false>
R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
+ RegisterTarget<Triple::amdgcn, false> GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs"); // separate Target object from TheAMDGPUTarget
}
diff --git a/lib/Target/R600/VIInstrFormats.td b/lib/Target/R600/VIInstrFormats.td
new file mode 100644
index 0000000..d8738f9
--- /dev/null
+++ b/lib/Target/R600/VIInstrFormats.td
@@ -0,0 +1,166 @@
+//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// VI Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class DSe_vi <bits<8> op> : Enc64 { // VI bit layout for the DS format
+ bits<8> vdst;
+ bits<1> gds;
+ bits<8> addr;
+ bits<8> data0;
+ bits<8> data1;
+ bits<8> offset0;
+ bits<8> offset1;
+
+ let Inst{7-0} = offset0;
+ let Inst{15-8} = offset1;
+ let Inst{16} = gds;
+ let Inst{24-17} = op; // opcode from the class parameter; bit 25 is not assigned
+ let Inst{31-26} = 0x36; // fixed encoding value for this format
+ let Inst{39-32} = addr;
+ let Inst{47-40} = data0;
+ let Inst{55-48} = data1;
+ let Inst{63-56} = vdst;
+}
+
+class MUBUFe_vi <bits<7> op> : Enc64 { // VI bit layout for the MUBUF format
+ bits<12> offset;
+ bits<1> offen;
+ bits<1> idxen;
+ bits<1> glc;
+ bits<1> lds;
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+
+ let Inst{11-0} = offset;
+ let Inst{12} = offen;
+ let Inst{13} = idxen;
+ let Inst{14} = glc; // bit 15 is not assigned by this class
+ let Inst{16} = lds;
+ let Inst{17} = slc;
+ let Inst{24-18} = op; // opcode from the class parameter; bit 25 is not assigned
+ let Inst{31-26} = 0x38; // fixed encoding value for this format
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = vdata;
+ let Inst{52-48} = srsrc{6-2}; // the two low bits of srsrc are not encoded
+ let Inst{55} = tfe; // bits 54-53 are not assigned by this class
+ let Inst{63-56} = soffset;
+}
+
+class MTBUFe_vi <bits<4> op> : Enc64 { // VI bit layout for the MTBUF format
+ bits<12> offset;
+ bits<1> offen;
+ bits<1> idxen;
+ bits<1> glc;
+ bits<4> dfmt;
+ bits<3> nfmt;
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+
+ let Inst{11-0} = offset;
+ let Inst{12} = offen;
+ let Inst{13} = idxen;
+ let Inst{14} = glc;
+ let Inst{18-15} = op; // opcode from the class parameter
+ let Inst{22-19} = dfmt;
+ let Inst{25-23} = nfmt;
+ let Inst{31-26} = 0x3a; // fixed encoding value for this format
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = vdata;
+ let Inst{52-48} = srsrc{6-2}; // the two low bits of srsrc are not encoded
+ let Inst{54} = slc; // bit 53 is not assigned by this class
+ let Inst{55} = tfe;
+ let Inst{63-56} = soffset;
+}
+
+class SMEMe_vi <bits<8> op, bit imm> : Enc64 { // VI bit layout for the SMEM format
+ bits<7> sbase;
+ bits<7> sdata;
+ bits<1> glc;
+ bits<20> offset;
+
+ let Inst{5-0} = sbase{6-1}; // bit 0 of sbase is not encoded
+ let Inst{12-6} = sdata; // bits 15-13 are not assigned by this class
+ let Inst{16} = glc;
+ let Inst{17} = imm; // class parameter, not an operand; presumably flags an immediate offset (confirm)
+ let Inst{25-18} = op; // opcode from the class parameter
+ let Inst{31-26} = 0x30; // fixed encoding value for this format
+ let Inst{51-32} = offset; // bits 63-52 are not assigned by this class
+}
+
+class VOP3e_vi <bits<10> op> : Enc64 { // VI bit layout for the VOP3 format
+ bits<8> vdst;
+ bits<2> src0_modifiers;
+ bits<9> src0;
+ bits<2> src1_modifiers;
+ bits<9> src1;
+ bits<2> src2_modifiers;
+ bits<9> src2;
+ bits<1> clamp;
+ bits<2> omod;
+
+ let Inst{7-0} = vdst;
+ let Inst{8} = src0_modifiers{1}; // modifier bit 1 of src0..src2 lands in bits 8..10 (presumably abs; confirm)
+ let Inst{9} = src1_modifiers{1};
+ let Inst{10} = src2_modifiers{1}; // bits 14-11 are not assigned by this class
+ let Inst{15} = clamp;
+ let Inst{25-16} = op; // opcode from the class parameter
+ let Inst{31-26} = 0x34; // fixed encoding value for this format
+ let Inst{40-32} = src0;
+ let Inst{49-41} = src1;
+ let Inst{58-50} = src2;
+ let Inst{60-59} = omod;
+ let Inst{61} = src0_modifiers{0}; // modifier bit 0 of src0..src2 lands in bits 61..63 (presumably neg; confirm)
+ let Inst{62} = src1_modifiers{0};
+ let Inst{63} = src2_modifiers{0};
+}
+
+class VOP3be_vi <bits<10> op> : Enc64 { // VI VOP3 layout variant that carries an sdst field
+ bits<8> vdst;
+ bits<2> src0_modifiers;
+ bits<9> src0;
+ bits<2> src1_modifiers;
+ bits<9> src1;
+ bits<2> src2_modifiers;
+ bits<9> src2;
+ bits<7> sdst;
+ bits<2> omod;
+ bits<1> clamp;
+
+ let Inst{7-0} = vdst;
+ let Inst{14-8} = sdst; // modifier bit 1 of each source is declared above but not encoded here
+ let Inst{15} = clamp;
+ let Inst{25-16} = op; // opcode from the class parameter
+ let Inst{31-26} = 0x34; // fixed encoding value; same value as VOP3e_vi
+ let Inst{40-32} = src0;
+ let Inst{49-41} = src1;
+ let Inst{58-50} = src2;
+ let Inst{60-59} = omod;
+ let Inst{61} = src0_modifiers{0}; // modifier bit 0 of src0..src2 lands in bits 61..63
+ let Inst{62} = src1_modifiers{0};
+ let Inst{63} = src2_modifiers{0};
+}
+
+class EXPe_vi : EXPe { // reuses the EXPe layout; only bits 31-26 are set here
+ let Inst{31-26} = 0x31; // encoding value used for VI
+}
+
+class VINTRPe_vi <bits<2> op> : VINTRPe <op> { // reuses the VINTRPe layout; only bits 31-26 are set here
+ let Inst{31-26} = 0x35; // encoding value used for VI
+}
diff --git a/lib/Target/R600/VIInstructions.td b/lib/Target/R600/VIInstructions.td
new file mode 100644
index 0000000..4a6e933
--- /dev/null
+++ b/lib/Target/R600/VIInstructions.td
@@ -0,0 +1,25 @@
+//===-- VIInstructions.td - VI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for VI and newer.
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// SMEM Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isVI] in { // patterns inside are guarded by the isVI predicate
+
+// 1. Offset as 20-bit DWORD immediate: map to S_BUFFER_LOAD_DWORD_IMM
+def : Pat <
+ (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
+>;
+
+} // End Predicates = [isVI]