Diffstat (limited to 'lib/Target/R600')
70 files changed, 6821 insertions, 3211 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 261075e..fb87cc5 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -38,6 +38,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass(); // SI Passes FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); +FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); @@ -46,6 +47,10 @@ FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); +FunctionPass *createSIPrepareScratchRegs(); + +void initializeSIFoldOperandsPass(PassRegistry &); +extern char &SIFoldOperandsID; void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; @@ -59,19 +64,20 @@ Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); ModulePass *createAMDGPUAlwaysInlinePass(); -/// \brief Creates an AMDGPU-specific Target Transformation Info pass. -ImmutablePass * -createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM); - void initializeSIFixSGPRLiveRangesPass(PassRegistry&); extern char &SIFixSGPRLiveRangesID; extern Target TheAMDGPUTarget; +extern Target TheGCNTarget; namespace AMDGPU { enum TargetIndex { - TI_CONSTDATA_START + TI_CONSTDATA_START, + TI_SCRATCH_RSRC_DWORD0, + TI_SCRATCH_RSRC_DWORD1, + TI_SCRATCH_RSRC_DWORD2, + TI_SCRATCH_RSRC_DWORD3 }; } diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index 4cf1243..a7d48b3 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -48,6 +48,12 @@ def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", "Enable double precision denormal handling", [FeatureFP64]>; +def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", + "FastFMAF32", + "true", + "Assuming f32 fma is at least as fast as mul + add", + []>; + // Some instructions do not support denormals despite this flag. Using // fp32 denormals also causes instructions to run at the double // precision rate for the device. 
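A note on FeatureFastFMAF32 above: an fma computes a*b + c with a single rounding, so it is not bit-identical to a separate multiply and add; the flag only asserts that the fused form is at least as fast, letting lowering prefer it where precision rules allow. A standalone C++ sketch (not part of the patch) of the single- versus double-rounding difference:

    // Illustration only: fma rounds once, mul+add rounds twice, so the
    // two forms can differ in the low bits of the result.
    #include <cmath>
    #include <cstdio>

    int main() {
      float a = 1.0f + 1.0f / 8192.0f;  // 1 + 2^-13, exact in float
      float b = 1.0f - 1.0f / 8192.0f;  // 1 - 2^-13, exact in float
      float c = -1.0f;
      float fused = std::fmaf(a, b, c); // exact a*b + c = -2^-26
      float split = a * b + c;          // a*b rounds to 1.0f first, so 0
      std::printf("fused=%a split=%a\n", fused, split);
    }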
@@ -92,6 +98,11 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "true", "Support flat address space">; +def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", + "EnableVGPRSpilling", + "true", + "Enable spilling of VGPRs to scratch memory">; + class SubtargetFeatureFetchLimit <string Value> : SubtargetFeature <"fetch"#Value, "TexVTXClauseSize", @@ -147,10 +158,16 @@ def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureFlatAddressSpace]>; + +def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace]>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { let guessInstructionProperties = 1; + let noNamedPositionallyEncodedOperands = 1; } def AMDGPUAsmParser : AsmParser { diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 5511d7c..92bc314 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -18,6 +18,7 @@ #include "AMDGPUAsmPrinter.h" #include "AMDGPU.h" +#include "AMDKernelCodeT.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" @@ -57,7 +58,7 @@ using namespace llvm; // instructions to run at the double precision rate for the device so it's // probably best to just report no single precision denormals. static uint32_t getFPMode(const MachineFunction &F) { - const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>(); // TODO: Is there any real use for the flush in only / flush out only modes? 
uint32_t FP32Denormals = @@ -72,19 +73,20 @@ static uint32_t getFPMode(const MachineFunction &F) { FP_DENORM_MODE_DP(FP64Denormals); } -static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, - MCStreamer &Streamer) { - return new AMDGPUAsmPrinter(tm, Streamer); +static AsmPrinter * +createAMDGPUAsmPrinterPass(TargetMachine &tm, + std::unique_ptr<MCStreamer> &&Streamer) { + return new AMDGPUAsmPrinter(tm, std::move(Streamer)); } extern "C" void LLVMInitializeR600AsmPrinter() { TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); + TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass); } -AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer) { - DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode(); -} +AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)) {} void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { @@ -106,14 +108,17 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { EmitFunctionHeader(); MCContext &Context = getObjFileLowering().getContext(); - const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config", - ELF::SHT_PROGBITS, 0, - SectionKind::getReadOnly()); + const MCSectionELF *ConfigSection = + Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); OutStreamer.SwitchSection(ConfigSection); - const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; - if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { + if (STM.isAmdHsaOS()) { + getSIProgramInfo(KernelInfo, MF); + EmitAmdKernelCodeT(MF, KernelInfo); + OutStreamer.EmitCodeAlignment(2 << (MF.getAlignment() - 1)); + } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { getSIProgramInfo(KernelInfo, MF); EmitProgramInfoSI(MF, KernelInfo); } else { @@ -128,10 +133,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { EmitFunctionBody(); if (isVerbose()) { - const MCSectionELF *CommentSection - = Context.getELFSection(".AMDGPU.csdata", - ELF::SHT_PROGBITS, 0, - SectionKind::getReadOnly()); + const MCSectionELF *CommentSection = + Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); OutStreamer.SwitchSection(CommentSection); if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { @@ -156,22 +159,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { } if (STM.dumpCode()) { -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - MF.dump(); -#endif - if (DisasmEnabled) { - OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm", - ELF::SHT_NOTE, 0, - SectionKind::getReadOnly())); + OutStreamer.SwitchSection( + Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); - for (size_t i = 0; i < DisasmLines.size(); ++i) { - std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); - Comment += " ; " + HexLines[i] + "\n"; + for (size_t i = 0; i < DisasmLines.size(); ++i) { + std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); + Comment += " ; " + HexLines[i] + "\n"; - OutStreamer.EmitBytes(StringRef(DisasmLines[i])); - OutStreamer.EmitBytes(StringRef(Comment)); - } + OutStreamer.EmitBytes(StringRef(DisasmLines[i])); + OutStreamer.EmitBytes(StringRef(Comment)); } } @@ -181,10 +178,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { void 
AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { unsigned MaxGPR = 0; bool killPixel = false; - const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const R600RegisterInfo *RI = + static_cast<const R600RegisterInfo *>(STM.getRegisterInfo()); const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { @@ -240,13 +237,15 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) const { + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); uint64_t CodeSize = 0; unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; bool FlatUsed = false; - const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); + const SIRegisterInfo *RI = + static_cast<const SIRegisterInfo *>(STM.getRegisterInfo()); for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { @@ -285,7 +284,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (AMDGPU::SReg_32RegClass.contains(reg)) { isSGPR = true; width = 1; - } else if (AMDGPU::VReg_32RegClass.contains(reg)) { + } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { isSGPR = false; width = 1; } else if (AMDGPU::SReg_64RegClass.contains(reg)) { @@ -340,6 +339,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.NumVGPR = MaxVGPR + 1; ProgInfo.NumSGPR = MaxSGPR + 1; + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; + ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode // register. ProgInfo.FloatMode = getFPMode(MF); @@ -356,21 +357,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.FlatUsed = FlatUsed; ProgInfo.VCCUsed = VCCUsed; ProgInfo.CodeLen = CodeSize; -} - -void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) { - const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - - unsigned RsrcReg; - switch (MFI->getShaderType()) { - default: // Fall through - case ShaderType::COMPUTE: RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break; - case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break; - case ShaderType::PIXEL: RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break; - case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break; - } unsigned LDSAlignShift; if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { @@ -384,59 +370,203 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, unsigned LDSSpillSize = MFI->LDSWaveSpillSize * MFI->getMaximumWorkGroupSize(MF); - unsigned LDSBlocks = - RoundUpToAlignment(MFI->LDSSize + LDSSpillSize, - 1 << LDSAlignShift) >> LDSAlignShift; + ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; + ProgInfo.LDSBlocks = + RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; // Scratch is allocated in 256 dword blocks. 
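Before the scratch sizing that the comment above introduces, a note on the new VGPRBlocks/SGPRBlocks fields: they encode register usage in the hardware's allocation granules (4 VGPRs or 8 SGPRs per granule, with the field holding granules minus one). A standalone C++ mirror of that arithmetic, under the granule-size assumption:

    // Mirror of the (count - 1) / granule encoding used for the
    // PGM_RSRC1 fields; a field value of 0 still means one granule.
    #include <cassert>
    #include <cstdint>

    static uint32_t blocks(uint32_t used, uint32_t granule) {
      assert(used > 0 && "at least one register is always allocated");
      return (used - 1) / granule;
    }

    int main() {
      assert(blocks(23, 4) == 5); // 23 VGPRs -> 6 granules of 4 = 24
      assert(blocks(10, 8) == 1); // 10 SGPRs -> 2 granules of 8 = 16
    }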
unsigned ScratchAlignShift = 10; // We need to program the hardware with the amount of scratch memory that - // is used by the entire wave. KernelInfo.ScratchSize is the amount of + // is used by the entire wave. ProgInfo.ScratchSize is the amount of // scratch memory used per thread. - unsigned ScratchBlocks = - RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(), + ProgInfo.ScratchBlocks = + RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), 1 << ScratchAlignShift) >> ScratchAlignShift; - unsigned VGPRBlocks = (KernelInfo.NumVGPR - 1) / 4; - unsigned SGPRBlocks = (KernelInfo.NumSGPR - 1) / 8; + ProgInfo.ComputePGMRSrc1 = + S_00B848_VGPRS(ProgInfo.VGPRBlocks) | + S_00B848_SGPRS(ProgInfo.SGPRBlocks) | + S_00B848_PRIORITY(ProgInfo.Priority) | + S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | + S_00B848_PRIV(ProgInfo.Priv) | + S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | + S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + + ProgInfo.ComputePGMRSrc2 = + S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | + S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | + S_00B84C_TGID_X_EN(1) | + S_00B84C_TGID_Y_EN(1) | + S_00B84C_TGID_Z_EN(1) | + S_00B84C_TG_SIZE_EN(1) | + S_00B84C_TIDIG_COMP_CNT(2) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); +} + +static unsigned getRsrcReg(unsigned ShaderType) { + switch (ShaderType) { + default: // Fall through + case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; + case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; + case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; + case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + } +} + +void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) { + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); if (MFI->getShaderType() == ShaderType::COMPUTE) { OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); - const uint32_t ComputePGMRSrc1 = - S_00B848_VGPRS(VGPRBlocks) | - S_00B848_SGPRS(SGPRBlocks) | - S_00B848_PRIORITY(KernelInfo.Priority) | - S_00B848_FLOAT_MODE(KernelInfo.FloatMode) | - S_00B848_PRIV(KernelInfo.Priv) | - S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) | - S_00B848_IEEE_MODE(KernelInfo.DebugMode) | - S_00B848_IEEE_MODE(KernelInfo.IEEEMode); - - OutStreamer.EmitIntValue(ComputePGMRSrc1, 4); + OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - const uint32_t ComputePGMRSrc2 = - S_00B84C_LDS_SIZE(LDSBlocks) | - S_00B02C_SCRATCH_EN(ScratchBlocks > 0); - - OutStreamer.EmitIntValue(ComputePGMRSrc2, 4); + OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); - OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4); + OutStreamer.EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = // 0" comment but I don't see a corresponding field in the register spec. 
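ProgInfo.ScratchBlocks above converts per-thread scratch bytes into whole-wave usage rounded up to 1024-byte units (the 256-dword blocks the comment mentions), which is the granularity WAVESIZE is programmed in. A standalone C++ version of that rounding (not from the patch):

    // Per-wave scratch, rounded up to 1 << 10 byte (256 dword) blocks.
    #include <cassert>
    #include <cstdint>

    static uint64_t roundUpToAlignment(uint64_t value, uint64_t align) {
      return (value + align - 1) / align * align;
    }

    int main() {
      const unsigned scratchAlignShift = 10;
      uint64_t perThreadBytes = 36, wavefrontSize = 64;
      uint64_t waveBytes = perThreadBytes * wavefrontSize; // 2304 bytes
      uint64_t scratchBlocks =
          roundUpToAlignment(waveBytes, 1ull << scratchAlignShift) >>
          scratchAlignShift;
      assert(scratchBlocks == 3); // 2304 rounds up to 3072 = 3 blocks
    }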
} else { OutStreamer.EmitIntValue(RsrcReg, 4); - OutStreamer.EmitIntValue(S_00B028_VGPRS(VGPRBlocks) | - S_00B028_SGPRS(SGPRBlocks), 4); + OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | + S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); + if (STM.isVGPRSpillingEnabled(MFI)) { + OutStreamer.EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); + OutStreamer.EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); + } } if (MFI->getShaderType() == ShaderType::PIXEL) { OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4); + OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); OutStreamer.EmitIntValue(MFI->PSInputAddr, 4); } } + +void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + amd_kernel_code_t header; + + memset(&header, 0, sizeof(header)); + + header.amd_code_version_major = AMD_CODE_VERSION_MAJOR; + header.amd_code_version_minor = AMD_CODE_VERSION_MINOR; + + header.struct_byte_size = sizeof(amd_kernel_code_t); + + header.target_chip = STM.getAmdKernelCodeChipID(); + + header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment()); + + header.compute_pgm_resource_registers = + KernelInfo.ComputePGMRSrc1 | + (KernelInfo.ComputePGMRSrc2 << 32); + + // Code Properties: + header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | + AMD_CODE_PROPERTY_IS_PTR64; + + if (KernelInfo.FlatUsed) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + if (KernelInfo.ScratchBlocks) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; + + header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + + // MFI->ABIArgOffset is the number of bytes for the kernel arguments + // plus 36. 36 is the number of bytes reserved at the beginning of the + // input buffer to store work-group size information. + // FIXME: We should be adding the size of the implicit arguments + // to this value. + header.kernarg_segment_byte_size = MFI->ABIArgOffset; + + header.wavefront_sgpr_count = KernelInfo.NumSGPR; + header.workitem_vgpr_count = KernelInfo.NumVGPR; + + // FIXME: What values do I put for these alignments + header.kernarg_segment_alignment = 0; + header.group_segment_alignment = 0; + header.private_segment_alignment = 0; + + header.code_type = 1; // HSA_EXT_CODE_KERNEL + + header.wavefront_size = STM.getWavefrontSize(); + + const MCSectionELF *VersionSection = + OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0); + OutStreamer.SwitchSection(VersionSection); + OutStreamer.EmitBytes(Twine("HSA Code Unit:" + + Twine(header.hsail_version_major) + "." + + Twine(header.hsail_version_minor) + ":" + + "AMD:" + + Twine(header.amd_code_version_major) + "."
+ + Twine(header.amd_code_version_minor) + ":" + + "GFX8.1:0").str()); + + OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); + + if (isVerbose()) { + OutStreamer.emitRawComment("amd_code_version_major = " + + Twine(header.amd_code_version_major), false); + OutStreamer.emitRawComment("amd_code_version_minor = " + + Twine(header.amd_code_version_minor), false); + OutStreamer.emitRawComment("struct_byte_size = " + + Twine(header.struct_byte_size), false); + OutStreamer.emitRawComment("target_chip = " + + Twine(header.target_chip), false); + OutStreamer.emitRawComment(" compute_pgm_rsrc1: " + + Twine::utohexstr(KernelInfo.ComputePGMRSrc1), false); + OutStreamer.emitRawComment(" compute_pgm_rsrc2: " + + Twine::utohexstr(KernelInfo.ComputePGMRSrc2), false); + OutStreamer.emitRawComment("enable_sgpr_private_segment_buffer = " + + Twine((bool)(header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false); + OutStreamer.emitRawComment("enable_sgpr_kernarg_segment_ptr = " + + Twine((bool)(header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false); + OutStreamer.emitRawComment("private_element_size = 2 ", false); + OutStreamer.emitRawComment("is_ptr64 = " + + Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false); + OutStreamer.emitRawComment("workitem_private_segment_byte_size = " + + Twine(header.workitem_private_segment_byte_size), + false); + OutStreamer.emitRawComment("workgroup_group_segment_byte_size = " + + Twine(header.workgroup_group_segment_byte_size), + false); + OutStreamer.emitRawComment("gds_segment_byte_size = " + + Twine(header.gds_segment_byte_size), false); + OutStreamer.emitRawComment("kernarg_segment_byte_size = " + + Twine(header.kernarg_segment_byte_size), false); + OutStreamer.emitRawComment("wavefront_sgpr_count = " + + Twine(header.wavefront_sgpr_count), false); + OutStreamer.emitRawComment("workitem_vgpr_count = " + + Twine(header.workitem_vgpr_count), false); + OutStreamer.emitRawComment("code_type = " + Twine(header.code_type), false); + OutStreamer.emitRawComment("wavefront_size = " + + Twine((int)header.wavefront_size), false); + OutStreamer.emitRawComment("optimization_level = " + + Twine(header.optimization_level), false); + OutStreamer.emitRawComment("hsail_profile = " + + Twine(header.hsail_profile), false); + OutStreamer.emitRawComment("hsail_machine_model = " + + Twine(header.hsail_machine_model), false); + OutStreamer.emitRawComment("hsail_version_major = " + + Twine(header.hsail_version_major), false); + OutStreamer.emitRawComment("hsail_version_minor = " + + Twine(header.hsail_version_minor), false); + } + + OutStreamer.EmitBytes(StringRef((char*)&header, sizeof(header))); +} diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index b9a0767..58ffb1e 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -24,8 +24,8 @@ class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { SIProgramInfo() : - NumVGPR(0), - NumSGPR(0), + VGPRBlocks(0), + SGPRBlocks(0), Priority(0), FloatMode(0), Priv(0), @@ -33,13 +33,19 @@ private: DebugMode(0), IEEEMode(0), ScratchSize(0), + ComputePGMRSrc1(0), + LDSBlocks(0), + ScratchBlocks(0), + ComputePGMRSrc2(0), + NumVGPR(0), + NumSGPR(0), FlatUsed(false), VCCUsed(false), CodeLen(0) {} // Fields set in PGM_RSRC1 pm4 packet. 
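Stepping back to EmitAmdKernelCodeT above: it zero-fills an amd_kernel_code_t, sets only the fields the backend knows, and streams the struct out as raw bytes. Both 32-bit PGM_RSRC words travel in one 64-bit field (ComputePGMRSrc2 is declared uint64_t in SIProgramInfo below, so the shift by 32 is well defined), and the entry offset turns the log2 alignment into bytes, the same count the earlier EmitCodeAlignment(2 << (Align - 1)) produces for any nonzero alignment. A standalone C++ sketch, with a hypothetical two-field header standing in for the real struct:

    // Hypothetical stand-in for amd_kernel_code_t; demonstrates only
    // the zero-init, the rsrc packing, and the raw-byte emission.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct KernelHeaderSketch {
      uint64_t compute_pgm_resource_registers;
      uint64_t kernel_code_entry_byte_offset;
    };

    int main() {
      KernelHeaderSketch h;
      std::memset(&h, 0, sizeof(h)); // unset fields must read as zero

      uint64_t rsrc1 = 0x002c0041, rsrc2 = 0x0000008c; // example words
      h.compute_pgm_resource_registers = rsrc1 | (rsrc2 << 32);

      unsigned logAlign = 3; // MF.getAlignment() is a log2 value
      h.kernel_code_entry_byte_offset = 1ull << logAlign; // 8 bytes

      std::fwrite(&h, sizeof(h), 1, stdout); // emitted as raw bytes
      return 0;
    }

The SIProgramInfo field list resumes below.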
- uint32_t NumVGPR; - uint32_t NumSGPR; + uint32_t VGPRBlocks; + uint32_t SGPRBlocks; uint32_t Priority; uint32_t FloatMode; uint32_t Priv; @@ -48,6 +54,17 @@ private: uint32_t IEEEMode; uint32_t ScratchSize; + uint64_t ComputePGMRSrc1; + + // Fields set in PGM_RSRC2 pm4 packet. + uint32_t LDSBlocks; + uint32_t ScratchBlocks; + + uint64_t ComputePGMRSrc2; + + uint32_t NumVGPR; + uint32_t NumSGPR; + uint32_t LDSSize; bool FlatUsed; // Bonus information for debugging. @@ -64,9 +81,12 @@ private: /// can correctly setup the GPU state. void EmitProgramInfoR600(const MachineFunction &MF); void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); + void EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const; public: - explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer); + explicit AMDGPUAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer); bool runOnMachineFunction(MachineFunction &MF) override; @@ -80,7 +100,6 @@ public: void EmitEndOfAsmFile(Module &M) override; protected: - bool DisasmEnabled; std::vector<std::string> DisasmLines, HexLines; size_t DisasmLineMaxLen; }; diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index 90b6672..b5ab703 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -39,11 +39,11 @@ namespace { class AMDGPUDAGToDAGISel : public SelectionDAGISel { // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can // make the right decision when generating code for different targets. - const AMDGPUSubtarget &Subtarget; + const AMDGPUSubtarget *Subtarget; public: AMDGPUDAGToDAGISel(TargetMachine &TM); virtual ~AMDGPUDAGToDAGISel(); - + bool runOnMachineFunction(MachineFunction &MF) override; SDNode *Select(SDNode *N) override; const char *getPassName() const override; void PostprocessISelDAG() override; @@ -95,9 +95,9 @@ private: SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, - SDValue &Offset) const; + SDValue &SOffset, SDValue &Offset) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &Offset, + SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const; @@ -113,6 +113,9 @@ private: bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Omod) const; + bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const; SDNode *SelectADD_SUB_I64(SDNode *N); SDNode *SelectDIV_SCALE(SDNode *N); @@ -129,7 +132,11 @@ FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { } AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) - : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) { + : SelectionDAGISel(TM) {} + +bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget()); + return SelectionDAGISel::runOnMachineFunction(MF); } AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { @@ -153,7 +160,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, switch (N->getMachineOpcode()) { default: { const MCInstrDesc &Desc = - TM.getSubtargetImpl()->getInstrInfo()->get(N->getMachineOpcode()); + 
Subtarget->getInstrInfo()->get(N->getMachineOpcode()); unsigned OpIdx = Desc.getNumDefs() + OpNo; if (OpIdx >= Desc.getNumOperands()) return nullptr; @@ -161,17 +168,17 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, if (RegClass == -1) return nullptr; - return TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RegClass); + return Subtarget->getRegisterInfo()->getRegClass(RegClass); } case AMDGPU::REG_SEQUENCE: { unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); const TargetRegisterClass *SuperRC = - TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RCID); + Subtarget->getRegisterInfo()->getRegClass(RCID); SDValue SubRegOp = N->getOperand(OpNo + 1); unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); - return TM.getSubtargetImpl()->getRegisterInfo()->getSubClassWithSubReg( - SuperRC, SubRegIdx); + return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC, + SubRegIdx); } } } @@ -241,7 +248,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return nullptr; // Already selected. } - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); switch (Opc) { default: break; // We are selecting i64 ADD here instead of custom lower it during @@ -250,7 +256,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::ADD: case ISD::SUB: { if (N->getValueType(0) != MVT::i64 || - ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; return SelectADD_SUB_I64(N); @@ -259,15 +265,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { unsigned RegClassID; - const AMDGPURegisterInfo *TRI = static_cast<const AMDGPURegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); - const SIRegisterInfo *SIRI = static_cast<const SIRegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); + const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); assert(EltVT.bitsEq(MVT::i32)); - if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { bool UseVReg = true; for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); U != E; ++U) { @@ -278,12 +281,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { if (!RC) { continue; } - if (SIRI->isSGPRClass(RC)) { + if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) { UseVReg = false; } } switch(NumVectorElts) { - case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID : + case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID : AMDGPU::SReg_32RegClassID; break; case 2: RegClassID = UseVReg ? 
AMDGPU::VReg_64RegClassID : @@ -365,7 +368,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } case ISD::BUILD_PAIR: { SDValue RC, SubReg0, SubReg1; - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { break; } if (N->getValueType(0) == MVT::i128) { @@ -387,8 +390,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: case ISD::ConstantFP: { - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); - if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) break; @@ -414,8 +416,55 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { N->getValueType(0), Ops); } + case ISD::LOAD: { + // To simplify the TableGen patterns, we replace all i64 loads with + // v2i32 loads. Alternatively, we could promote i64 loads to v2i32 + // during DAG legalization; however, some places (ExpandUnalignedLoad) + // in the DAG legalizer assume that i64 loads are legal if i64 is + // legal, so doing this promotion early can cause problems. + EVT VT = N->getValueType(0); + LoadSDNode *LD = cast<LoadSDNode>(N); + if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) + break; + + SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(), + LD->getBasePtr(), LD->getMemOperand()); + SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SDLoc(N), + MVT::i64, NewLoad); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1)); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast); + SelectCode(NewLoad.getNode()); + N = BitCast.getNode(); + break; + } + + case ISD::STORE: { + // Handle i64 stores here for the same reason mentioned above for loads. + StoreSDNode *ST = cast<StoreSDNode>(N); + SDValue Value = ST->getValue(); + if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore()) + break; + + SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N), + MVT::v2i32, Value); + SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue, + ST->getBasePtr(), ST->getMemOperand()); + + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore); + + if (NewValue.getOpcode() == ISD::BITCAST) { + Select(NewStore.getNode()); + return SelectCode(NewValue.getNode()); + } + + // getNode() may fold the bitcast if its input was another bitcast. If that + // happens we should only select the new store.
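The new ISD::LOAD and ISD::STORE cases above rewrite every ordinary i64 memory access as a v2i32 access plus a bitcast, purely to shrink the TableGen pattern set; this is safe because the bitcast reinterprets bits rather than converting values. A standalone C++ demonstration of that bit-level equivalence (not from the patch):

    // An i64 value and its two i32 halves are the same bits, so the
    // load/store rewrite above cannot change observable results.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint64_t original = 0x0123456789abcdefull;

      uint32_t halves[2];
      std::memcpy(halves, &original, sizeof(halves)); // the "v2i32" view

      uint64_t roundTripped;
      std::memcpy(&roundTripped, halves, sizeof(roundTripped)); // bitcast

      assert(roundTripped == original);
    }

The store case resumes below with the fallthrough that reselects the rewritten node.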
+ N = NewStore.getNode(); + break; + } + case AMDGPUISD::REGISTER_LOAD: { - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) break; SDValue Addr, Offset; @@ -431,7 +480,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { Ops); } case AMDGPUISD::REGISTER_STORE: { - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) break; SDValue Addr, Offset; SelectADDRIndirect(N->getOperand(2), Addr, Offset); @@ -449,7 +498,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { - if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; // There is a scalar version available, but unlike the vector version which @@ -554,13 +603,11 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { } bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { - if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) { - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); - if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - N->getMemoryVT().bitsLT(MVT::i32)) { + if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getMemoryVT().bitsLT(MVT::i32)) return true; - } - } + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); } @@ -736,6 +783,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); } +// We need to handle this here because tablegen doesn't support matching +// instructions with multiple outputs. SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { SDLoc SL(N); EVT VT = N->getValueType(0); @@ -745,30 +794,22 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { unsigned Opc = (VT == MVT::f64) ? 
AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; - const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); - const SDValue False = CurDAG->getTargetConstant(0, MVT::i1); - SDValue Ops[] = { - Zero, // src0_modifiers - N->getOperand(0), // src0 - Zero, // src1_modifiers - N->getOperand(1), // src1 - Zero, // src2_modifiers - N->getOperand(2), // src2 - False, // clamp - Zero // omod - }; + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod + SDValue Ops[8]; + SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); + SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); } bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, unsigned OffsetBits) const { - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); if ((OffsetBits == 16 && !isUInt<16>(Offset)) || (OffsetBits == 8 && !isUInt<8>(Offset))) return false; - if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) return true; // On Southern Islands instruction with a negative base value and an offset @@ -879,26 +920,32 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue N1 = Addr.getOperand(1); ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - if (isLegalMUBUFImmOffset(C1)) { - - if (N0.getOpcode() == ISD::ADD) { - // (add (add N2, N3), C1) -> addr64 - SDValue N2 = N0.getOperand(0); - SDValue N3 = N0.getOperand(1); - Addr64 = CurDAG->getTargetConstant(1, MVT::i1); - Ptr = N2; - VAddr = N3; - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); - return; - } + if (N0.getOpcode() == ISD::ADD) { + // (add (add N2, N3), C1) -> addr64 + SDValue N2 = N0.getOperand(0); + SDValue N3 = N0.getOperand(1); + Addr64 = CurDAG->getTargetConstant(1, MVT::i1); + Ptr = N2; + VAddr = N3; + } else { // (add N0, C1) -> offset VAddr = CurDAG->getTargetConstant(0, MVT::i32); Ptr = N0; - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + } + + if (isLegalMUBUFImmOffset(C1)) { + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return; + } else if (isUInt<32>(C1->getZExtValue())) { + // Illegal offset, store it in soffset. 
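The restructured addressing logic above (continuing just below) keeps the MUBUF addressing mode even when the constant does not fit the immediate offset field, by materializing it into soffset with an S_MOV_B32 instead of bailing out. A standalone C++ sketch of the split, assuming the 12-bit unsigned immediate offset of the SI MUBUF encoding:

    // Offsets that fit the immediate field stay immediates; larger
    // 32-bit offsets are routed through the soffset register instead.
    #include <cassert>
    #include <cstdint>

    struct MubufOffset {
      uint16_t imm;     // immediate offset field (assumed 12 bits)
      uint32_t soffset; // value an S_MOV_B32 would materialize
    };

    static bool splitOffset(uint64_t c, MubufOffset &out) {
      if (c < (1u << 12)) {
        out = {static_cast<uint16_t>(c), 0};
        return true;
      }
      if (c <= UINT32_MAX) {
        out = {0, static_cast<uint32_t>(c)};
        return true;
      }
      return false; // needs a different addressing mode entirely
    }

    int main() {
      MubufOffset o;
      assert(splitOffset(100, o) && o.imm == 100 && o.soffset == 0);
      assert(splitOffset(1u << 20, o) && o.imm == 0 && o.soffset == (1u << 20));
    }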
+ Offset = CurDAG->getTargetConstant(0, MVT::i16); + SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i32)), 0); return; } } + if (Addr.getOpcode() == ISD::ADD) { // (add N0, N1) -> addr64 SDValue N0 = Addr.getOperand(0); @@ -918,9 +965,9 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, } bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, + SDValue &VAddr, SDValue &SOffset, SDValue &Offset) const { - SDValue Ptr, SOffset, Offen, Idxen, Addr64, GLC, SLC, TFE; + SDValue Ptr, Offen, Idxen, Addr64, GLC, SLC, TFE; SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, GLC, SLC, TFE); @@ -940,11 +987,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, } bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &Offset, - SDValue &SLC) const { + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, + SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, MVT::i1); - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, Offset); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset); } bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, @@ -954,21 +1002,32 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); const SITargetLowering& Lowering = *static_cast<const SITargetLowering*>(getTargetLowering()); - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); unsigned ScratchOffsetReg = TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, ScratchOffsetReg, MVT::i32); + SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); + SDValue ScratchRsrcDword0 = + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0); + + SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); + SDValue ScratchRsrcDword1 = + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); - SDValue ScratchPtr = - CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, - MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64); + const SDValue RsrcOps[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + ScratchRsrcDword0, + CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32), + ScratchRsrcDword1, + CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32), + }; + SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, RsrcOps), 0); Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); @@ -985,22 +1044,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, } } - // (add FI, n0) - if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && - isa<FrameIndexSDNode>(Addr.getOperand(0))) { - VAddr = Addr.getOperand(1); - ImmOffset = Addr.getOperand(0); - return true; - } - - // (FI) - if (isa<FrameIndexSDNode>(Addr)) { - VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, - CurDAG->getConstant(0, MVT::i32)), 
0); - ImmOffset = Addr; - return true; - } - // (node) VAddr = Addr; ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); @@ -1012,6 +1055,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &GLC, SDValue &SLC, SDValue &TFE) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, GLC, SLC, TFE); @@ -1019,7 +1064,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, if (!cast<ConstantSDNode>(Offen)->getSExtValue() && !cast<ConstantSDNode>(Idxen)->getSExtValue() && !cast<ConstantSDNode>(Addr64)->getSExtValue()) { - uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | APInt::getAllOnesValue(32).getZExtValue(); // Size SDLoc DL(Addr); @@ -1045,7 +1090,7 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N); SDLoc DL(N); - assert(Subtarget.hasFlatAddressSpace() && + assert(Subtarget->hasFlatAddressSpace() && "addrspacecast only supported with flat address space!"); assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && @@ -1081,7 +1126,9 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { if (DestSize > SrcSize) { assert(SrcSize == 32 && DestSize == 64); - SDValue RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32); + // FIXME: This is probably wrong, we should never be defining + // a register class with both VGPRs and SGPRs + SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, MVT::i32); const SDValue Ops[] = { RC, @@ -1141,6 +1188,14 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const { + Clamp = Omod = CurDAG->getTargetConstant(0, MVT::i32); + return SelectVOP3Mods(In, Src, SrcMods); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 2f95b74..4707279 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -102,11 +102,9 @@ EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } -AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : - TargetLowering(TM) { - - Subtarget = &TM.getSubtarget<AMDGPUSubtarget>(); - +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { setOperationAction(ISD::Constant, MVT::i32, Legal); setOperationAction(ISD::Constant, MVT::i64, Legal); setOperationAction(ISD::ConstantFP, MVT::f32, Legal); @@ -127,12 +125,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FABS, MVT::f32, Legal); setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FROUND, MVT::f32, Custom); + setOperationAction(ISD::FROUND, MVT::f64, Custom); + setOperationAction(ISD::FREM, MVT::f32, Custom); 
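FROUND and FREM switch to Custom in the hunk above (the f64 FREM action is the first line that follows). For FREM, a target without a hardware remainder instruction typically expands it as x - trunc(x/y) * y; a standalone C++ sketch of that expansion, offered as the usual recipe rather than the exact lowering, which this section does not show:

    // Truncation-based remainder; matches fmod for these inputs,
    // though a correctly rounded libm fmod can differ in edge cases.
    #include <cassert>
    #include <cmath>

    static float fremExpansion(float x, float y) {
      return x - std::truncf(x / y) * y;
    }

    int main() {
      assert(fremExpansion(7.5f, 2.0f) == std::fmodf(7.5f, 2.0f));   // 1.5
      assert(fremExpansion(-7.5f, 2.0f) == std::fmodf(-7.5f, 2.0f)); // -1.5
    }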
setOperationAction(ISD::FREM, MVT::f64, Custom); + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + // Expand to fneg + fadd. + setOperationAction(ISD::FSUB, MVT::f64, Expand); + // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. setOperationAction(ISD::STORE, MVT::f32, Promote); @@ -141,9 +148,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::v2f32, Promote); AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); - setOperationAction(ISD::STORE, MVT::i64, Promote); - AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); - setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); @@ -162,9 +166,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : // Custom lowering of vector stores is required for local address space // stores. setOperationAction(ISD::STORE, MVT::v4i32, Custom); - // XXX: Native v2i32 local address space stores are possible, but not - // currently implemented. - setOperationAction(ISD::STORE, MVT::v2i32, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); @@ -187,9 +188,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::v2f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); - setOperationAction(ISD::LOAD, MVT::i64, Promote); - AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); - setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); @@ -216,18 +214,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand); + // There are no 64-bit extloads. These should be done as a 32-bit extload and + // an extension to 64-bit. 
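Per the comment above, an extending load to i64 must be split into a narrower load plus an explicit extension; the loops right below mark every i64 extload variant Expand to force exactly that. (The same hunk also expands f64 FSUB into fneg plus fadd, i.e. a - b becomes a + (-b).) A standalone C++ picture of the resulting two-step access, illustration only:

    // A zero-extending i32 -> i64 "load" is a 32-bit load followed by
    // a separate extension; sign extension works the same way.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static uint64_t zextLoad64(const void *p) {
      uint32_t narrow;
      std::memcpy(&narrow, p, sizeof(narrow)); // the 32-bit load
      return static_cast<uint64_t>(narrow);    // the separate extend
    }

    int main() {
      uint32_t mem = 0xdeadbeef;
      assert(zextLoad64(&mem) == 0x00000000deadbeefull);
    }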
+ for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); + } setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -246,7 +254,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); @@ -382,6 +391,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + + setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); @@ -397,6 +412,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : // large sequence of instructions. setIntDivIsCheap(false); setPow2SDivIsCheap(false); + setFsqrtIsCheap(true); // FIXME: Need to really handle these. MaxStoresPerMemcpy = 4096; @@ -429,6 +445,29 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); } +bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, + ISD::LoadExtType, + EVT NewVT) const { + + unsigned NewSize = NewVT.getStoreSizeInBits(); + + // If we are reducing to a 32-bit load, this is always better. + if (NewSize == 32) + return true; + + EVT OldVT = N->getValueType(0); + unsigned OldSize = OldVT.getStoreSizeInBits(); + + // Don't produce extloads from sub 32-bit types. SI doesn't have scalar + // extloads, so doing one requires using a buffer_load. In cases where we + // still couldn't use a scalar load, using the wider load shouldn't really + // hurt anything. + + // If the old size already had to be an extload, there's no harm in continuing + // to reduce the width. + return (OldSize < 32); +} + bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy) const { if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) @@ -442,6 +481,18 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, (LScalarSize < 32)); } +// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also +// profitable with the expansion for 64-bit since it's generally good to +// speculate things. +// FIXME: These should really have the size as a parameter. 
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { + return true; +} + +bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { + return true; +} + //===---------------------------------------------------------------------===// // Target Properties //===---------------------------------------------------------------------===// @@ -560,6 +611,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); case ISD::FRINT: return LowerFRINT(Op, DAG); case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); + case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); @@ -619,7 +671,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, const SDValue &InitPtr, SDValue Chain, SelectionDAG &DAG) const { - const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout(); + const DataLayout *TD = getDataLayout(); SDLoc DL(InitPtr); Type *InitTy = Init->getType(); @@ -707,7 +759,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, SDValue Op, SelectionDAG &DAG) const { - const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout(); + const DataLayout *TD = getDataLayout(); GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = G->getGlobal(); @@ -810,8 +862,7 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( - getTargetMachine().getSubtargetImpl()->getFrameLowering()); + const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering(); FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); @@ -866,10 +917,9 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::AMDGPU_div_fmas: - // FIXME: Dropping bool parameter. Work is needed to support the implicit - // read from VCC. 
return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); case Intrinsic::AMDGPU_div_fixup: return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, @@ -889,7 +939,19 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); case Intrinsic::AMDGPU_rsq_clamped: - return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + Type *Type = VT.getTypeForEVT(*DAG.getContext()); + APFloat Max = APFloat::getLargest(Type->getFltSemantics()); + APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); + + SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, + DAG.getConstantFP(Max, VT)); + return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, + DAG.getConstantFP(Min, VT)); + } else { + return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + } case Intrinsic::AMDGPU_ldexp: return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), @@ -962,6 +1024,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case AMDGPUIntrinsic::AMDGPU_brev: return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); + case Intrinsic::AMDGPU_class: + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); @@ -1000,17 +1066,21 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, } /// \brief Generate Min/Max node -SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return SDValue(); + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) return SDValue(); + SelectionDAG &DAG = DCI.DAG; ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); switch (CCOpcode) { case ISD::SETOEQ: @@ -1027,27 +1097,47 @@ SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL, case ISD::SETO: break; case ISD::SETULE: - case ISD::SETULT: + case ISD::SETULT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + } case ISD::SETOLE: case ISD::SETOLT: case ISD::SETLE: case ISD::SETLT: { + // Ordered. Assume ordered for undefined. + + // Only do this after legalization to avoid interfering with other combines + // which might occur. + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + // We need to permute the operands to get the correct NaN behavior. The // selected operand is the second one based on the failing compare with NaN, // so permute it based on the compare type the hardware uses. 
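The operand permutation the comment above calls for (emitted in the hunk that follows) exists because a select-based min or max returns a different operand for NaN inputs depending on whether the predicate is ordered or unordered. A standalone C++ demonstration (not from the patch):

    // With a NaN input, ordered and unordered "less than" selects pick
    // different operands; legacy min/max operand order must match that.
    #include <cassert>
    #include <cmath>

    static float selectOLT(float a, float b) { return a < b ? a : b; }
    static float selectULT(float a, float b) { return !(a >= b) ? a : b; }

    int main() {
      float nan = std::nanf("");
      assert(selectOLT(nan, 1.0f) == 1.0f);     // ordered: the number
      assert(std::isnan(selectULT(nan, 1.0f))); // unordered: the NaN
    }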
if (LHS == True) - return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); - return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + } + case ISD::SETUGE: + case ISD::SETUGT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); } case ISD::SETGT: case ISD::SETGE: - case ISD::SETUGE: case ISD::SETOGE: - case ISD::SETUGT: case ISD::SETOGT: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + if (LHS == True) - return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); - return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); } case ISD::SETCC_INVALID: llvm_unreachable("Invalid setcc condcode!"); @@ -1330,24 +1420,6 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT MemVT = Load->getMemoryVT(); - if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) { - // We can do the extload to 32-bits, and then need to separately extend to - // 64-bits. - - SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32, - Load->getChain(), - Load->getBasePtr(), - MemVT, - Load->getMemOperand()); - - SDValue Ops[] = { - DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32), - ExtLoad32.getValue(1) - }; - - return DAG.getMergeValues(Ops, DL); - } - if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { assert(VT == MVT::i1 && "Only i1 non-extloads expected"); // FIXME: Copied from PPC @@ -1586,12 +1658,26 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + if (VT == MVT::i64 && + DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && + DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { + + SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); + Results.push_back(DIV); + Results.push_back(REM); + return; + } + // Get Speculative values SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - SDValue REM_Hi = zero; SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero); SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); SDValue DIV_Lo = zero; @@ -1599,8 +1685,10 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, const unsigned halfBitWidth = HalfVT.getSizeInBits(); for (unsigned i = 0; i < halfBitWidth; ++i) { - SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); - // Get Value of high bit + const unsigned bitPos = halfBitWidth - i - 1; + SDValue POS = DAG.getConstant(bitPos, HalfVT); + // Get value of high bit + // TODO: Remove the BFE part when the optimization is fixed SDValue HBit; if (halfBitWidth == 32 && Subtarget->hasBFE()) { HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); @@ 
-1608,33 +1696,23 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); } + HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); - SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, - DAG.getConstant(halfBitWidth - 1, HalfVT)); - REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); - REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); - - REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); - REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); + // Shift + REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, VT)); + // Add LHS high bit + REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); - - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - - SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); + SDValue BIT = DAG.getConstant(1 << bitPos, HalfVT); SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); // Update REM - SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); - REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); - REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); - REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); } - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); Results.push_back(DIV); Results.push_back(REM); @@ -1655,8 +1733,8 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SDValue Den = Op.getOperand(1); if (VT == MVT::i32) { - if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) && - DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) { + if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) && + DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) { // TODO: We technically could do this for i64, but shouldn't that just be // handled by something generally reducing 64-bit division on 32-bit // values to 32-bit? @@ -1768,19 +1846,31 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - if (VT == MVT::i32) { - if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 && - DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) { - // TODO: We technically could do this for i64, but shouldn't that just be - // handled by something generally reducing 64-bit division on 32-bit - // values to 32-bit? 
- return LowerDIVREM24(Op, DAG, true); - } - } - SDValue Zero = DAG.getConstant(0, VT); SDValue NegOne = DAG.getConstant(-1, VT); + if (VT == MVT::i32 && + DAG.ComputeNumSignBits(LHS) > 8 && + DAG.ComputeNumSignBits(RHS) > 8) { + return LowerDIVREM24(Op, DAG, true); + } + if (VT == MVT::i64 && + DAG.ComputeNumSignBits(LHS) > 32 && + DAG.ComputeNumSignBits(RHS) > 32) { + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + //HiLo split + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); + SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + SDValue Res[2] = { + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) + }; + return DAG.getMergeValues(Res, DL); + } + SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); @@ -1845,6 +1935,20 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } +static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { + const unsigned FractBits = 52; + const unsigned ExpBits = 11; + + SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + Hi, + DAG.getConstant(FractBits - 32, MVT::i32), + DAG.getConstant(ExpBits, MVT::i32)); + SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, + DAG.getConstant(1023, MVT::i32)); + + return Exp; +} + SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -1860,16 +1964,9 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { // exponent. SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); - const unsigned FractBits = 52; - const unsigned ExpBits = 11; + SDValue Exp = extractF64Exponent(Hi, SL, DAG); - // Extract the exponent. - SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, - Hi, - DAG.getConstant(FractBits - 32, MVT::i32), - DAG.getConstant(ExpBits, MVT::i32)); - SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, - DAG.getConstant(1023, MVT::i32)); + const unsigned FractBits = 52; // Extract the sign bit. const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32); @@ -1932,6 +2029,99 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); } +// XXX - May require not supporting f32 denormals? 
+SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); + + SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); + + const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, MVT::f32); + const SDValue Half = DAG.getConstantFP(0.5, MVT::f32); + + SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); + + SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); + + return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); +} + +SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); + + const SDValue Zero = DAG.getConstant(0, MVT::i32); + const SDValue One = DAG.getConstant(1, MVT::i32); + const SDValue NegOne = DAG.getConstant(-1, MVT::i32); + const SDValue FiftyOne = DAG.getConstant(51, MVT::i32); + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); + + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); + + SDValue Exp = extractF64Exponent(Hi, SL, DAG); + + const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64); + + SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); + SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, + DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64), + Exp); + + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); + SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, + DAG.getConstant(0, MVT::i64), Tmp0, + ISD::SETNE); + + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, + D, DAG.getConstant(0, MVT::i64)); + SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); + + K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); + K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); + + SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, + ExpEqNegOne, + DAG.getConstantFP(1.0, MVT::f64), + DAG.getConstantFP(0.0, MVT::f64)); + + SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); + + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); + + return K; +} + +SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFROUND32(Op, DAG); + + if (VT == MVT::f64) + return LowerFROUND64(Op, DAG); + + llvm_unreachable("unhandled type"); +} + SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -2155,7 +2345,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, SDValue Value = SN->getValue(); EVT VT = Value.getValueType(); - if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode())) + if (isTypeLegal(VT) || SN->isVolatile() || + !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) return SDValue(); 
LoadSDNode *LoadVal = cast<LoadSDNode>(Value); @@ -2231,27 +2422,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, simplifyI24(N1, DCI); return SDValue(); } - case ISD::SELECT_CC: { - SDLoc DL(N); - EVT VT = N->getValueType(0); - - if (VT == MVT::f32 || - (VT == MVT::f64 && - Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) { - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - SDValue True = N->getOperand(2); - SDValue False = N->getOperand(3); - SDValue CC = N->getOperand(4); - - return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); - } - - break; - } case ISD::SELECT: { SDValue Cond = N->getOperand(0); - if (Cond.getOpcode() == ISD::SETCC) { + if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue LHS = Cond.getOperand(0); @@ -2261,11 +2434,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, SDValue True = N->getOperand(1); SDValue False = N->getOperand(2); - if (VT == MVT::f32 || - (VT == MVT::f64 && - Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) { - return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); - } + if (VT == MVT::f32) + return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); // TODO: Implement min / max Evergreen instructions. if (VT == MVT::i32 && @@ -2451,7 +2621,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(MAD) NODE_NAME_CASE(FMAX_LEGACY) NODE_NAME_CASE(SMAX) NODE_NAME_CASE(UMAX) @@ -2474,6 +2643,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RSQ_LEGACY) NODE_NAME_CASE(RSQ_CLAMPED) NODE_NAME_CASE(LDEXP) + NODE_NAME_CASE(FP_CLASS) NODE_NAME_CASE(DOT4) NODE_NAME_CASE(BFE_U32) NODE_NAME_CASE(BFE_I32) @@ -2505,6 +2675,46 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { } } +SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32) { + RefinementSteps = 0; + return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); + } + + // TODO: There is also an f64 rsq instruction, but the documentation is less + // clear on its precision. + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32) { + // Reciprocal, < 1 ulp error. + // + // This reciprocal approximation converges to < 0.5 ulp error with one + // Newton-Raphson iteration performed with two fused multiply-adds (FMAs). + + RefinementSteps = 0; + return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); + } + + // TODO: There is also an f64 rcp instruction, but the documentation is less + // clear on its precision. + + return SDValue(); +} + static void computeKnownBitsForMinMax(const SDValue Op0, const SDValue Op1, APInt &KnownZero, diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 36b4ee6..6bc6ca5 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -43,12 +43,15 @@ private: /// \brief Split a vector store into multiple scalar stores. /// \returns The resulting chain.
- SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; @@ -86,6 +89,7 @@ protected: SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) const; @@ -106,7 +110,7 @@ protected: const SmallVectorImpl<ISD::InputArg> &Ins) const; public: - AMDGPUTargetLowering(TargetMachine &TM); + AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); bool isFAbsFree(EVT VT) const override; bool isFNegFree(EVT VT) const override; @@ -124,8 +128,14 @@ public: bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; bool ShouldShrinkFPConstant(EVT VT) const override; + bool shouldReduceLoadWidth(SDNode *Load, + ISD::LoadExtType ExtType, + EVT ExtVT) const override; bool isLoadBitCastBeneficial(EVT, EVT) const override; + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, @@ -142,14 +152,14 @@ public: SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; - SDValue CombineFMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const; + SDValue CombineFMinMaxLegacy(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const; SDValue CombineIMinMax(SDLoc DL, EVT VT, SDValue LHS, @@ -161,6 +171,14 @@ public: const char* getTargetNodeName(unsigned Opcode) const override; + SDValue getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + SDValue getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; + virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const { return N; @@ -200,7 +218,6 @@ enum { DWORDADDR, FRACT, CLAMP, - MAD, // Multiply + add with same result as the separate operations. // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. // Denormals handled on some parts. @@ -231,6 +248,7 @@ enum { RSQ_LEGACY, RSQ_CLAMPED, LDEXP, + FP_CLASS, DOT4, BFE_U32, // Extract range of bits with zero extension to 32-bits. BFE_I32, // Extract range of bits with sign extension to 32-bits. 
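For context on the getRecipEstimate hook declared above: it returns the hardware RCP node with RefinementSteps set to 0, and the comment in its implementation notes that one Newton-Raphson step performed with two fused multiply-adds already converges to < 0.5 ulp from a < 1 ulp estimate. A minimal scalar sketch of that refinement step, assuming IEEE f32 with FMA support (the helper name refineRecip is illustrative only, not part of this patch; when RefinementSteps > 0 the DAG combiner emits the SelectionDAG equivalent of this iteration):

#include <cmath>

// One Newton-Raphson step refining a reciprocal estimate x0 of 1/d:
//   e  = 1 - d*x0   (residual error, computed in one FMA)
//   x1 = x0 + x0*e  (corrected estimate, the second FMA)
static float refineRecip(float d, float x0) {
  float e = std::fma(-d, x0, 1.0f); // e = 1 - d*x0
  return std::fma(x0, e, x0);       // x1 = x0 + x0*e
}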
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp index a8fc614..f4de2d6 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -319,10 +319,7 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getFrameIndexOffset(MF, -1); + Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); return getIndirectIndexBegin(MF) + Offset; } @@ -341,8 +338,39 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { // instead. namespace llvm { namespace AMDGPU { -int getMCOpcode(uint16_t Opcode, unsigned Gen) { - return getMCOpcode(Opcode); +static int getMCOpcode(uint16_t Opcode, unsigned Gen) { + return getMCOpcodeGen(Opcode, (enum Subtarget)Gen); } } } + +// This must be kept in sync with the SISubtarget class in SIInstrInfo.td +enum SISubtarget { + SI = 0, + VI = 1 +}; + +static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { + switch (Gen) { + default: + return SI; + case AMDGPUSubtarget::VOLCANIC_ISLANDS: + return VI; + } +} + +int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { + int MCOp = AMDGPU::getMCOpcode(Opcode, + AMDGPUSubtargetToSISubtarget(RI.ST.getGeneration())); + + // -1 means that Opcode is already a native instruction. + if (MCOp == -1) + return Opcode; + + // (uint16_t)-1 means that Opcode is a pseudo instruction that has + // no encoding in the given subtarget generation. + if (MCOp == (uint16_t)-1) + return -1; + + return MCOp; +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index da9833d..202183c 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -135,6 +135,17 @@ public: bool isRegisterStore(const MachineInstr &MI) const; bool isRegisterLoad(const MachineInstr &MI) const; + /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. + /// Return -1 if the target-specific opcode for the pseudo instruction does + /// not exist. If Opcode is not a pseudo instruction, this is identity. + int pseudoToMCOpcode(int Opcode) const; + + /// \brief Return the descriptor of the target-specific machine instruction + /// that corresponds to the specified pseudo or native opcode. + const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { + return get(pseudoToMCOpcode(Opcode)); + } + //===---------------------------------------------------------------------===// // Pure virtual functions to be implemented by sub-classes.
//===---------------------------------------------------------------------===// diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 4ee0f2b..901eb51 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -27,10 +27,19 @@ def AMDGPULdExpOp : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] >; +def AMDGPUFPClassOp : SDTypeProfile<1, 2, + [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] +>; + def AMDGPUDivScaleOp : SDTypeProfile<2, 3, [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] >; +// float, float, float, vcc +def AMDGPUFmasOp : SDTypeProfile<1, 4, + [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>] +>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -58,16 +67,17 @@ def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; +def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; + // out = max(a, b) a and b are floats, where a nan comparison fails. // This is not commutative because this gives the second operand: // x < nan ? x : nan -> nan // nan < x ? nan : x -> x def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, - [SDNPAssociative] + [] >; def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; -def AMDGPUmad : SDNode<"AMDGPUISD::MAD", SDTFPTernaryOp, []>; // out = max(a, b) a and b are signed ints def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, @@ -81,7 +91,7 @@ def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, // out = min(a, b) a and b are floats, where a nan comparison fails. def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, - [SDNPAssociative] + [] >; // out = min(a, b) a and b are signed ints @@ -147,7 +157,7 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; // Special case divide FMA with scale and flags (src0 = Quotient, // src1 = Denominator, src2 = Numerator). -def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>; +def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; // Single or double precision division fixup. 
// Special case divide fixup and flags (src0 = Quotient, src1 = Denominator, src2 = Numerator). diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index c215865..849b241 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -23,8 +23,6 @@ class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instructio let Pattern = pattern; let Itinerary = NullALU; - let isCodeGenOnly = 1; - let TSFlags{63} = isRegisterLoad; let TSFlags{62} = isRegisterStore; } @@ -73,6 +71,11 @@ def COND_OEQ : PatLeaf < [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}] >; +def COND_ONE : PatLeaf < + (cond), + [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] +>; + def COND_OGT : PatLeaf < (cond), [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] @@ -93,23 +96,28 @@ def COND_OLE : PatLeaf < [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] >; -def COND_UNE : PatLeaf < - (cond), - [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] ->; def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; //===----------------------------------------------------------------------===// -// PatLeafs for unsigned comparisons +// PatLeafs for unsigned / unordered comparisons //===----------------------------------------------------------------------===// +def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; +def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>; def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; +// XXX - For some reason the R600 version prefers to use unordered +// comparisons for setne?
+def COND_UNE_NE : PatLeaf < + (cond), + [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] +>; + //===----------------------------------------------------------------------===// // PatLeafs for signed comparisons //===----------------------------------------------------------------------===// @@ -154,10 +162,6 @@ class PrivateStore <SDPatternOperator op> : PrivateMemOp < (ops node:$value, node:$ptr), (op node:$value, node:$ptr) >; -def extloadi8_private : PrivateLoad <extloadi8>; -def sextloadi8_private : PrivateLoad <sextloadi8>; -def extloadi16_private : PrivateLoad <extloadi16>; -def sextloadi16_private : PrivateLoad <sextloadi16>; def load_private : PrivateLoad <load>; def truncstorei8_private : PrivateStore <truncstorei8>; @@ -221,6 +225,9 @@ def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ return isLocalLoad(dyn_cast<LoadSDNode>(N)); }]>; +def extloadi8_private : PrivateLoad <az_extloadi8>; +def sextloadi8_private : PrivateLoad <sextloadi8>; + def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; }]>; @@ -257,6 +264,9 @@ def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ return isLocalLoad(dyn_cast<LoadSDNode>(N)); }]>; +def extloadi16_private : PrivateLoad <az_extloadi16>; +def sextloadi16_private : PrivateLoad <sextloadi16>; + def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32; }]>; @@ -403,11 +413,6 @@ def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; // Misc Pattern Fragments //===----------------------------------------------------------------------===// -def fmad : PatFrag < - (ops node:$src0, node:$src1, node:$src2), - (fadd (fmul node:$src0, node:$src1), node:$src2) ->; - class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; @@ -428,6 +433,11 @@ def FP_ONE : PatLeaf < [{return N->isExactlyValue(1.0);}] >; +def FP_HALF : PatLeaf < + (fpimm), + [{return N->isExactlyValue(0.5);}] +>; + let isCodeGenOnly = 1, isPseudo = 1 in { let usesCustomInserter = 1 in { @@ -575,7 +585,7 @@ applied. 
def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>; def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}], - SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>; + SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(countTrailingOnes(N->getZExtValue()), MVT::i32);}]>>; class BFEPattern <Instruction BFE> : Pat < (and (srl i32:$x, legalshift32:$y), bfemask:$z), @@ -593,6 +603,20 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat < // 24-bit arithmetic patterns def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; +// Special conversion patterns + +def cvt_rpi_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor (fadd $src, FP_HALF))), + [{ (void) N; return TM.Options.NoNaNsFPMath; }] +>; + +def cvt_flr_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor $src)), + [{ (void)N; return TM.Options.NoNaNsFPMath; }] +>; + /* class UMUL24Pattern <Instruction UMUL24> : Pat < (mul U24:$x, U24:$y), @@ -639,17 +663,10 @@ class RcpPat<Instruction RcpInst, ValueType vt> : Pat < (RcpInst $src) >; -multiclass RsqPat<Instruction RsqInst, ValueType vt> { - def : Pat < - (fdiv FP_ONE, (fsqrt vt:$src)), - (RsqInst $src) - >; - - def : Pat < - (AMDGPUrcp (fsqrt vt:$src)), - (RsqInst $src) - >; -} +class RsqPat<Instruction RsqInst, ValueType vt> : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) +>; include "R600Instructions.td" include "R700Instructions.td" diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp index bca027f..f047ed0 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ b/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" @@ -39,37 +40,23 @@ AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): Ctx(ctx), ST(st) { } -enum AMDGPUMCInstLower::SISubtarget -AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned) const { - return AMDGPUMCInstLower::SI; -} - -unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const { +void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { - int MCOpcode = AMDGPU::getMCOpcode(MIOpcode, - AMDGPUSubtargetToSISubtarget(ST.getGeneration())); - if (MCOpcode == -1) - MCOpcode = MIOpcode; + int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); - return MCOpcode; -} - -void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + if (MCOpcode == -1) { + LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); + C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " + "a target-specific version: " + Twine(MI->getOpcode())); + } - OutMI.setOpcode(getMCOpcode(MI->getOpcode())); + OutMI.setOpcode(MCOpcode); for (const MachineOperand &MO : MI->explicit_operands()) { MCOperand MCOp; switch (MO.getType()) { default: llvm_unreachable("unknown operand type"); - case MachineOperand::MO_FPImmediate: { - const APFloat &FloatValue = MO.getFPImm()->getValueAPF(); - assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle && - "Only floating point immediates are supported at the moment."); - MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat()); - break; - } case MachineOperand::MO_Immediate: MCOp = MCOperand::CreateImm(MO.getImm()); break; @@ -93,18 
+80,24 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::CreateExpr(Expr); break; } + case MachineOperand::MO_ExternalSymbol: { + MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(MO.getSymbolName())); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx); + MCOp = MCOperand::CreateExpr(Expr); + break; + } } OutMI.addOperand(MCOp); } } void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { - AMDGPUMCInstLower MCInstLowering(OutContext, - MF->getTarget().getSubtarget<AMDGPUSubtarget>()); + const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + AMDGPUMCInstLower MCInstLowering(OutContext, STI); #ifdef _DEBUG StringRef Err; - if (!TM.getSubtargetImpl()->getInstrInfo()->verifyInstruction(MI, Err)) { + if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) { errs() << "Warning: Illegal instruction detected: " << Err << "\n"; MI->dump(); } @@ -122,15 +115,15 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInstLowering.lower(MI, TmpInst); EmitToStreamer(OutStreamer, TmpInst); - if (DisasmEnabled) { + if (STI.dumpCode()) { // Disassemble instruction/operands to text. DisasmLines.resize(DisasmLines.size() + 1); std::string &DisasmLine = DisasmLines.back(); raw_string_ostream DisasmStream(DisasmLine); AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), - *TM.getSubtargetImpl()->getInstrInfo(), - *TM.getSubtargetImpl()->getRegisterInfo()); + *MF->getSubtarget().getInstrInfo(), + *MF->getSubtarget().getRegisterInfo()); InstPrinter.printInst(&TmpInst, DisasmStream, StringRef()); // Disassemble instruction/operands to hex representation. @@ -141,7 +134,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCObjectStreamer &ObjStreamer = (MCObjectStreamer &)OutStreamer; MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); InstEmitter.EncodeInstruction(TmpInst, CodeStream, Fixups, - TM.getSubtarget<MCSubtargetInfo>()); + MF->getSubtarget<MCSubtargetInfo>()); CodeStream.flush(); HexLines.resize(HexLines.size() + 1); diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h index 00d1f1b..d322fe0 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.h +++ b/lib/Target/R600/AMDGPUMCInstLower.h @@ -19,22 +19,9 @@ class MCContext; class MCInst; class AMDGPUMCInstLower { - - // This must be kept in sync with the SISubtarget class in SIInstrInfo.td - enum SISubtarget { - SI = 0 - }; - MCContext &Ctx; const AMDGPUSubtarget &ST; - /// Convert a member of the AMDGPUSubtarget::Generation enum to the - /// SISubtarget enum. - enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) const; - - /// Get the MC opcode for this MachineInstr. 
- unsigned getMCOpcode(unsigned MIOpcode) const; - public: AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp index 0f3f9e2..21c7da6 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.cpp +++ b/lib/Target/R600/AMDGPUMachineFunction.cpp @@ -15,9 +15,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : LDSSize(0), ScratchSize(0), IsKernel(true) { - AttributeSet Set = MF.getFunction()->getAttributes(); - Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, - ShaderTypeAttribute); + Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); if (A.isStringAttribute()) { StringRef Str = A.getValueAsString(); diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp index 3433280..57b054b 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -42,8 +42,7 @@ void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { - assert(!"Subroutines not supported yet"); - return 0; + return AMDGPU::NoRegister; } unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index 9d09a19..70c8525 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -16,11 +16,11 @@ #include "R600ISelLowering.h" #include "R600InstrInfo.h" #include "R600MachineScheduler.h" -#include "SIInstrInfo.h" #include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" #include "llvm/ADT/SmallString.h" - -#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineScheduler.h" using namespace llvm; @@ -31,22 +31,9 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "AMDGPUGenSubtargetInfo.inc" -static std::string computeDataLayout(const AMDGPUSubtarget &ST) { - std::string Ret = "e-p:32:32"; - - if (ST.is64bit()) { - // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; - } - - Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" - "-v512:512-v1024:1024-v2048:2048-n32:64"; - - return Ret; -} - AMDGPUSubtarget & -AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) { +AMDGPUSubtarget::initializeSubtargetDependencies(StringRef TT, StringRef GPU, + StringRef FS) { // Determine default and user-specified characteristics // On SI+, we want FP64 denormals to be on by default. 
FP32 denormals can be // enabled, but some instructions do not respect them and they run at the @@ -59,6 +46,9 @@ AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) { SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); FullFS += FS; + if (GPU == "" && Triple(TT).getArch() == Triple::amdgcn) + GPU = "SI"; + ParseSubtargetFeatures(GPU, FullFS); // FIXME: I don't think Evergreen has any useful support for @@ -76,21 +66,24 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS, : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), DumpCode(false), R600ALUInst(false), HasVertexCache(false), TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), - FP64Denormals(false), FP32Denormals(false), CaymanISA(false), - FlatAddressSpace(false), EnableIRStructurizer(true), - EnablePromoteAlloca(false), EnableIfCvt(true), - EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), - DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))), + FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), + CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), + EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), + WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), + EnableVGPRSpilling(false), FrameLowering(TargetFrameLowering::StackGrowsUp, 64 * 16, // Maximum stack alignment (long16) 0), - InstrItins(getInstrItineraryForCPU(GPU)) { + InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { + + initializeSubtargetDependencies(TT, GPU, FS); + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { InstrInfo.reset(new R600InstrInfo(*this)); - TLInfo.reset(new R600TargetLowering(TM)); + TLInfo.reset(new R600TargetLowering(TM, *this)); } else { InstrInfo.reset(new SIInstrInfo(*this)); - TLInfo.reset(new SITargetLowering(TM)); + TLInfo.reset(new SITargetLowering(TM, *this)); } } @@ -107,3 +100,33 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const { llvm_unreachable("Illegal wavefront size."); } } + +unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const { + switch(getGeneration()) { + default: llvm_unreachable("ChipID unknown"); + case SEA_ISLANDS: return 12; + } +} + +bool AMDGPUSubtarget::isVGPRSpillingEnabled( + const SIMachineFunctionInfo *MFI) const { + return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling; +} + +void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const { + if (getGeneration() >= SOUTHERN_ISLANDS) { + + // Track register pressure so the scheduler can try to decrease + // pressure once register usage is above the threshold defined by + // SIRegisterInfo::getRegPressureSetLimit() + Policy.ShouldTrackPressure = true; + + // Enabling both top down and bottom up scheduling seems to give us fewer + // register spills than just using one of these approaches on its own.
+ Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + } +} diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index f71d80a..1b0122c 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -20,7 +20,6 @@ #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "R600ISelLowering.h" -#include "llvm/IR/DataLayout.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -30,6 +29,8 @@ namespace llvm { +class SIMachineFunctionInfo; + class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { public: @@ -39,7 +40,8 @@ public: EVERGREEN, NORTHERN_ISLANDS, SOUTHERN_ISLANDS, - SEA_ISLANDS + SEA_ISLANDS, + VOLCANIC_ISLANDS, }; private: @@ -53,6 +55,7 @@ private: bool FP64; bool FP64Denormals; bool FP32Denormals; + bool FastFMAF32; bool CaymanISA; bool FlatAddressSpace; bool EnableIRStructurizer; @@ -62,16 +65,18 @@ private: unsigned WavefrontSize; bool CFALUBug; int LocalMemorySize; + bool EnableVGPRSpilling; - const DataLayout DL; AMDGPUFrameLowering FrameLowering; std::unique_ptr<AMDGPUTargetLowering> TLInfo; std::unique_ptr<AMDGPUInstrInfo> InstrInfo; InstrItineraryData InstrItins; + Triple TargetTriple; public: AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS, TargetMachine &TM); - AMDGPUSubtarget &initializeSubtargetDependencies(StringRef GPU, StringRef FS); + AMDGPUSubtarget &initializeSubtargetDependencies(StringRef TT, StringRef GPU, + StringRef FS); const AMDGPUFrameLowering *getFrameLowering() const override { return &FrameLowering; @@ -85,7 +90,6 @@ public: AMDGPUTargetLowering *getTargetLowering() const override { return TLInfo.get(); } - const DataLayout *getDataLayout() const override { return &DL; } const InstrItineraryData *getInstrItineraryData() const override { return &InstrItins; } @@ -124,6 +128,10 @@ public: return FP64Denormals; } + bool hasFastFMAF32() const { + return FastFMAF32; + } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } @@ -198,10 +206,16 @@ public: return LocalMemorySize; } + unsigned getAmdKernelCodeChipID() const; + bool enableMachineScheduler() const override { - return getGeneration() <= NORTHERN_ISLANDS; + return true; } + void overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned NumRegionInstrs) const override; + // Helper functions to simplify if statements bool isTargetELF() const { return false; @@ -217,6 +231,22 @@ public: bool r600ALUEncoding() const { return R600ALUInst; } + bool isAmdHsaOS() const { + return TargetTriple.getOS() == Triple::AMDHSA; + } + bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; + + unsigned getMaxWavesPerCU() const { + if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 10; + + // FIXME: Not sure what this is for other subtargets.
+ llvm_unreachable("do not know max waves per CU for this subtarget."); + } + + bool enableSubRegLiveness() const override { + return false; + } }; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index b2cd988..a862f3c 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -15,6 +15,7 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" +#include "AMDGPUTargetTransformInfo.h" #include "R600ISelLowering.h" #include "R600InstrInfo.h" #include "R600MachineScheduler.h" @@ -27,7 +28,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_os_ostream.h" #include "llvm/Transforms/IPO.h" @@ -38,7 +39,8 @@ using namespace llvm; extern "C" void LLVMInitializeR600Target() { // Register the target - RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget); + RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); + RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { @@ -49,12 +51,28 @@ static MachineSchedRegistry SchedCustomRegistry("r600", "Run R600's custom scheduler", createR600MachineScheduler); +static std::string computeDataLayout(StringRef TT) { + Triple Triple(TT); + std::string Ret = "e-p:32:32"; + + if (Triple.getArch() == Triple::amdgcn) { + // 32-bit private, local, and region pointers. 64-bit global and constant. + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; + } + + Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" + "-v512:512-v1024:1024-v2048:2048-n32:64"; + + return Ret; +} + AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OptLevel) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), + DL(computeDataLayout(TT)), TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), IntrinsicInfo() { setRequiresStructuredCFG(true); @@ -65,10 +83,33 @@ AMDGPUTargetMachine::~AMDGPUTargetMachine() { delete TLOF; } +//===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +R600TargetMachine::R600TargetMachine(const Target &T, StringRef TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) : + AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { } + + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +GCNTargetMachine::GCNTargetMachine(const Target &T, StringRef TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) : + AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { } + +//===----------------------------------------------------------------------===// +// AMDGPU Pass Setup +//===----------------------------------------------------------------------===// + namespace { class AMDGPUPassConfig : public TargetPassConfig { public: - AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase 
&PM) + AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} AMDGPUTargetMachine &getAMDGPUTargetMachine() const { @@ -85,29 +126,38 @@ public: void addIRPasses() override; void addCodeGenPrepare() override; + virtual bool addPreISel() override; + virtual bool addInstSelector() override; +}; + +class R600PassConfig : public AMDGPUPassConfig { +public: + R600PassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } + bool addPreISel() override; - bool addInstSelector() override; - bool addPreRegAlloc() override; - bool addPostRegAlloc() override; - bool addPreSched2() override; - bool addPreEmitPass() override; + void addPreRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; }; -} // End of anonymous namespace -TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) { - return new AMDGPUPassConfig(this, PM); -} +class GCNPassConfig : public AMDGPUPassConfig { +public: + GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } + bool addPreISel() override; + bool addInstSelector() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; -//===----------------------------------------------------------------------===// -// AMDGPU Analysis Pass Setup -//===----------------------------------------------------------------------===// +} // End of anonymous namespace -void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) { - // Add first the target-independent BasicTTI pass, then our AMDGPU pass. This - // allows the AMDGPU pass to delegate to the target independent layer when - // appropriate. - PM.add(createBasicTargetTransformInfoPass(this)); - PM.add(createAMDGPUTargetTransformInfoPass(this)); +TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis( + [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); }); } void AMDGPUPassConfig::addIRPasses() { @@ -129,7 +179,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { addPass(createAMDGPUPromoteAlloca(ST)); addPass(createSROAPass()); } - TargetPassConfig::addCodeGenPrepare(); } @@ -139,84 +188,96 @@ AMDGPUPassConfig::addPreISel() { addPass(createFlattenCFGPass()); if (ST.IsIRStructurizerEnabled()) addPass(createStructurizeCFGPass()); - if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - addPass(createSinkingPass()); - addPass(createSITypeRewriter()); - addPass(createSIAnnotateControlFlowPass()); - } else { - addPass(createR600TextureIntrinsicsReplacer()); - } return false; } bool AMDGPUPassConfig::addInstSelector() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); - addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + return false; +} - if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - addPass(createSILowerI1CopiesPass()); - addPass(createSIFixSGPRCopiesPass(*TM)); - } +//===----------------------------------------------------------------------===// +// R600 Pass Setup +//===----------------------------------------------------------------------===// +bool R600PassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + addPass(createR600TextureIntrinsicsReplacer()); return false; } -bool AMDGPUPassConfig::addPreRegAlloc() { +void R600PassConfig::addPreRegAlloc() { + addPass(createR600VectorRegMerger(*TM)); +} + +void R600PassConfig::addPreSched2() { const AMDGPUSubtarget &ST = 
TM->getSubtarget<AMDGPUSubtarget>(); + addPass(createR600EmitClauseMarkers(), false); + if (ST.isIfCvtEnabled()) + addPass(&IfConverterID, false); + addPass(createR600ClauseMergePass(*TM), false); +} - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - addPass(createR600VectorRegMerger(*TM)); - } else { - if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { - // Don't do this with no optimizations since it throws away debug info by - // merging nonadjacent loads. - - // This should be run after scheduling, but before register allocation. It - // also need extra copies to the address operand to be eliminated. - initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); - insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); - } - - addPass(createSIShrinkInstructionsPass()); - addPass(createSIFixSGPRLiveRangesPass()); - } - return false; +void R600PassConfig::addPreEmitPass() { + addPass(createAMDGPUCFGStructurizerPass(), false); + addPass(createR600ExpandSpecialInstrsPass(*TM), false); + addPass(&FinalizeMachineBundlesID, false); + addPass(createR600Packetizer(*TM), false); + addPass(createR600ControlFlowFinalizer(*TM), false); } -bool AMDGPUPassConfig::addPostRegAlloc() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); +TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { + return new R600PassConfig(this, PM); +} - addPass(createSIShrinkInstructionsPass()); - if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { - addPass(createSIInsertWaits(*TM)); - } +//===----------------------------------------------------------------------===// +// GCN Pass Setup +//===----------------------------------------------------------------------===// + +bool GCNPassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + addPass(createSinkingPass()); + addPass(createSITypeRewriter()); + addPass(createSIAnnotateControlFlowPass()); return false; } -bool AMDGPUPassConfig::addPreSched2() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); - - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - addPass(createR600EmitClauseMarkers()); - if (ST.isIfCvtEnabled()) - addPass(&IfConverterID); - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - addPass(createR600ClauseMergePass(*TM)); +bool GCNPassConfig::addInstSelector() { + AMDGPUPassConfig::addInstSelector(); + addPass(createSILowerI1CopiesPass()); + addPass(createSIFixSGPRCopiesPass(*TM)); + addPass(createSIFoldOperandsPass()); return false; } -bool AMDGPUPassConfig::addPreEmitPass() { +void GCNPassConfig::addPreRegAlloc() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - addPass(createAMDGPUCFGStructurizerPass()); - addPass(createR600ExpandSpecialInstrsPass(*TM)); - addPass(&FinalizeMachineBundlesID); - addPass(createR600Packetizer(*TM)); - addPass(createR600ControlFlowFinalizer(*TM)); - } else { - addPass(createSILowerControlFlowPass(*TM)); + if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + // Don't do this with no optimizations since it throws away debug info by + // merging nonadjacent loads. + + // This should be run after scheduling, but before register allocation. It + // also needs extra copies to the address operand to be eliminated.
+ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); } + addPass(createSIShrinkInstructionsPass(), false); + addPass(createSIFixSGPRLiveRangesPass(), false); +} - return false; +void GCNPassConfig::addPostRegAlloc() { + addPass(createSIPrepareScratchRegs(), false); + addPass(createSIShrinkInstructionsPass(), false); +} + +void GCNPassConfig::addPreSched2() { + addPass(createSIInsertWaits(*TM), false); +} + +void GCNPassConfig::addPreEmitPass() { + addPass(createSILowerControlFlowPass(*TM), false); +} + +TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { + return new GCNPassConfig(this, PM); } diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index 1b3dbce..a691536 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -24,7 +24,15 @@ namespace llvm { +//===----------------------------------------------------------------------===// +// AMDGPU Target Machine (R600+) +//===----------------------------------------------------------------------===// + class AMDGPUTargetMachine : public LLVMTargetMachine { +private: + const DataLayout DL; + +protected: TargetLoweringObjectFile *TLOF; AMDGPUSubtarget Subtarget; AMDGPUIntrinsicInfo IntrinsicInfo; @@ -34,21 +42,52 @@ public: StringRef CPU, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~AMDGPUTargetMachine(); + // FIXME: This is currently broken; the DataLayout needs to move to + // the target machine. + const DataLayout *getDataLayout() const override { + return &DL; + } const AMDGPUSubtarget *getSubtargetImpl() const override { return &Subtarget; } const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { return &IntrinsicInfo; } - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + TargetIRAnalysis getTargetIRAnalysis() override; - /// \brief Register R600 analysis passes with a pass manager.
- void addAnalysisPasses(PassManagerBase &PM) override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF; } }; +//===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +class R600TargetMachine : public AMDGPUTargetMachine { + +public: + R600TargetMachine(const Target &T, StringRef TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +}; + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +class GCNTargetMachine : public AMDGPUTargetMachine { + +public: + GCNTargetMachine(const Target &T, StringRef TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +}; + } // End namespace llvm #endif diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp index e7bc006..68f4600 100644 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp @@ -15,11 +15,11 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" +#include "AMDGPUTargetTransformInfo.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" @@ -27,80 +27,10 @@ using namespace llvm; #define DEBUG_TYPE "AMDGPUtti" -// Declare the pass initialization routine locally as target-specific passes -// don't have a target-wide initialization entry point, and so we rely on the -// pass constructor initialization. -namespace llvm { -void initializeAMDGPUTTIPass(PassRegistry &); -} - -namespace { - -class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo { - const AMDGPUTargetMachine *TM; - const AMDGPUSubtarget *ST; - const AMDGPUTargetLowering *TLI; - - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; - -public: - AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { - llvm_unreachable("This pass cannot be directly constructed"); - } - - AMDGPUTTI(const AMDGPUTargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), - TLI(TM->getSubtargetImpl()->getTargetLowering()) { - initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry()); - } - - void initializePass() override { pushTTIStack(this); } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - TargetTransformInfo::getAnalysisUsage(AU); - } - - /// Pass identification. - static char ID; - - /// Provide necessary pointer adjustments for the two base classes. 
- void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &TargetTransformInfo::ID) - return (TargetTransformInfo *)this; - return this; - } - - bool hasBranchDivergence() const override; - - void getUnrollingPreferences(const Function *F, Loop *L, - UnrollingPreferences &UP) const override; - - PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override; - - unsigned getNumberOfRegisters(bool Vector) const override; - unsigned getRegisterBitWidth(bool Vector) const override; - unsigned getMaxInterleaveFactor() const override; -}; - -} // end anonymous namespace - -INITIALIZE_AG_PASS(AMDGPUTTI, TargetTransformInfo, "AMDGPUtti", - "AMDGPU Target Transform Info", true, true, false) -char AMDGPUTTI::ID = 0; - -ImmutablePass * -llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) { - return new AMDGPUTTI(TM); -} - -bool AMDGPUTTI::hasBranchDivergence() const { return true; } - -void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L, - UnrollingPreferences &UP) const { +void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { UP.Threshold = 300; // Twice the default. - UP.Count = UINT_MAX; + UP.MaxCount = UINT_MAX; UP.Partial = true; // TODO: Do we want runtime unrolling? @@ -130,13 +60,7 @@ void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L, } } -AMDGPUTTI::PopcntSupportKind -AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const { - assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - return ST->hasBCNT(TyWidth) ? PSK_FastHardware : PSK_Software; -} - -unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const { +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { if (Vec) return 0; @@ -147,11 +71,9 @@ unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } -unsigned AMDGPUTTI::getRegisterBitWidth(bool) const { - return 32; -} +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } -unsigned AMDGPUTTI::getMaxInterleaveFactor() const { +unsigned AMDGPUTTIImpl::getMaxInterleaveFactor() { // Semi-arbitrary large amount. return 64; } diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.h b/lib/Target/R600/AMDGPUTargetTransformInfo.h new file mode 100644 index 0000000..4abbdf2 --- /dev/null +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.h @@ -0,0 +1,78 @@ +//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares a TargetTransformInfo::Concept conforming object specific to the +/// AMDGPU target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest.
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> { + typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const AMDGPUSubtarget *ST; + const AMDGPUTargetLowering *TLI; + + const AMDGPUSubtarget *getST() const { return ST; } + const AMDGPUTargetLowering *getTLI() const { return TLI; } + +public: + explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM) + : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) { + BaseT::operator=(static_cast<const BaseT &>(RHS)); + ST = RHS.ST; + TLI = RHS.TLI; + return *this; + } + AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) { + BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); + ST = std::move(RHS.ST); + TLI = std::move(RHS.TLI); + return *this; + } + + bool hasBranchDivergence() { return true; } + + void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software; + } + + unsigned getNumberOfRegisters(bool Vector); + unsigned getRegisterBitWidth(bool Vector); + unsigned getMaxInterleaveFactor(); +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/R600/AMDKernelCodeT.h b/lib/Target/R600/AMDKernelCodeT.h new file mode 100644 index 0000000..4d3041f --- /dev/null +++ b/lib/Target/R600/AMDKernelCodeT.h @@ -0,0 +1,704 @@ +//===-- AMDKernelCodeT.h - AMD Kernel Code definitions -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file AMDKernelCodeT.h +//===----------------------------------------------------------------------===// + +#ifndef AMDKERNELCODET_H +#define AMDKERNELCODET_H + +#include <cstddef> +#include <cstdint> + +//---------------------------------------------------------------------------// +// AMD Kernel Code, and its dependencies // +//---------------------------------------------------------------------------// + +typedef uint8_t hsa_powertwo8_t; +typedef uint32_t hsa_ext_code_kind_t; +typedef uint8_t hsa_ext_brig_profile8_t; +typedef uint8_t hsa_ext_brig_machine_model8_t; +typedef uint64_t hsa_ext_control_directive_present64_t; +typedef uint16_t hsa_ext_exception_kind16_t; +typedef uint32_t hsa_ext_code_kind32_t; + +typedef struct hsa_dim3_s { + uint32_t x; + uint32_t y; + uint32_t z; +} hsa_dim3_t; + +/// The version of the amd_*_code_t struct. Minor versions must be +/// backward compatible.
+typedef uint32_t amd_code_version32_t; +enum amd_code_version_t { + AMD_CODE_VERSION_MAJOR = 0, + AMD_CODE_VERSION_MINOR = 1 +}; + +/// The values used to define the number of bytes to use for the +/// swizzle element size. +enum amd_element_byte_size_t { + AMD_ELEMENT_2_BYTES = 0, + AMD_ELEMENT_4_BYTES = 1, + AMD_ELEMENT_8_BYTES = 2, + AMD_ELEMENT_16_BYTES = 3 +}; + +/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and +/// COMPUTE_PGM_RSRC2 registers. +typedef uint64_t amd_compute_pgm_resource_register64_t; + +/// Every amd_*_code_t has the following properties, which are composed of +/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), +/// bit width (AMD_CODE_PROPERTY_*_WIDTH), and bit shift amount +/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0. +/// +/// (Note that bit fields cannot be used as their layout is +/// implementation defined in the C standard and so cannot be used to +/// specify an ABI) +typedef uint32_t amd_code_property32_t; +enum amd_code_property_mask_t { + + /// Enable the setup of the SGPR user data registers + /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t + /// for initial register state. + /// + /// The total number of SGPR user data registers requested must not + /// exceed 16. Any requests beyond 16 will be ignored. + /// + /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of + /// SGPR user data registers enabled up to 16). + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, +
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, + + /// Control wave ID base counter for GDS ordered-append. Used to set + /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if + /// ORDERED_APPEND_MODE also needs to be settable) + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, + + /// The interleave (swizzle) element size in bytes required by the + /// code for private memory. This must be 2, 4, 8 or 16. This value + /// is provided to the finalizer when it is invoked and is recorded + /// here. The hardware will interleave the memory requests of each + /// lane of a wavefront by this element size to ensure each + /// work-item gets a distinct memory location. Therefore, the + /// finalizer ensures that all load and store operations done to + /// private memory do not exceed this size. For example, if the + /// element size is 4 (32-bits or dword) and a 64-bit value must be + /// loaded, the finalizer will generate two 32-bit loads. This + /// ensures that the interleaving will get the work-item + /// specific dword for both halves of the 64-bit value. If it just + /// did a 64-bit load then it would get one dword which belonged to + /// its own work-item, but the second dword would belong to the + /// adjacent lane work-item since the interleaving is in dwords. + /// + /// The value used must match the value that the runtime configures + /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This + /// is generally DWORD. + /// + /// Use values from the amd_element_byte_size_t enum. + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, + + /// Are global memory addresses 64 bits. Must match + /// amd_kernel_code_t.hsail_machine_model == + /// HSA_MACHINE_LARGE. Must also match + /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), + /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). + AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13, + AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, + AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, + + /// Indicate if the generated ISA is using a dynamically sized call + /// stack.
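The _SHIFT/_WIDTH/mask triples above exist precisely because C bit fields cannot pin down an ABI; a minimal sketch of the intended access pattern (the helper names are illustrative, only the AMD_CODE_PROPERTY_* and AMD_ELEMENT_* constants come from this header):

  #include <stdint.h>

  // Extract a field from a code_properties word using its mask and shift.
  static inline uint32_t get_code_property(uint32_t props, uint32_t mask,
                                           uint32_t shift) {
    return (props & mask) >> shift;
  }

  // Write a field, e.g. setting the private element size to
  // AMD_ELEMENT_4_BYTES:
  //   props = set_code_property(props, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
  //                             AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
  //                             AMD_ELEMENT_4_BYTES);
  static inline uint32_t set_code_property(uint32_t props, uint32_t mask,
                                           uint32_t shift, uint32_t value) {
    return (props & ~mask) | ((value << shift) & mask);
  }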
This can happen if calls are implemented using a call + /// stack and recursion, alloca or calls to indirect functions are + /// present. In these cases the Finalizer cannot compute the total + /// private segment size at compile time. In this case the + /// workitem_private_segment_byte_size only specifies the statically + /// known private segment size, and additional space must be added + /// for the call stack. + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, + + /// Indicate if code generated has support for debugging. + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT +}; + +/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL +/// control directives. These control how the finalizer generates code. This +/// struct is used both as an argument to hsaFinalizeKernel to specify values for +/// the control directives, and is used in HsaKernelCode to record the values of +/// the control directives that the finalizer used when generating the code which +/// either came from the finalizer argument or explicit HSAIL control +/// directives. See the definition of the control directives in HSA Programmer's +/// Reference Manual which also defines how the values specified as finalizer +/// arguments have to agree with the control directives in the HSAIL code. +typedef struct hsa_ext_control_directives_s { + /// This is a bit set indicating which control directives have been + /// specified. If the value is 0 then there are no control directives specified + /// and the rest of the fields can be ignored. The bits are accessed using the + /// hsa_ext_control_directives_present_mask_t. Any control directive that is not + /// enabled in this bit set must have the value of all 0s. + hsa_ext_control_directive_present64_t enabled_control_directives; + + /// If enableBreakExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. If the kernel being finalized + /// has any enablebreakexceptions control directives, then the values specified + /// by this argument are unioned with the values in these control + /// directives. If any of the functions the kernel calls have an + /// enablebreakexceptions control directive, then they must be equal to, or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_break_exceptions; + + /// If enableDetectExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. However, an implementation + /// should endeavour to make the performance impact small. If the kernel being + /// finalized has any enabledetectexceptions control directives, then the + /// values specified by this argument are unioned with the values in these + /// control directives.
If any of the functions the kernel calls have an + /// enabledetectexceptions control directive, then they must be equal to, or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_detect_exceptions; + + /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of + /// dynamic group segment can be allocated for a dispatch, otherwise the value + /// specifies the maximum number of bytes of dynamic group segment that can be + /// allocated for a dispatch. If the kernel being finalized has any + /// maxdynamicsize control directives, then the values must be the same, and + /// must be the same as this argument if it is enabled. This value can be used + /// by the finalizer to determine the maximum number of bytes of group memory + /// used by each work-group by adding this value to the group memory required + /// for all group segment variables used by the kernel and all functions it + /// calls, and group memory used to implement other HSAIL features such as + /// fbarriers and the detect exception operations. This can allow the finalizer + /// to determine the expected number of work-groups that can be executed by a + /// compute unit and allow more resources to be allocated to the work-items if + /// it is known that fewer work-groups can be executed due to group memory + /// limitations. + uint32_t max_dynamic_group_size; + + /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater + /// than 0. See HSA Programmer's Reference Manual description of + /// maxflatgridsize control directive. + uint32_t max_flat_grid_size; + + /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be + /// greater than 0. See HSA Programmer's Reference Manual description of + /// maxflatworkgroupsize control directive. + uint32_t max_flat_workgroup_size; + + /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the + /// finalizer is free to generate ISA that may result in any number of + /// work-groups executing on a single compute unit. Otherwise, the finalizer + /// should attempt to generate ISA that will allow the specified number of + /// work-groups to execute on a single compute unit. This is only a hint and + /// can be ignored by the finalizer. If the kernel being finalized, or any of + /// the functions it calls, has a requested control directive, then the values + /// must be the same. This can be used to determine the number of resources + /// that should be allocated to a single work-group and work-item. For example, + /// a low value may allow more resources to be allocated, resulting in higher + /// per work-item performance, as it is known there will never be more than the + /// specified number of work-groups actually executing on the compute + /// unit. Conversely, a high value may allocate fewer resources, resulting in + /// lower per work-item performance, which is offset by the fact it allows more + /// work-groups to actually execute on the compute unit. + uint32_t requested_workgroups_per_cu; + + /// If not enabled then all elements for Dim3 must be 0, otherwise every + /// element must be greater than 0. See HSA Programmer's Reference Manual + /// description of requiredgridsize control directive. + hsa_dim3_t required_grid_size; + + /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be + /// 0, and the produced code can be dispatched with any legal work-group range + /// consistent with the dispatch dimensions.
Otherwise, the code produced must + /// always be dispatched with the specified work-group range. No element of the + /// specified range may be 0. It must be consistent with required_dimensions + /// and max_flat_workgroup_size. If the kernel being finalized, or any of the + /// functions it calls, has a requiredworkgroupsize control directive, then the + /// values must be the same. Specifying a value can allow the finalizer to + /// optimize work-group id operations, and if the number of work-items in the + /// work-group is less than the WAVESIZE then barrier operations can be + /// optimized to just a memory fence. + hsa_dim3_t required_workgroup_size; + + /// If requiredDim is not enabled then must be 0 and the produced kernel code + /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is + /// 1..3 and the code produced must only be dispatched with a dimension that + /// matches. Other values are illegal. If the kernel being finalized, or any of + /// the functions it calls, has a requireddimsize control directive, then the + /// values must be the same. This can be used to optimize the code generated to + /// compute the absolute and flat work-group and work-item id, and the dim + /// HSAIL operations. + uint8_t required_dim; + + /// Reserved. Must be 0. + uint8_t reserved[75]; +} hsa_ext_control_directives_t; + +/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel +/// Code Object to set up the hardware to execute the kernel dispatch. +/// +/// Initial Kernel Register State. +/// +/// Initial kernel register state will be set up by CP/SPI prior to the start +/// of execution of every wavefront. This is limited by the constraints of the +/// current hardware. +/// +/// The order of the SGPR registers is defined, but the Finalizer can specify +/// which ones are actually set up in the amd_kernel_code_t object using the +/// enable_sgpr_* bit fields. The register numbers used for enabled registers +/// are dense starting at SGPR0: the first enabled register is SGPR0, the next +/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR +/// number. +/// +/// The initial SGPRs comprise up to 16 User SGPRs that are set up by CP and +/// apply to all waves of the grid. It is possible to specify more than 16 User +/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 +/// are actually initialized. These are then immediately followed by the System +/// SGPRs that are set up by ADC/SPI and can have different values for each wave +/// of the grid dispatch. +/// +/// SGPR register initial state is defined as follows: +/// +/// Private Segment Buffer (enable_sgpr_private_segment_buffer): +/// Number of User SGPR registers: 4. V# that can be used, together with +/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg +/// segments using a segment address. It must be set as follows: +/// - Base address: of the scratch memory area used by the dispatch. It +/// does not include the scratch wave offset. It will be the per process +/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for +/// example there may be a per pipe offset, or per AQL Queue offset). +/// - Stride + data_format: Element Size * Index Stride (???) +/// - Cache swizzle: ??? +/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for +/// scratch) +/// - Num records: Flat Scratch Work Item Size / Element Size (???) +/// - Dst_sel_*: ??? +/// - Num_format: ???
+/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must +/// agree with amd_kernel_code_t.privateElementSize) +/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must +/// be number of wavefront lanes for scratch, must agree with +/// amd_kernel_code_t.wavefrontSize) +/// - Add tid enable: 1 +/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, +/// - Hash_enable: ??? +/// - Heap: ??? +/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE +/// - Type: 0 (a buffer) (???) +/// +/// Dispatch Ptr (enable_sgpr_dispatch_ptr): +/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet +/// for kernel actually executing. +/// +/// Queue Ptr (enable_sgpr_queue_ptr): +/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for +/// AQL queue on which the dispatch packet was queued. +/// +/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): +/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This +/// is directly copied from the kernargPtr in the dispatch packet. Having CP +/// load it once avoids loading it at the beginning of every wavefront. +/// +/// Dispatch Id (enable_sgpr_dispatch_id): +/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch +/// packet being executed. +/// +/// Flat Scratch Init (enable_sgpr_flat_scratch_init): +/// Number of User SGPR registers: 2. This is 2 SGPRs. +/// +/// For CI/VI: +/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE +/// to base of memory for scratch for this dispatch. This is the same offset +/// used in computing the Scratch Segment Buffer base address. The value of +/// Scratch Wave Offset must be added by the kernel code and moved to +/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions. +/// +/// The second SGPR is the 32 bit byte size of a single work-item’s scratch +/// memory usage. This is directly loaded from the dispatch packet Private +/// Segment Byte Size and rounded up to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// The kernel code must move it to SGPRn-3 for use as the FLAT SCRATCH SIZE in +/// flat memory instructions. Having CP load it once avoids loading it at +/// the beginning of every wavefront. +/// +/// For PI: +/// This is the 64 bit base address of the scratch backing memory +/// allocated by CP for this dispatch. +/// +/// Private Segment Size (enable_sgpr_private_segment_size): +/// Number of User SGPR registers: 1. The 32 bit byte size of a single +/// work-item’s scratch memory allocation. This is the value from the dispatch +/// packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// Having CP load it once avoids loading it at the beginning of every +/// wavefront. +/// +/// \todo [This will not be used for CI/VI since it is the same value as +/// the second SGPR of Flat Scratch Init. However, it is needed for PI, which +/// changes the meaning of Flat Scratch Init.] +/// +/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the X dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x). +/// +/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): +/// Number of User SGPR registers: 1.
32 bit count of the number of +/// work-groups in the Y dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the Z dimension for the grid being executed. Computed +/// from the fields in the HsaDispatchPacket as +/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Work-Group Id X (enable_sgpr_workgroup_id_x): +/// Number of System SGPR registers: 1. 32 bit work group id in X dimension +/// of grid for wavefront. Always present. +/// +/// Work-Group Id Y (enable_sgpr_workgroup_id_y): +/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension +/// of grid for wavefront. +/// +/// Work-Group Id Z (enable_sgpr_workgroup_id_z): +/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension +/// of grid for wavefront. If present then Work-group Id Y will also be +/// present. +/// +/// Work-Group Info (enable_sgpr_workgroup_info): +/// Number of System SGPR registers: 1. {first_wave, 14’b0000, +/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} +/// +/// Private Segment Wave Byte Offset +/// (enable_sgpr_private_segment_wave_byte_offset): +/// Number of System SGPR registers: 1. 32 bit byte offset from base of +/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg +/// segment address when using Scratch Segment Buffer. It must be added to +/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. +/// +/// +/// The order of the VGPR registers is defined, but the Finalizer can specify +/// which ones are actually set up in the amd_kernel_code_t object using the +/// enableVgpr* bit fields. The register numbers used for enabled registers +/// are dense starting at VGPR0: the first enabled register is VGPR0, the next +/// enabled register is VGPR1 etc.; disabled registers do not have a VGPR +/// number. +/// +/// VGPR register initial state is defined as follows: +/// +/// Work-Item Id X (always initialized): +/// Number of registers: 1. 32 bit work item id in X dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id Y (enable_vgpr_workitem_id > 0): +/// Number of registers: 1. 32 bit work item id in Y dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id Z (enable_vgpr_workitem_id > 1): +/// Number of registers: 1. 32 bit work item id in Z dimension of work-group +/// for wavefront lane. +/// +/// +/// The setting of registers is being done by existing GPU hardware as follows: +/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data +/// registers. +/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any +/// combination including none. +/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot +/// be added into the value of Flat Scratch Offset, which would avoid the +/// Finalizer generated prolog having to do the add. +/// 4) The VGPRs are set by SPI which only supports specifying either (X), +/// (X, Y) or (X, Y, Z). +/// +/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGPRs so +/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and +/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register.
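Since enabled registers are numbered densely in declaration order, the first SGPR of any enabled feature is simply the sum of the register counts of the enabled features that precede it. A small illustrative sketch (the table type and helper are hypothetical; the per-feature counts come from the list above):

  #include <stdbool.h>
  #include <stdint.h>

  struct user_sgpr_entry {
    bool enabled;  /* the corresponding enable_sgpr_* bit */
    uint8_t count; /* SGPRs consumed when enabled: 4, 2 or 1 above */
  };

  /* First SGPR assigned to entry `idx`, or -1 if that entry is disabled. */
  static int first_sgpr(const struct user_sgpr_entry *entries, int idx) {
    int sgpr = 0;
    for (int i = 0; i < idx; ++i)
      if (entries[i].enabled)
        sgpr += entries[i].count;
    return entries[idx].enabled ? sgpr : -1;
  }

For example, with only the Private Segment Buffer (4 SGPRs) and Dispatch Ptr (2 SGPRs) enabled ahead of it, an enabled Kernarg Segment Ptr starts at SGPR6.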
+/// +/// The global segment can be accessed either using flat operations or buffer +/// operations. If buffer operations are used then the Global Buffer used to +/// access HSAIL Global/Readonly/Kernarg (which are combined) segments using a +/// segment address is not passed into the kernel code by CP since its base +/// address is always 0. Instead the Finalizer generates prolog code to +/// initialize 4 SGPRs with a V# that has the following properties, and then +/// uses that in the buffer instructions: +/// - base address of 0 +/// - no swizzle +/// - ATC=1 +/// - MTYPE set to support memory coherence specified in +/// amd_kernel_code_t.globalMemoryCoherence +/// +/// When the Global Buffer is used to access the Kernarg segment, the kernel must +/// add the dispatch packet kernArgPtr to a kernarg segment address before using +/// this V#. Alternatively scalar loads can be used if the kernarg offset is +/// uniform, as the kernarg segment is constant for the duration of the kernel +/// execution. +/// +typedef struct amd_kernel_code_s { + /// The AMD major version of the Code Object. Must be the value + /// AMD_CODE_VERSION_MAJOR. + amd_code_version32_t amd_code_version_major; + + /// The AMD minor version of the Code Object. Minor versions must be + /// backward compatible. Must be the value + /// AMD_CODE_VERSION_MINOR. + amd_code_version32_t amd_code_version_minor; + + /// The byte size of this struct. Must be set to + /// sizeof(amd_kernel_code_t). Used for backward + /// compatibility. + uint32_t struct_byte_size; + + /// The target chip instruction set for which code has been + /// generated. Values are from the E_SC_INSTRUCTION_SET enumeration + /// in sc/Interface/SCCommon.h. + uint32_t target_chip; + + /// Byte offset (possibly negative) from start of amd_kernel_code_t + /// object to kernel's entry point instruction. The actual code for + /// the kernel is required to be 256 byte aligned to match hardware + /// requirements (SQ cache line is 16). The code must be position + /// independent code (PIC) for AMD devices to give runtime the + /// option of copying code to discrete GPU memory or APU L2 + /// cache. The Finalizer should endeavour to allocate all kernel + /// machine code in contiguous memory pages so that a device + /// pre-fetcher will tend to only pre-fetch Kernel Code objects, + /// improving cache performance. + int64_t kernel_code_entry_byte_offset; + + /// Range of bytes to consider prefetching expressed as an offset + /// and size. The offset is from the start (possibly negative) of + /// amd_kernel_code_t object. Set both to 0 if no prefetch + /// information is available. + /// + /// \todo ttye 11/15/2013 Is the prefetch definition we want? Did + /// not make the size a uint64_t as prefetching more than 4GiB seems + /// excessive. + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + + /// Number of bytes of scratch backing memory required for full + /// occupancy of target chip. This takes into account the number of + /// bytes of scratch per work-item, the wavefront size, the maximum + /// number of wavefronts per CU, and the number of CUs. This is an + /// upper limit on scratch. If the grid being dispatched is small it + /// may need less than this. If the kernel uses no scratch, or + /// the Finalizer has not computed this value, it must be 0. + uint64_t max_scratch_backing_memory_byte_size; + + /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and + /// COMPUTE_PGM_RSRC2 registers.
+ amd_compute_pgm_resource_register64_t compute_pgm_resource_registers; + + /// Code properties. See amd_code_property_mask_t for a full list of + /// properties. + amd_code_property32_t code_properties; + + /// The amount of memory required for the combined private, spill + /// and arg segments for a work-item in bytes. If + /// is_dynamic_callstack is 1 then additional space must be added to + /// this value for the call stack. + uint32_t workitem_private_segment_byte_size; + + /// The amount of group segment memory required by a work-group in + /// bytes. This does not include any dynamically allocated group + /// segment memory that may be added when the kernel is + /// dispatched. + uint32_t workgroup_group_segment_byte_size; + + /// Number of bytes of GDS required by the kernel dispatch. Must be 0 if + /// not using GDS. + uint32_t gds_segment_byte_size; + + /// The size in bytes of the kernarg segment that holds the values + /// of the arguments to the kernel. This could be used by CP to + /// prefetch the kernarg segment pointed to by the dispatch packet. + uint64_t kernarg_segment_byte_size; + + /// Number of fbarriers used in the kernel and all functions it + /// calls. If the implementation uses group memory to allocate the + /// fbarriers then that amount must already be included in the + /// workgroup_group_segment_byte_size total. + uint32_t workgroup_fbarrier_count; + + /// Number of scalar registers used by a wavefront. This includes + /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size + /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPRs added if a + /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS. + uint16_t wavefront_sgpr_count; + + /// Number of vector registers used by each work-item. Used to set + /// COMPUTE_PGM_RSRC1.VGPRS. + uint16_t workitem_vgpr_count; + + /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the + /// first fixed VGPR number reserved. + uint16_t reserved_vgpr_first; + + /// The number of consecutive VGPRs reserved by the client. If + /// is_debug_supported then this count includes VGPRs reserved + /// for debugger use. + uint16_t reserved_vgpr_count; + + /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the + /// first fixed SGPR number reserved. + uint16_t reserved_sgpr_first; + + /// The number of consecutive SGPRs reserved by the client. If + /// is_debug_supported then this count includes SGPRs reserved + /// for debugger use. + uint16_t reserved_sgpr_count; + + /// If is_debug_supported is 0 then must be 0. Otherwise, this is the + /// fixed SGPR number used to hold the wave scratch offset for the + /// entire kernel execution, or uint16_t(-1) if the register is not + /// used or not known. + uint16_t debug_wavefront_private_segment_offset_sgpr; + + /// If is_debug_supported is 0 then must be 0. Otherwise, this is the + /// fixed SGPR number of the first of 4 SGPRs used to hold the + /// scratch V# used for the entire kernel execution, or uint16_t(-1) + /// if the registers are not used or not known. + uint16_t debug_private_segment_buffer_sgpr; + + /// The maximum byte alignment of variables used by the kernel in + /// the specified memory segment. Expressed as a power of two. Must + /// be at least HSA_POWERTWO_16. + hsa_powertwo8_t kernarg_segment_alignment; + hsa_powertwo8_t group_segment_alignment; + hsa_powertwo8_t private_segment_alignment; + + uint8_t reserved3; + + /// Type of code object.
+ hsa_ext_code_kind32_t code_type; + + /// Reserved for code properties if any are defined in the future. + /// There are currently no code properties so this field must be 0. + uint32_t reserved4; + + /// Wavefront size expressed as a power of two. Must be a power of 2 + /// in range 1..64 inclusive. Used to support runtime query that + /// obtains wavefront size, which may be used by the application to + /// allocate dynamic group memory and set the dispatch work-group + /// size. + hsa_powertwo8_t wavefront_size; + + /// The optimization level specified when the kernel was + /// finalized. + uint8_t optimization_level; + + /// The HSAIL profile defines which features are used. This + /// information is from the HSAIL version directive. If this + /// amd_kernel_code_t is not generated from an HSAIL compilation + /// unit then must be 0. + hsa_ext_brig_profile8_t hsail_profile; + + /// The HSAIL machine model gives the address sizes used by the + /// code. This information is from the HSAIL version directive. If + /// not generated from an HSAIL compilation unit then must still + /// indicate for what machine mode the code is generated. + hsa_ext_brig_machine_model8_t hsail_machine_model; + + /// The HSAIL major version. This information is from the HSAIL + /// version directive. If this amd_kernel_code_t is not + /// generated from an HSAIL compilation unit then must be 0. + uint32_t hsail_version_major; + + /// The HSAIL minor version. This information is from the HSAIL + /// version directive. If this amd_kernel_code_t is not + /// generated from an HSAIL compilation unit then must be 0. + uint32_t hsail_version_minor; + + /// Reserved for HSAIL target options if any are defined in the + /// future. There are currently no target options so this field + /// must be 0. + uint16_t reserved5; + + /// Reserved. Must be 0. + uint16_t reserved6; + + /// The values should be the actual values used by the finalizer + /// in generating the code. This may be the union of values + /// specified as finalizer arguments and explicit HSAIL control + /// directives. If the finalizer chooses to ignore a control + /// directive, and not generate constrained code, then the control + /// directive should not be marked as enabled even though it was + /// present in the HSAIL or finalizer argument. The values are + /// intended to reflect the constraints that the code actually + /// requires to correctly execute, not the values that were + /// actually specified at finalize time. + hsa_ext_control_directives_t control_directive; + + /// The code can immediately follow the amd_kernel_code_t, or can + /// come after subsequent amd_kernel_code_t structs when there are + /// multiple kernels in the compilation unit.
+ +} amd_kernel_code_t; + +#endif // AMDKERNELCODET_H diff --git a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp index 7ad815d..3b4ba1a 100644 --- a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp @@ -163,23 +163,22 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, MCInst Inst; switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { - default: break; - case Match_Success: - Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); - return false; - case Match_MissingFeature: - return Error(IDLoc, "instruction use requires an option to be enabled"); - case Match_MnemonicFail: - return Error(IDLoc, "unrecognized instruction mnemonic"); - case Match_InvalidOperand: { - if (ErrorInfo != ~0ULL) { - if (ErrorInfo >= Operands.size()) - return Error(IDLoc, "too few operands for instruction"); - - } - return Error(IDLoc, "invalid operand for instruction") + case Match_Success: + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, STI); + return false; + case Match_MissingFeature: + return Error(IDLoc, "instruction use requires an option to be enabled"); + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction mnemonic"); + case Match_InvalidOperand: { + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + } + return Error(IDLoc, "invalid operand for instruction"); + } } llvm_unreachable("Implement any new match types added!"); } @@ -312,6 +311,7 @@ bool AMDGPUOperand::isSWaitCnt() const { /// Force static initialization. extern "C" void LLVMInitializeR600AsmParser() { RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget); + RegisterMCAsmParser<AMDGPUAsmParser> B(TheGCNTarget); } #define GET_REGISTER_MATCHER diff --git a/lib/Target/R600/CIInstructions.td b/lib/Target/R600/CIInstructions.td new file mode 100644 index 0000000..3ac7af8 --- /dev/null +++ b/lib/Target/R600/CIInstructions.td @@ -0,0 +1,42 @@ +//===-- CIInstructions.td - CI Instruction Definitions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===// + + +def isCIVI : Predicate < + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" +>; + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isCIVI in { + +defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64", + VOP_F64_F64, ftrunc +>; +defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64", + VOP_F64_F64, fceil +>; +defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64", + VOP_F64_F64, ffloor +>; +defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64", + VOP_F64_F64, frint +>; +defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32", + VOP_F32_F32 +>; +defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32", + VOP_F32_F32 +>; +} // End SubtargetPredicate = isCIVI diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index ed0a216..5a4bae2 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -43,6 +43,7 @@ add_llvm_target(R600CodeGen SIAnnotateControlFlow.cpp SIFixSGPRCopies.cpp SIFixSGPRLiveRanges.cpp + SIFoldOperands.cpp SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp @@ -50,6 +51,7 @@ add_llvm_target(R600CodeGen SILowerControlFlow.cpp SILowerI1Copies.cpp SIMachineFunctionInfo.cpp + SIPrepareScratchRegs.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp SITypeRewriter.cpp diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td index 58b5ce2..ba4df82 100644 --- a/lib/Target/R600/CaymanInstructions.td +++ b/lib/Target/R600/CaymanInstructions.td @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -def isCayman : Predicate<"Subtarget.hasCaymanISA()">; +def isCayman : Predicate<"Subtarget->hasCaymanISA()">; //===----------------------------------------------------------------------===// // Cayman Instructions @@ -46,7 +46,7 @@ def SIN_cm : SIN_Common<0x8D>; def COS_cm : COS_Common<0x8E>; } // End isVector = 1 -defm : RsqPat<RECIPSQRT_IEEE_cm, f32>; +def : RsqPat<RECIPSQRT_IEEE_cm, f32>; def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td index f24f76b..9f9472c 100644 --- a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -14,14 +14,14 @@ //===----------------------------------------------------------------------===// def isEG : Predicate< - "Subtarget.getGeneration() >= AMDGPUSubtarget::EVERGREEN && " - "Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "!Subtarget.hasCaymanISA()" + "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " + "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "!Subtarget->hasCaymanISA()" >; def isEGorCayman : Predicate< - "Subtarget.getGeneration() == AMDGPUSubtarget::EVERGREEN ||" - "Subtarget.getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" + "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||" + "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" >; //===----------------------------------------------------------------------===// @@ -69,7 +69,7 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; def RECIP_CLAMPED_eg : 
RECIP_CLAMPED_Common<0x84>; def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; -defm : RsqPat<RECIPSQRT_IEEE_eg, f32>; +def : RsqPat<RECIPSQRT_IEEE_eg, f32>; def SIN_eg : SIN_Common<0x8D>; def COS_eg : COS_Common<0x8E>; @@ -590,8 +590,6 @@ def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; // SHA-256 Patterns def : SHA256MaPattern <BFI_INT_eg, XOR_INT>; -def : FROUNDPat <CNDGE_eg, CNDGT_eg>; - def EG_ExportSwz : ExportSwzInst { let Word1{19-16} = 0; // BURST_COUNT let Word1{20} = 0; // VALID_PIXEL_MODE diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 64fe726..b66ed10 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -9,11 +9,11 @@ //===----------------------------------------------------------------------===// #include "AMDGPUInstPrinter.h" -#include "SIDefines.h" - #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/MathExtras.h" @@ -74,7 +74,7 @@ void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) { O << " offset:"; - printU16ImmOperand(MI, OpNo, O); + printU16ImmDecOperand(MI, OpNo, O); } } @@ -208,7 +208,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; } -void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) { +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { int32_t SImm = static_cast<int32_t>(Imm); if (SImm >= -16 && SImm <= 64) { O << SImm; @@ -233,9 +233,37 @@ void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) { O << "4.0"; else if (Imm == FloatToBits(-4.0f)) O << "-4.0"; - else { + else O << formatHex(static_cast<uint64_t>(Imm)); +} + +void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { + int64_t SImm = static_cast<int64_t>(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; + return; } + + if (Imm == DoubleToBits(0.0)) + O << "0.0"; + else if (Imm == DoubleToBits(1.0)) + O << "1.0"; + else if (Imm == DoubleToBits(-1.0)) + O << "-1.0"; + else if (Imm == DoubleToBits(0.5)) + O << "0.5"; + else if (Imm == DoubleToBits(-0.5)) + O << "-0.5"; + else if (Imm == DoubleToBits(2.0)) + O << "2.0"; + else if (Imm == DoubleToBits(-2.0)) + O << "-2.0"; + else if (Imm == DoubleToBits(4.0)) + O << "4.0"; + else if (Imm == DoubleToBits(-4.0)) + O << "-4.0"; + else + llvm_unreachable("64-bit literal constants not supported"); } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, @@ -253,14 +281,39 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, break; } } else if (Op.isImm()) { - printImmediate(Op.getImm(), O); + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + int RCID = Desc.OpInfo[OpNo].RegClass; + if (RCID != -1) { + const MCRegisterClass &ImmRC = MRI.getRegClass(RCID); + if (ImmRC.getSize() == 4) + printImmediate32(Op.getImm(), O); + else if (ImmRC.getSize() == 8) + printImmediate64(Op.getImm(), O); + else + llvm_unreachable("Invalid register class size"); + } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) { + printImmediate32(Op.getImm(), O); + } else { + // We hit this for the immediate instruction bits that don't yet have a + // custom printer. 
+ // TODO: Eventually this should be unnecessary. + O << formatDec(Op.getImm()); + } } else if (Op.isFPImm()) { - // We special case 0.0 because otherwise it will be printed as an integer. if (Op.getFPImm() == 0.0) O << "0.0"; - else - printImmediate(FloatToBits(Op.getFPImm()), O); + else { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass); + + if (ImmRC.getSize() == 4) + printImmediate32(FloatToBits(Op.getFPImm()), O); + else if (ImmRC.getSize() == 8) + printImmediate64(DoubleToBits(Op.getFPImm()), O); + else + llvm_unreachable("Invalid register class size"); + } } else if (Op.isExpr()) { const MCExpr *Exp = Op.getExpr(); Exp->print(O); diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 4c06ac0..1d43c7a 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -48,7 +48,8 @@ private: void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); - void printImmediate(uint32_t Imm, raw_ostream &O); + void printImmediate32(uint32_t I, raw_ostream &O); + void printImmediate64(uint64_t I, raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp index 5fb311b..d0c634f 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -29,7 +29,7 @@ public: const MCAsmLayout &Layout) override { //XXX: Implement if necessary. 
} - void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, + void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) override { diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 3c2b889..19d89fb 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -17,6 +17,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfoELF() { MaxInstLength = 16; SeparatorString = "\n"; CommentString = ";"; + PrivateLabelPrefix = ""; InlineAsmStart = ";#ASMSTART"; InlineAsmEnd = ";#ASMEND"; diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 8731055..83403ba 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -15,6 +15,7 @@ #include "AMDGPUMCTargetDesc.h" #include "AMDGPUMCAsmInfo.h" #include "InstPrinter/AMDGPUInstPrinter.h" +#include "SIDefines.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -92,20 +93,29 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, extern "C" void LLVMInitializeR600TargetMC() { RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget); + RegisterMCAsmInfo<AMDGPUMCAsmInfo> Z(TheGCNTarget); TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheGCNTarget, createAMDGPUMCCodeGenInfo); TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheGCNTarget, createAMDGPUMCInstrInfo); TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheGCNTarget, createAMDGPUMCRegisterInfo); TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheGCNTarget, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheGCNTarget, createAMDGPUMCInstPrinter); TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createAMDGPUMCCodeEmitter); TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheGCNTarget, createAMDGPUAsmBackend); TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheGCNTarget, createMCStreamer); } diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h index c019766..bc8cd53 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -30,6 +30,7 @@ class Target; class raw_ostream; extern Target TheAMDGPUTarget; +extern Target TheGCNTarget; MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index dc1344f..8a555ff 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -30,8 +30,8 @@ using 
namespace llvm; namespace { class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { - R600MCCodeEmitter(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION; - void operator=(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION; + R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; + void operator=(const R600MCCodeEmitter &) = delete; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index 999fd0d..7e23772 100644 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -14,10 +14,10 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "SIDefines.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" #include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixup.h" @@ -31,15 +31,9 @@ using namespace llvm; namespace { -/// \brief Helper type used in encoding -typedef union { - int32_t I; - float F; -} IntFloatUnion; - class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { - SIMCCodeEmitter(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION; - void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION; + SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; + void operator=(const SIMCCodeEmitter &) = delete; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; MCContext &Ctx; @@ -48,7 +42,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; /// \brief Encode an fp or int literal - uint32_t getLitEncoding(const MCOperand &MO) const; + uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const; public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, @@ -85,60 +79,107 @@ MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const { - unsigned RegClass = Desc.OpInfo[OpNo].RegClass; - return (AMDGPU::SSrc_32RegClassID == RegClass) || - (AMDGPU::SSrc_64RegClassID == RegClass) || - (AMDGPU::VSrc_32RegClassID == RegClass) || - (AMDGPU::VSrc_64RegClassID == RegClass) || - (AMDGPU::VCSrc_32RegClassID == RegClass) || - (AMDGPU::VCSrc_64RegClassID == RegClass); + unsigned OpType = Desc.OpInfo[OpNo].OperandType; + + return OpType == AMDGPU::OPERAND_REG_IMM32 || + OpType == AMDGPU::OPERAND_REG_INLINE_C; } -uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const { +// Returns the encoding value to use if the given integer is an integer inline +// immediate value, or 0 if it is not. 
+template <typename IntTy> +static uint32_t getIntInlineImmEncoding(IntTy Imm) { + if (Imm >= 0 && Imm <= 64) + return 128 + Imm; - IntFloatUnion Imm; - if (MO.isImm()) - Imm.I = MO.getImm(); - else if (MO.isFPImm()) - Imm.F = MO.getFPImm(); - else if (MO.isExpr()) - return 255; - else - return ~0; + if (Imm >= -16 && Imm <= -1) + return 192 + std::abs(Imm); - if (Imm.I >= 0 && Imm.I <= 64) - return 128 + Imm.I; + return 0; +} - if (Imm.I >= -16 && Imm.I <= -1) - return 192 + abs(Imm.I); +static uint32_t getLit32Encoding(uint32_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val)); + if (IntImm != 0) + return IntImm; - if (Imm.F == 0.5f) + if (Val == FloatToBits(0.5f)) return 240; - if (Imm.F == -0.5f) + if (Val == FloatToBits(-0.5f)) return 241; - if (Imm.F == 1.0f) + if (Val == FloatToBits(1.0f)) return 242; - if (Imm.F == -1.0f) + if (Val == FloatToBits(-1.0f)) return 243; - if (Imm.F == 2.0f) + if (Val == FloatToBits(2.0f)) return 244; - if (Imm.F == -2.0f) + if (Val == FloatToBits(-2.0f)) return 245; - if (Imm.F == 4.0f) + if (Val == FloatToBits(4.0f)) return 246; - if (Imm.F == -4.0f) + if (Val == FloatToBits(-4.0f)) return 247; return 255; } +static uint32_t getLit64Encoding(uint64_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == DoubleToBits(0.5)) + return 240; + + if (Val == DoubleToBits(-0.5)) + return 241; + + if (Val == DoubleToBits(1.0)) + return 242; + + if (Val == DoubleToBits(-1.0)) + return 243; + + if (Val == DoubleToBits(2.0)) + return 244; + + if (Val == DoubleToBits(-2.0)) + return 245; + + if (Val == DoubleToBits(4.0)) + return 246; + + if (Val == DoubleToBits(-4.0)) + return 247; + + return 255; +} + +uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, + unsigned OpSize) const { + if (MO.isExpr()) + return 255; + + assert(!MO.isFPImm()); + + if (!MO.isImm()) + return ~0; + + if (OpSize == 4) + return getLit32Encoding(static_cast<uint32_t>(MO.getImm())); + + assert(OpSize == 8); + + return getLit64Encoding(static_cast<uint64_t>(MO.getImm())); +} + void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -161,25 +202,24 @@ void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, if (!isSrcOperand(Desc, i)) continue; + int RCID = Desc.OpInfo[i].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + // Is this operand a literal immediate? const MCOperand &Op = MI.getOperand(i); - if (getLitEncoding(Op) != 255) + if (getLitEncoding(Op, RC.getSize()) != 255) continue; // Yes! Encode it - IntFloatUnion Imm; + int64_t Imm = 0; + if (Op.isImm()) - Imm.I = Op.getImm(); - else if (Op.isFPImm()) - Imm.F = Op.getFPImm(); - else { - assert(Op.isExpr()); - // This will be replaced with a fixup value. - Imm.I = 0; - } + Imm = Op.getImm(); + else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. 
+ llvm_unreachable("Must be immediate or expr"); for (unsigned j = 0; j < 4; j++) { - OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff)); + OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff)); } // Only one literal value allowed @@ -234,7 +274,10 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (isSrcOperand(Desc, OpNo)) { - uint32_t Enc = getLitEncoding(MO); + int RCID = Desc.OpInfo[OpNo].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + + uint32_t Enc = getLitEncoding(MO, RC.getSize()); if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) return Enc; diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index ce17d7c..fb5aa61 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -83,28 +83,44 @@ def : Proc<"cayman", R600_VLIW4_Itin, // Southern Islands //===----------------------------------------------------------------------===// -def : Proc<"SI", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"SI", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; -def : Proc<"tahiti", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"tahiti", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; -def : Proc<"pitcairn", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"verde", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"oland", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"hainan", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; //===----------------------------------------------------------------------===// // Sea Islands //===----------------------------------------------------------------------===// -def : Proc<"bonaire", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"bonaire", SIQuarterSpeedModel, [FeatureSeaIslands]>; -def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"kabini", SIQuarterSpeedModel, [FeatureSeaIslands]>; -def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"kaveri", SIQuarterSpeedModel, [FeatureSeaIslands]>; -def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"hawaii", SIFullSpeedModel, + [FeatureSeaIslands, FeatureFastFMAF32] +>; -def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"mullins", SIQuarterSpeedModel, [FeatureSeaIslands]>; + +//===----------------------------------------------------------------------===// +// Volcanic Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"tonga", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; + +def : ProcessorModel<"iceland", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; + +def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index edaf278..c8f37f6 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -39,14 +39,14 @@ struct CFStack { FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 }; - const AMDGPUSubtarget &ST; + const AMDGPUSubtarget *ST; std::vector<StackItem> BranchStack; std::vector<StackItem> LoopStack; 
unsigned MaxStackSize; unsigned CurrentEntries; unsigned CurrentSubEntries; - CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st), + CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st), // We need to reserve a stack entry for CALL_FS in vertex shaders. MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), CurrentEntries(0), CurrentSubEntries(0) { } @@ -76,11 +76,11 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) { } bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { - if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() && + if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && getLoopDepth() > 1) return true; - if (!ST.hasCFAluBug()) + if (!ST->hasCFAluBug()) return false; switch(Opcode) { @@ -91,7 +91,7 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { case AMDGPU::CF_ALU_CONTINUE: if (CurrentSubEntries == 0) return false; - if (ST.getWavefrontSize() == 64) { + if (ST->getWavefrontSize() == 64) { // We are being conservative here. We only require this work-around if // CurrentSubEntries > 3 && // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) @@ -102,7 +102,7 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { // resources without any problems. return CurrentSubEntries > 3; } else { - assert(ST.getWavefrontSize() == 32); + assert(ST->getWavefrontSize() == 32); // We are being conservative here. We only require the work-around if // CurrentSubEntries > 7 && // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) @@ -118,8 +118,8 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { default: return 0; case CFStack::FIRST_NON_WQM_PUSH: - assert(!ST.hasCaymanISA()); - if (ST.getGeneration() <= AMDGPUSubtarget::R700) { + assert(!ST->hasCaymanISA()); + if (ST->getGeneration() <= AMDGPUSubtarget::R700) { // +1 For the push operation. // +2 Extra space required. return 3; @@ -132,7 +132,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { return 2; } case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: - assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN); + assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); // +1 For the push operation. // +1 Extra space required. return 2; @@ -153,13 +153,14 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) { case AMDGPU::CF_PUSH_EG: case AMDGPU::CF_ALU_PUSH_BEFORE: if (!isWQM) { - if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) + if (!ST->hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI // See comment in // CFStack::getSubEntrySize() else if (CurrentEntries > 0 && - ST.getGeneration() > AMDGPUSubtarget::EVERGREEN && - !ST.hasCaymanISA() && + ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && + !ST->hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; else @@ -219,7 +220,7 @@ private: const R600InstrInfo *TII; const R600RegisterInfo *TRI; unsigned MaxFetchInst; - const AMDGPUSubtarget &ST; + const AMDGPUSubtarget *ST; bool IsTrivialInst(MachineInstr *MI) const { switch (MI->getOpcode()) { @@ -233,7 +234,7 @@ private: const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { unsigned Opcode = 0; - bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN); + bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); switch (CFI) { case CF_TC: Opcode = isEg ? 
AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; @@ -266,7 +267,7 @@ private: Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; break; case CF_END: - if (ST.hasCaymanISA()) { + if (ST->hasCaymanISA()) { Opcode = AMDGPU::CF_END_CM; break; } @@ -467,17 +468,14 @@ private: } public: - R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), - TII (nullptr), TRI(nullptr), - ST(tm.getSubtarget<AMDGPUSubtarget>()) { - const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>(); - MaxFetchInst = ST.getTexVTXClauseSize(); - } + R600ControlFlowFinalizer(TargetMachine &tm) + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); - TRI = static_cast<const R600RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + ST = &MF.getSubtarget<AMDGPUSubtarget>(); + MaxFetchInst = ST->getTexVTXClauseSize(); + TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo()); + TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo()); R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); CFStack CFStack(ST, MFI->getShaderType()); diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index a214e53..c738611 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -30,9 +30,9 @@ using namespace llvm; -R600TargetLowering::R600TargetLowering(TargetMachine &TM) : - AMDGPUTargetLowering(TM), - Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) { +R600TargetLowering::R600TargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); @@ -40,7 +40,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); - computeRegisterProperties(); + computeRegisterProperties(STI.getRegisterInfo()); // Set condition code actions setCondCodeAction(ISD::SETO, MVT::f32, Expand); @@ -122,12 +122,19 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address // spaces, so it is custom lowered to handle those where it isn't. 
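// A summary of the API shape used in the hunk below (a sketch, not itself
// part of the patch): the three-argument overload is
//
//   setLoadExtAction(ExtType, ValVT, MemVT, Action);
//
// where ValVT is the in-register result type and MemVT the type read from
// memory; the removed two-argument form keyed the action on the memory
// type alone. Looping over MVT::integer_valuetypes() therefore installs
// the same per-memory-type policy for every legal integer result type.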
- setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); + } setOperationAction(ISD::STORE, MVT::i8, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); @@ -181,8 +188,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SUBE, VT, Expand); } - setBooleanContents(ZeroOrNegativeOneBooleanContent); - setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); setSchedulingPreference(Sched::Source); } @@ -192,7 +197,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock::iterator I = *MI; const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo()); + static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); switch (MI->getOpcode()) { default: @@ -647,9 +652,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); MachineSDNode *interp; if (ijb < 0) { - const MachineFunction &MF = DAG.getMachineFunction(); - const R600InstrInfo *TII = static_cast<const R600InstrInfo *>( - MF.getSubtarget().getInstrInfo()); + const R600InstrInfo *TII = + static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32)); return DAG.getTargetExtractSubreg( @@ -1115,6 +1119,13 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const SDValue CC = Op.getOperand(4); SDValue Temp; + if (VT == MVT::f32) { + DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); + SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + if (MinMax) + return MinMax; + } + // LHS and RHS are guaranteed to be the same value type EVT CompareVT = LHS.getValueType(); @@ -1369,8 +1380,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( - getTargetMachine().getSubtargetImpl()->getFrameLowering()); + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1567,8 +1578,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( - getTargetMachine().getSubtargetImpl()->getFrameLowering()); + const 
AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1682,7 +1693,7 @@ SDValue R600TargetLowering::LowerFormalArguments( // XXX - I think PartOffset should give you this, but it seems to give the // size of the register which isn't useful. - unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset(); + unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); unsigned PartOffset = VA.getLocMemOffset(); unsigned Offset = 36 + VA.getLocMemOffset(); @@ -2172,9 +2183,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, unsigned Opcode = Node->getMachineOpcode(); SDValue FakeOp; - std::vector<SDValue> Ops; - for (const SDUse &I : Node->ops()) - Ops.push_back(I); + std::vector<SDValue> Ops(Node->op_begin(), Node->op_end()); if (Opcode == AMDGPU::DOT_4) { int OperandIdx[] = { @@ -2236,10 +2245,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, AMDGPU::OpName::clamp); if (ClampIdx < 0) return Node; - std::vector<SDValue> Ops; - unsigned NumOp = Src.getNumOperands(); - for(unsigned i = 0; i < NumOp; ++i) - Ops.push_back(Src.getOperand(i)); + std::vector<SDValue> Ops(Src->op_begin(), Src->op_end()); Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32); return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node), Node->getVTList(), Ops); diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index 10ebc10..c547195 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -23,7 +23,7 @@ class R600InstrInfo; class R600TargetLowering : public AMDGPUTargetLowering { public: - R600TargetLowering(TargetMachine &TM); + R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock * BB) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index b6c00f8..291fb04 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -335,10 +335,11 @@ def load_param : LoadParamFrag<load>; def load_param_exti8 : LoadParamFrag<az_extloadi8>; def load_param_exti16 : LoadParamFrag<az_extloadi16>; -def isR600 : Predicate<"Subtarget.getGeneration() <= AMDGPUSubtarget::R700">; +def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; -def isR600toCayman : Predicate< - "Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; +def isR600toCayman + : Predicate< + "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; //===----------------------------------------------------------------------===// // R600 SDNodes @@ -579,6 +580,7 @@ i32imm:$COUNT, i32imm:$Enabled), let ALT_CONST = 0; let WHOLE_QUAD_MODE = 0; let BARRIER = 1; + let isCodeGenOnly = 1; let UseNamedOperandTable = 1; let Inst{31-0} = Word0; @@ -641,6 +643,7 @@ def FETCH_CLAUSE : AMDGPUInst <(outs), field bits<8> Inst; bits<8> num; let Inst = num; + let isCodeGenOnly = 1; } def ALU_CLAUSE : AMDGPUInst <(outs), @@ -648,10 +651,13 @@ def ALU_CLAUSE : AMDGPUInst <(outs), field bits<8> Inst; bits<8> num; let Inst = num; + let isCodeGenOnly = 1; } def LITERALS : AMDGPUInst <(outs), (ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > { + let isCodeGenOnly = 1; + field bits<64> Inst; bits<32> literal1; bits<32> 
literal2; @@ -698,7 +704,7 @@ def SGE : R600_2OP < def SNE : R600_2OP < 0xB, "SETNE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))] >; def SETE_DX10 : R600_2OP < @@ -716,9 +722,10 @@ def SETGE_DX10 : R600_2OP < [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))] >; +// FIXME: This should probably be COND_ONE def SETNE_DX10 : R600_2OP < 0xF, "SETNE_DX10", - [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE))] + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))] >; def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; @@ -913,7 +920,7 @@ class MULADD_Common <bits<5> inst> : R600_3OP < class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < inst, "MULADD_IEEE", - [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] + [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))] >; class FMA_Common <bits<5> inst> : R600_3OP < @@ -1141,16 +1148,6 @@ class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ie (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) >; -// FROUND pattern -class FROUNDPat<Instruction CNDGE, Instruction CNDGT> : Pat < - (AMDGPUround f32:$x), - (CNDGE $x, - (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)), - (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)) - ) ->; - - //===----------------------------------------------------------------------===// // R600 / R700 Instructions //===----------------------------------------------------------------------===// @@ -1192,9 +1189,7 @@ let Predicates = [isR600] in { def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; - defm : RsqPat<RECIPSQRT_IEEE_r600, f32>; - - def : FROUNDPat <CNDGE_r600, CNDGT_r600>; + def : RsqPat<RECIPSQRT_IEEE_r600, f32>; def R600_ExportSwz : ExportSwzInst { let Word1{20-17} = 0; // BURST_COUNT @@ -1248,6 +1243,7 @@ let Predicates = [isR600] in { def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR), "PUSH_ELSE @$ADDR"> { let CNT = 0; + let POP_COUNT = 0; // FIXME? 
} def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "ELSE @$ADDR POP:$POP_COUNT"> { @@ -1364,7 +1360,7 @@ def CONST_COPY : Instruction { let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; let AsmString = "CONST_COPY"; - let neverHasSideEffects = 1; + let hasSideEffects = 0; let isAsCheapAsAMove = 1; let Itinerary = NullALU; } diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index d782713..bcde5fb 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -16,7 +16,7 @@ #include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Pass.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -26,17 +26,16 @@ using namespace llvm; void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); DAG = static_cast<ScheduleDAGMILive*>(dag); + const AMDGPUSubtarget &ST = DAG->MF.getSubtarget<AMDGPUSubtarget>(); TII = static_cast<const R600InstrInfo*>(DAG->TII); TRI = static_cast<const R600RegisterInfo*>(DAG->TRI); - VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); + VLIW5 = !ST.hasCaymanISA(); MRI = &DAG->MRI; CurInstKind = IDOther; CurEmitted = 0; OccupedSlotsMask = 31; InstKindLimit[IDAlu] = TII->getMaxAlusPerClause(); InstKindLimit[IDOther] = 32; - - const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>(); InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); AluInstCount = 0; FetchInstCount = 0; diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index ddf68c9..deee5bc 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -153,7 +153,7 @@ public: TII(static_cast<const R600InstrInfo *>( MF.getSubtarget().getInstrInfo())), TRI(TII->getRegisterInfo()) { - VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); + VLIW5 = !MF.getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); } // initPacketizerState - initialize some internal flags. 
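A pattern that recurs through these R600/SI hunks: rather than reading the subtarget off the TargetMachine when a pass is constructed, the subtarget (and anything derived from it, such as MaxFetchInst or the VLIW5 flag) is now resolved once per function through MF.getSubtarget<AMDGPUSubtarget>(), and the TableGen predicate strings change from "Subtarget." to "Subtarget->" to match the pointer the lowering classes now hold. A minimal sketch of the new shape; the pass name and body are hypothetical, not taken from the patch:

    #include "AMDGPUSubtarget.h"
    #include "llvm/CodeGen/MachineFunctionPass.h"

    using namespace llvm;

    namespace {
    // Hypothetical pass, illustrating the idiom only.
    class SubtargetQueryExample : public MachineFunctionPass {
      const AMDGPUSubtarget *ST; // resolved per function, not at construction

    public:
      static char ID;

      SubtargetQueryExample() : MachineFunctionPass(ID), ST(nullptr) {}

      bool runOnMachineFunction(MachineFunction &MF) override {
        // The subtarget can differ between functions, so look it up here.
        ST = &MF.getSubtarget<AMDGPUSubtarget>();
        bool VLIW5 = !ST->hasCaymanISA(); // feature queries via the pointer
        (void)VLIW5;
        return false; // this sketch rewrites nothing
      }
    };

    char SubtargetQueryExample::ID = 0;
    } // end anonymous namespace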
diff --git a/lib/Target/R600/R700Instructions.td b/lib/Target/R600/R700Instructions.td index 9aad85d..613a0d7 100644 --- a/lib/Target/R600/R700Instructions.td +++ b/lib/Target/R600/R700Instructions.td @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -def isR700 : Predicate<"Subtarget.getGeneration() == AMDGPUSubtarget::R700">; +def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">; let Predicates = [isR700] in { def SIN_r700 : SIN_Common<0x6E>; diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp index 91eb60b..79f6532 100644 --- a/lib/Target/R600/SIAnnotateControlFlow.cpp +++ b/lib/Target/R600/SIAnnotateControlFlow.cpp @@ -14,6 +14,7 @@ #include "AMDGPU.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" @@ -66,6 +67,8 @@ class SIAnnotateControlFlow : public FunctionPass { DominatorTree *DT; StackVector Stack; + LoopInfo *LI; + bool isTopOfStack(BasicBlock *BB); Value *popSaved(); @@ -99,6 +102,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); FunctionPass::getAnalysisUsage(AU); @@ -277,10 +281,26 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); push(Term->getSuccessor(0), Arg); -} - -/// \brief Close the last opened control flow +}/// \brief Close the last opened control flow void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { + llvm::Loop *L = LI->getLoopFor(BB); + + if (L && L->getHeader() == BB) { + // We can't insert an EndCF call into a loop header, because it will + // get executed on every iteration of the loop, when it should be + // executed only once before the loop. + SmallVector <BasicBlock*, 8> Latches; + L->getLoopLatches(Latches); + + std::vector<BasicBlock*> Preds; + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { + if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) + Preds.push_back(*PI); + } + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, + LI, false); + } + CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); } @@ -288,6 +308,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { /// recognize if/then/else and loops. bool SIAnnotateControlFlow::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h index 2e7dab6..b540140 100644 --- a/lib/Target/R600/SIDefines.h +++ b/lib/Target/R600/SIDefines.h @@ -8,25 +8,49 @@ /// \file //===----------------------------------------------------------------------===// +#include "llvm/MC/MCInstrDesc.h" + #ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H #define LLVM_LIB_TARGET_R600_SIDEFINES_H namespace SIInstrFlags { // This needs to be kept in sync with the field bits in InstSI. 
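// (Sketch, not part of the patch.) The reworked enum below grows the
// TSFlags bit set from bits 3..12 to bits 3..20 and groups it by format:
// scalar ALU (SOP1/SOP2/SOPC/SOPK/SOPP), vector ALU (VOP1/VOP2/VOP3/VOPC),
// and memory (MUBUF/MTBUF/SMRD/DS/MIMG/FLAT), plus the umbrella SALU/VALU
// bits and WQM. Code typically tests these straight off the instruction
// descriptor, e.g.:
//
//   bool IsVALU = (Desc.TSFlags & SIInstrFlags::VALU) != 0;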
enum { - MIMG = 1 << 3, - SMRD = 1 << 4, - VOP1 = 1 << 5, - VOP2 = 1 << 6, - VOP3 = 1 << 7, - VOPC = 1 << 8, - SALU = 1 << 9, - MUBUF = 1 << 10, - MTBUF = 1 << 11, - FLAT = 1 << 12 + SALU = 1 << 3, + VALU = 1 << 4, + + SOP1 = 1 << 5, + SOP2 = 1 << 6, + SOPC = 1 << 7, + SOPK = 1 << 8, + SOPP = 1 << 9, + + VOP1 = 1 << 10, + VOP2 = 1 << 11, + VOP3 = 1 << 12, + VOPC = 1 << 13, + + MUBUF = 1 << 14, + MTBUF = 1 << 15, + SMRD = 1 << 16, + DS = 1 << 17, + MIMG = 1 << 18, + FLAT = 1 << 19, + WQM = 1 << 20 }; } +namespace llvm { +namespace AMDGPU { + enum OperandType { + /// Operand with register or 32-bit immediate + OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET, + /// Operand with register or inline constant + OPERAND_REG_INLINE_C + }; +} +} + namespace SIInstrFlags { enum Flags { // First 4 bits are the instruction encoding @@ -34,6 +58,21 @@ namespace SIInstrFlags { EXP_CNT = 1 << 1, LGKM_CNT = 1 << 2 }; + + // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. + // The result is true if any of these tests are true. + enum ClassFlags { + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity + }; } namespace SISrcMods { @@ -61,7 +100,14 @@ namespace SIOutMods { #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) #define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C -#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) +#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8) +#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9) +#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10) +#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11) + #define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC @@ -118,4 +164,8 @@ namespace SIOutMods { #define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 #define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 +#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) + + #endif diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp index d6f4b4c..cd1b3ac 100644 --- a/lib/Target/R600/SIFixSGPRCopies.cpp +++ b/lib/Target/R600/SIFixSGPRCopies.cpp @@ -136,12 +136,12 @@ const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( const MachineRegisterInfo &MRI, unsigned Reg, unsigned SubReg) const { - // The Reg parameter to the function must always be defined by either a PHI - // or a COPY, therefore it cannot be a physical register. - assert(TargetRegisterInfo::isVirtualRegister(Reg) && - "Reg cannot be a physical register"); - const TargetRegisterClass *RC = MRI.getRegClass(Reg); + const TargetRegisterClass *RC + = TargetRegisterInfo::isVirtualRegister(Reg) ? 
+ MRI.getRegClass(Reg) : + TRI->getRegClass(Reg); + RC = TRI->getSubRegClass(RC, SubReg); for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { @@ -182,7 +182,12 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, unsigned DstReg = Copy.getOperand(0).getReg(); unsigned SrcReg = Copy.getOperand(1).getReg(); unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); - const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + + const TargetRegisterClass *DstRC + = TargetRegisterInfo::isVirtualRegister(DstReg) ? + MRI.getRegClass(DstReg) : + TRI->getRegClass(DstReg); + const TargetRegisterClass *SrcRC; if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || @@ -217,20 +222,21 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { switch (MI.getOpcode()) { default: continue; case AMDGPU::PHI: { - DEBUG(dbgs() << " Fixing PHI:\n"); - DEBUG(MI.print(dbgs())); + DEBUG(dbgs() << "Fixing PHI: " << MI); - for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { - unsigned Reg = MI.getOperand(i).getReg(); - const TargetRegisterClass *RC = inferRegClassFromDef(TRI, MRI, Reg, - MI.getOperand(0).getSubReg()); - MRI.constrainRegClass(Reg, RC); + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + const MachineOperand &Op = MI.getOperand(i); + unsigned Reg = Op.getReg(); + const TargetRegisterClass *RC + = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); + + MRI.constrainRegClass(Op.getReg(), RC); } unsigned Reg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, MI.getOperand(0).getSubReg()); - if (TRI->getCommonSubClass(RC, &AMDGPU::VReg_32RegClass)) { - MRI.constrainRegClass(Reg, &AMDGPU::VReg_32RegClass); + if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { + MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); } if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp new file mode 100644 index 0000000..ae4b05d --- /dev/null +++ b/lib/Target/R600/SIFoldOperands.cpp @@ -0,0 +1,287 @@ +//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-fold-operands" +using namespace llvm; + +namespace { + +class SIFoldOperands : public MachineFunctionPass { +public: + static char ID; + +public: + SIFoldOperands() : MachineFunctionPass(ID) { + initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fold Operands"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +struct FoldCandidate { + MachineInstr *UseMI; + unsigned UseOpNo; + MachineOperand *OpToFold; + uint64_t ImmToFold; + + FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : + UseMI(MI), UseOpNo(OpNo) { + + if (FoldOp->isImm()) { + OpToFold = nullptr; + ImmToFold = FoldOp->getImm(); + } else { + assert(FoldOp->isReg()); + OpToFold = FoldOp; + } + } + + bool isImm() const { + return !OpToFold; + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) + +char SIFoldOperands::ID = 0; + +char &llvm::SIFoldOperandsID = SIFoldOperands::ID; + +FunctionPass *llvm::createSIFoldOperandsPass() { + return new SIFoldOperands(); +} + +static bool isSafeToFold(unsigned Opcode) { + switch(Opcode) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::COPY: + return true; + default: + return false; + } +} + +static bool updateOperand(FoldCandidate &Fold, + const TargetRegisterInfo &TRI) { + MachineInstr *MI = Fold.UseMI; + MachineOperand &Old = MI->getOperand(Fold.UseOpNo); + assert(Old.isReg()); + + if (Fold.isImm()) { + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } + + MachineOperand *New = Fold.OpToFold; + if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && + TargetRegisterInfo::isVirtualRegister(New->getReg())) { + Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + return true; + } + + // FIXME: Handle physical registers. + + return false; +} + +static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, + MachineInstr *MI, unsigned OpNo, + MachineOperand *OpToFold, + const SIInstrInfo *TII) { + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { + // Operand is not legal, so try to commute the instruction to + // see if this makes it possible to fold. 
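// A sketch of the case this rescues: on SI, a VOP2 source that must be a
// VGPR (src1) cannot take the folded immediate, but the commutable src0
// slot can, so the fold becomes legal after commuting the instruction and
// retargeting OpNo to the partner index:
//
//   %v2 = V_ADD_I32_e32 %v1, 64   ; immediate in src1: rejected
//   %v2 = V_ADD_I32_e32 64, %v1   ; after commuting: accepted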
+ unsigned CommuteIdx0; + unsigned CommuteIdx1; + bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); + + if (CanCommute) { + if (CommuteIdx0 == OpNo) + OpNo = CommuteIdx1; + else if (CommuteIdx1 == OpNo) + OpNo = CommuteIdx0; + } + + if (!CanCommute || !TII->commuteInstruction(MI)) + return false; + + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) + return false; + } + + FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); + return true; +} + +bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (!isSafeToFold(MI.getOpcode())) + continue; + + unsigned OpSize = TII->getOpSize(MI, 1); + MachineOperand &OpToFold = MI.getOperand(1); + bool FoldingImm = OpToFold.isImm(); + + // FIXME: We could also be folding things like FrameIndexes and + // TargetIndexes. + if (!FoldingImm && !OpToFold.isReg()) + continue; + + // Folding immediates with more than one use will increase program size. + // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. + if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) && + !MRI.hasOneUse(MI.getOperand(0).getReg())) + continue; + + // FIXME: Fold operands with subregs. + if (OpToFold.isReg() && + (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || + OpToFold.getSubReg())) + continue; + + std::vector<FoldCandidate> FoldList; + for (MachineRegisterInfo::use_iterator + Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); + Use != E; ++Use) { + + MachineInstr *UseMI = Use->getParent(); + const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); + + // FIXME: Fold operands with subregs. + if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || + UseOp.isImplicit())) { + continue; + } + + APInt Imm; + + if (FoldingImm) { + unsigned UseReg = UseOp.getReg(); + const TargetRegisterClass *UseRC + = TargetRegisterInfo::isVirtualRegister(UseReg) ? + MRI.getRegClass(UseReg) : + TRI.getRegClass(UseReg); + + Imm = APInt(64, OpToFold.getImm()); + + // Split 64-bit constants into 32-bits for folding. + if (UseOp.getSubReg()) { + if (UseRC->getSize() != 8) + continue; + + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV. + if (UseMI->getOpcode() == AMDGPU::COPY) { + unsigned DestReg = UseMI->getOperand(0).getReg(); + const TargetRegisterClass *DestRC + = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI.getRegClass(DestReg) : + TRI.getRegClass(DestReg); + + unsigned MovOp = TII->getMovOpcode(DestRC); + if (MovOp == AMDGPU::COPY) + continue; + + UseMI->setDesc(TII->get(MovOp)); + } + } + + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. 
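// For example: PHI is variadic, and generic opcodes such as COPY carry no
// register-class constraint in their operand info (RegClass == -1), so
// there would be nothing for the fold's legality check to validate
// against; such uses are skipped rather than guessed at.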
+ if (UseDesc.isVariadic() || + UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) + continue; + + if (FoldingImm) { + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); + continue; + } + + tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunites. The shrink operands pass + // already does this. + } + + for (FoldCandidate &Fold : FoldList) { + if (updateOperand(Fold, TRI)) { + // Clear kill flags. + if (!Fold.isImm()) { + assert(Fold.OpToFold && Fold.OpToFold->isReg()); + Fold.OpToFold->setIsKill(false); + } + DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << + Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); + } + } + } + } + return false; +} diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 8d4164a..7d794b8 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -35,8 +35,9 @@ using namespace llvm; -SITargetLowering::SITargetLowering(TargetMachine &TM) : - AMDGPUTargetLowering(TM) { +SITargetLowering::SITargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); @@ -44,7 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); @@ -59,22 +60,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); - computeRegisterProperties(); - - // Condition Codes - setCondCodeAction(ISD::SETONE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); - setCondCodeAction(ISD::SETULE, MVT::f32, Expand); - setCondCodeAction(ISD::SETULT, MVT::f32, Expand); - - setCondCodeAction(ISD::SETONE, MVT::f64, Expand); - setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f64, Expand); - setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); - setCondCodeAction(ISD::SETULE, MVT::f64, Expand); - setCondCodeAction(ISD::SETULT, MVT::f64, Expand); + computeRegisterProperties(STI.getRegisterInfo()); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); @@ -104,12 +90,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::v16i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); - setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); - setOperationAction(ISD::SELECT, MVT::f32, Promote); - AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); @@ -147,26 +129,34 @@ 
SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand); - - setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - - setTruncStoreAction(MVT::i32, MVT::i8, Custom); - setTruncStoreAction(MVT::i32, MVT::i16, Custom); + for (MVT VT : MVT::integer_valuetypes()) { + if (VT == MVT::i64) + continue; + + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); + } + + for (MVT VT : MVT::fp_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); @@ -213,13 +203,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : } } - for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) { - MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I); - setOperationAction(ISD::FTRUNC, VT, Expand); - setOperationAction(ISD::FCEIL, VT, Expand); - setOperationAction(ISD::FFLOOR, VT, Expand); - } - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -228,6 +211,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : } setOperationAction(ISD::FDIV, MVT::f32, Custom); + setOperationAction(ISD::FDIV, MVT::f64, Custom); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); @@ -235,7 +219,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); - + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::UINT_TO_FP); // All memory operations. 
Some folding on the pointer operand is done to help @@ -315,7 +300,7 @@ bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, return true; } -bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, +bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, bool *IsFast) const { @@ -327,9 +312,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (!VT.isSimple() || VT == MVT::Other) return false; - // XXX - CI changes say "Support for unaligned memory accesses" but I don't - // see what for specifically. The wording everywhere else seems to be the - // same. + // TODO - CI+ supports unaligned memory accesses, but this requires driver + // support. // XXX - The only mention I see of this in the ISA manual is for LDS direct // reads the "byte address and must be dword aligned". Is it also true for the @@ -341,12 +325,18 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return Align % 4 == 0; } + // Smaller than dword value must be aligned. + // FIXME: This should be allowed on CI+ + if (VT.bitsLT(MVT::i32)) + return false; + // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the // byte-address are ignored, thus forcing Dword alignment. // This applies to private, global, and constant memory. if (IsFast) *IsFast = true; - return VT.bitsGT(MVT::i32); + + return VT.bitsGT(MVT::i32) && Align % 4 == 0; } EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, @@ -379,8 +369,8 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const { bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); return TII->isInlineConstant(Imm); } @@ -413,16 +403,11 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, } SDValue SITargetLowering::LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { - - const TargetMachine &TM = getTargetMachine(); + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo()); + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); @@ -461,7 +446,7 @@ SDValue SITargetLowering::LowerFormalArguments( // We REALLY want the ORIGINAL number of vertex elements here, e.g. a // three or five element vertex only needs three or five registers, // NOT four or eigth. 
- Type *ParamType = FType->getParamType(Arg.OrigArgIndex); + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); for (unsigned j = 0; j != NumElements; ++j) { @@ -489,7 +474,10 @@ SDValue SITargetLowering::LowerFormalArguments( // The pointer to the list of arguments is stored in SGPR0, SGPR1 // The pointer to the scratch buffer is stored in SGPR2, SGPR3 if (Info->getShaderType() == ShaderType::COMPUTE) { - Info->NumUserSGPRs = 4; + if (Subtarget->isAmdHsaOS()) + Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. + else + Info->NumUserSGPRs = 4; unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); @@ -541,7 +529,7 @@ SDValue SITargetLowering::LowerFormalArguments( Offset, Ins[i].Flags.isSExt()); const PointerType *ParamTy = - dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex)); + dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // On SI local pointers are just offsets into LDS, so they are always @@ -576,7 +564,7 @@ SDValue SITargetLowering::LowerFormalArguments( if (Arg.VT.isVector()) { // Build a vector from the registers - Type *ParamType = FType->getParamType(Arg.OrigArgIndex); + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); SmallVector<SDValue, 4> Regs; @@ -589,8 +577,7 @@ SDValue SITargetLowering::LowerFormalArguments( // Fill up the missing vector elements NumElements = Arg.VT.getVectorNumElements() - NumElements; - for (unsigned j = 0; j != NumElements; ++j) - Regs.push_back(DAG.getUNDEF(VT)); + Regs.append(NumElements, DAG.getUNDEF(VT)); InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); continue; @@ -598,6 +585,12 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } + + if (Info->getShaderType() != ShaderType::COMPUTE) { + unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>( + AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); + Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + } return Chain; } @@ -605,25 +598,14 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); - case AMDGPU::BRANCH: return BB; - case AMDGPU::V_SUB_F64: { - unsigned DestReg = MI->getOperand(0).getReg(); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg) - .addImm(0) // SRC0 modifiers - .addReg(MI->getOperand(1).getReg()) - .addImm(1) // SRC1 modifiers - .addReg(MI->getOperand(2).getReg()) - .addImm(0) // CLAMP - .addImm(0); // OMOD - MI->eraseFromParent(); - break; - } + case AMDGPU::BRANCH: + return BB; case AMDGPU::SI_RegisterStorePseudo: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -640,17 +622,43 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( return BB; } -EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT 
VT) const { +bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { + // This currently forces unfolding various combinations of fsub into fma with + // free fneg'd operands. As long as we have fast FMA (controlled by + // isFMAFasterThanFMulAndFAdd), we should perform these. + + // When fma is quarter rate, for f64 where add / sub are at best half rate, + // most of these combines appear to be cycle neutral but save on instruction + // count / code size. + return true; +} + +EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const { if (!VT.isVector()) { return MVT::i1; } - return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); } MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { return MVT::i32; } +// Answering this is somewhat tricky and depends on the specific device which +// have different rates for fma or all f64 operations. +// +// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other +// regardless of which device (although the number of cycles differs between +// devices), so it is always profitable for f64. +// +// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable +// only on full rate devices. Normally, we should prefer selecting v_mad_f32 +// which we can always do even without fused FP ops since it returns the same +// result as the separate operations and since it is always full +// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 +// however does not support denormals, so we do report fma as faster if we have +// a fast fma device and require denormals. +// bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); @@ -659,7 +667,11 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: - return false; /* There is V_MAD_F32 for f32 */ + // This is as fast on some subtargets. However, we always have full rate f32 + // mad available which returns the same result as the separate operations + // which we should prefer over fma. We can't use this if we want to support + // denormals, so only report this in these cases. 
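// The f32 policy implemented below, in table form:
//
//   fp32-denormals | fast-fmaf | report fma faster?
//   ---------------+-----------+--------------------
//        no        |  either   |  no  (keep full-rate v_mad_f32)
//        yes       |   no      |  no
//        yes       |   yes     |  yes
//
// (f64 always answers yes, as the case below it shows.)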
+ return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); case MVT::f64: return true; default: @@ -755,15 +767,12 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); // Build the result and - SmallVector<EVT, 4> Res; - for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i) - Res.push_back(Intr->getValueType(i)); + ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); // operands of the new intrinsic call SmallVector<SDValue, 4> Ops; Ops.push_back(BRCOND.getOperand(0)); - for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i) - Ops.push_back(Intr->getOperand(i)); + Ops.append(Intr->op_begin() + 1, Intr->op_end()); Ops.push_back(Target); // build the new intrinsic call @@ -839,7 +848,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo*>(MF.getSubtarget().getRegisterInfo()); + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -889,13 +898,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { @@ -1090,7 +1099,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { const APFloat K1Val(BitsToFloat(0x2f800000)); const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32); - const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, MVT::f32); EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); @@ -1108,7 +1117,70 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(); + if (DAG.getTarget().Options.UnsafeFPMath) + return LowerFastFDIV(Op, DAG); + + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + const SDValue One = DAG.getConstantFP(1.0, MVT::f64); + + SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); + + SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); + + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); + + SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); + + SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); + + SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); + + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); + + SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); + + SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, 
MVT::f64, DivScale1, Fma3); + + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, + NegDivScale0, Mul, DivScale1); + + SDValue Scale; + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // Workaround a hardware bug on SI where the condition output from div_scale + // is not usable. + + const SDValue Hi = DAG.getConstant(1, MVT::i32); + + // Figure out if the scale to use for div_fmas. + SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); + SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); + SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); + + SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); + SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); + + SDValue Scale0Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); + SDValue Scale1Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); + + SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); + SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); + Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); + } else { + Scale = DivScale1.getValue(1); + } + + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, + Fma4, Fma3, Mul, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); } SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { @@ -1129,11 +1201,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Store->getMemoryVT(); // These stores are legal. - if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && - VT.isVector() && VT.getVectorNumElements() == 2 && - VT.getVectorElementType() == MVT::i32) - return SDValue(); - if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { if (VT.isVector() && VT.getVectorNumElements() > 4) return ScalarizeVectorStore(Op, DAG); @@ -1177,7 +1244,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { //===----------------------------------------------------------------------===// SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI) { + DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); EVT ScalarVT = VT.getScalarType(); if (ScalarVT != MVT::f32) @@ -1225,8 +1292,21 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); - LoadSDNode *Load = cast<LoadSDNode>(Src); + + unsigned AS = Load->getAddressSpace(); + unsigned Align = Load->getAlignment(); + Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); + + // Don't try to replace the load if we have to expand it due to alignment + // problems. Otherwise we will end up scalarizing the load, and trying to + // repack into the vector for no real reason. 
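// A sketch of the case being avoided: a one-byte-aligned <4 x i8> load
// would be rebuilt here as a dword-sized extending load, legalization
// would then split that misaligned load back up (SI wants dword
// alignment), and the pieces would be re-packed into a vector anyway;
// a net loss versus leaving the original load alone.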
+ if (Align < ABIAlignment && + !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { + return SDValue(); + } + SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, Load->getChain(), Load->getBasePtr(), @@ -1297,8 +1377,8 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, if (!CAdd) return SDValue(); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); // If the resulting offset is too large, we can't fold it into the addressing // mode offset. @@ -1316,6 +1396,102 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); } +SDValue SITargetLowering::performAndCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + + // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> + // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if (LHS.getOpcode() == ISD::SETCC && + RHS.getOpcode() == ISD::SETCC) { + ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); + ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); + + SDValue X = LHS.getOperand(0); + SDValue Y = RHS.getOperand(0); + if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) + return SDValue(); + + if (LCC == ISD::SETO) { + if (X != LHS.getOperand(1)) + return SDValue(); + + if (RCC == ISD::SETUNE) { + const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); + if (!C1 || !C1->isInfinity() || C1->isNegative()) + return SDValue(); + + const uint32_t Mask = SIInstrFlags::N_NORMAL | + SIInstrFlags::N_SUBNORMAL | + SIInstrFlags::N_ZERO | + SIInstrFlags::P_ZERO | + SIInstrFlags::P_SUBNORMAL | + SIInstrFlags::P_NORMAL; + + static_assert(((~(SIInstrFlags::S_NAN | + SIInstrFlags::Q_NAN | + SIInstrFlags::N_INFINITY | + SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, + "mask not equal"); + + return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1, + X, DAG.getConstant(Mask, MVT::i32)); + } + } + } + + return SDValue(); +} + +SDValue SITargetLowering::performOrCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) + if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && + RHS.getOpcode() == AMDGPUISD::FP_CLASS) { + SDValue Src = LHS.getOperand(0); + if (Src != RHS.getOperand(0)) + return SDValue(); + + const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); + if (!CLHS || !CRHS) + return SDValue(); + + // Only 10 bits are used. 
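// Worked example, using the ClassFlags values from SIDefines.h:
//
//   (or (fp_class x, N_INFINITY), (fp_class x, P_INFINITY))
//     -> (fp_class x, N_INFINITY | P_INFINITY)   ; mask (1<<2)|(1<<9) = 0x204
//
// i.e. two class tests of the same source collapse into one
// "is x infinite" test.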
+ static const uint32_t MaxMask = 0x3ff; + + uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; + return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1, + Src, DAG.getConstant(NewMask, MVT::i32)); + } + + return SDValue(); +} + +SDValue SITargetLowering::performClassCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue Mask = N->getOperand(1); + + // fp_class x, 0 -> false + if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { + if (CMask->isNullValue()) + return DAG.getConstant(0, MVT::i1); + } + + return SDValue(); +} + static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { switch (Opc) { case ISD::FMAXNUM: @@ -1371,33 +1547,47 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performSetCCCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + EVT VT = LHS.getValueType(); + + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + + // Match isinf pattern + // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { + const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); + if (!CRHS) + return SDValue(); + + const APFloat &APF = CRHS->getValueAPF(); + if (APF.isInfinity() && !APF.isNegative()) { + unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; + return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, + LHS.getOperand(0), DAG.getConstant(Mask, MVT::i32)); + } + } + + return SDValue(); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); - EVT VT = N->getValueType(0); switch (N->getOpcode()) { - default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - case ISD::SETCC: { - SDValue Arg0 = N->getOperand(0); - SDValue Arg1 = N->getOperand(1); - SDValue CC = N->getOperand(2); - ConstantSDNode * C = nullptr; - ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get(); - - // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) - if (VT == MVT::i1 - && Arg0.getOpcode() == ISD::SIGN_EXTEND - && Arg0.getOperand(0).getValueType() == MVT::i1 - && (C = dyn_cast<ConstantSDNode>(Arg1)) - && C->isNullValue() - && CCOp == ISD::SETNE) { - return SimplifySetCC(VT, Arg0.getOperand(0), - DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL); - } - break; - } + default: + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + case ISD::SETCC: + return performSetCCCombine(N, DCI); case ISD::FMAXNUM: // TODO: What about fmax_legacy? case ISD::FMINNUM: case AMDGPUISD::SMAX: @@ -1442,6 +1632,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (VT != MVT::f32) break; + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. 
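A small numeric illustration of why these FMAD folds are gated on hasFP32Denormals(): v_mad_f32 flushes f32 denormals, so once a subnormal value is involved the folded form no longer matches the original fadd chain. The flush model here is illustrative, not an exact description of the hardware:

#include <cmath>

// Illustrative flush-to-zero model: subnormal values become signed zero.
static float ftz(float X) {
  return std::fpclassify(X) == FP_SUBNORMAL ? std::copysign(0.0f, X) : X;
}

// Folded form of (fadd (fadd a, a), c) -> fmad 2.0, a, c, with flushing.
float foldedMad(float A, float C) { return 2.0f * ftz(A) + ftz(C); }

// The unfolded form keeps denormals: with A = 1e-39f (subnormal) and
// C = 0.0f, unfolded() is about 2e-39f while foldedMad() returns 0.0f.
float unfolded(float A, float C) { return (A + A) + C; }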
+ if (Subtarget->hasFP32Denormals()) + break; + SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -1452,8 +1647,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (LHS.getOpcode() == ISD::FADD) { SDValue A = LHS.getOperand(0); if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS); + const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); } } @@ -1461,12 +1656,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (RHS.getOpcode() == ISD::FADD) { SDValue A = RHS.getOperand(0); if (A == RHS.getOperand(1)) { - const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS); + const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); } } - break; + return SDValue(); } case ISD::FSUB: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) @@ -1476,39 +1671,22 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, // Try to get the fneg to fold into the source modifier. This undoes generic // DAG combines and folds them into the mad. - if (VT == MVT::f32) { + // + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. + if (VT == MVT::f32 && + !Subtarget->hasFP32Denormals()) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - - if (LHS.getOpcode() == ISD::FMUL) { - // (fsub (fmul a, b), c) -> mad a, b, (fneg c) - - SDValue A = LHS.getOperand(0); - SDValue B = LHS.getOperand(1); - SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS); - - return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); - } - - if (RHS.getOpcode() == ISD::FMUL) { - // (fsub c, (fmul a, b)) -> mad (fneg a), b, c - - SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0)); - SDValue B = RHS.getOperand(1); - SDValue C = LHS; - - return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); - } - if (LHS.getOpcode() == ISD::FADD) { // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) SDValue A = LHS.getOperand(0); if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32); + const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); } } @@ -1517,10 +1695,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, SDValue A = RHS.getOperand(0); if (A == RHS.getOperand(1)) { - const SDValue NegTwo = DAG.getTargetConstantFP(-2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS); + const SDValue NegTwo = DAG.getConstantFP(-2.0, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); } } + + return SDValue(); } break; @@ -1554,9 +1734,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); if (NewPtr) { - SmallVector<SDValue, 8> NewOps; - for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I) - NewOps.push_back(MemNode->getOperand(I)); + SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); NewOps[N->getOpcode() == ISD::STORE ? 
2 : 1] = NewPtr; return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); @@ -1564,287 +1742,44 @@ } break; } + case ISD::AND: + return performAndCombine(N, DCI); + case ISD::OR: + return performOrCombine(N, DCI); + case AMDGPUISD::FP_CLASS: + return performClassCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } -/// \brief Test if RegClass is one of the VSrc classes -static bool isVSrc(unsigned RegClass) { - switch(RegClass) { - default: return false; - case AMDGPU::VSrc_32RegClassID: - case AMDGPU::VCSrc_32RegClassID: - case AMDGPU::VSrc_64RegClassID: - case AMDGPU::VCSrc_64RegClassID: - return true; - } -} - -/// \brief Test if RegClass is one of the SSrc classes -static bool isSSrc(unsigned RegClass) { - return AMDGPU::SSrc_32RegClassID == RegClass || - AMDGPU::SSrc_64RegClassID == RegClass; -} - /// \brief Analyze the possible immediate value Op /// /// Returns -1 if it isn't an immediate, 0 if it's an inline immediate /// and the immediate value if it's a literal immediate int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { - union { - int32_t I; - float F; - } Imm; + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { - if (Node->getZExtValue() >> 32) { - return -1; - } - Imm.I = Node->getSExtValue(); - } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { - if (N->getValueType(0) != MVT::f32) - return -1; - Imm.F = Node->getValueAPF().convertToFloat(); - } else - return -1; // It isn't an immediate - - if ((Imm.I >= -16 && Imm.I <= 64) || - Imm.F == 0.5f || Imm.F == -0.5f || - Imm.F == 1.0f || Imm.F == -1.0f || - Imm.F == 2.0f || Imm.F == -2.0f || - Imm.F == 4.0f || Imm.F == -4.0f) - return 0; // It's an inline immediate - - return Imm.I; // It's a literal immediate -} - -/// \brief Try to fold an immediate directly into an instruction -bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, - bool &ScalarSlotUsed) const { - - MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); - if (!Mov || !TII->isMov(Mov->getMachineOpcode())) - return false; - - const SDValue &Op = Mov->getOperand(0); - int32_t Value = analyzeImmediate(Op.getNode()); - if (Value == -1) { - // Not an immediate at all - return false; - - } else if (Value == 0) { - // Inline immediates can always be fold - Operand = Op; - return true; - - } else if (Value == Immediate) { - // Already fold literal immediate - Operand = Op; - return true; - - } else if (!ScalarSlotUsed && !Immediate) { - // Fold this literal immediate - ScalarSlotUsed = true; - Immediate = Value; - Operand = Op; - return true; + if (TII->isInlineConstant(Node->getAPIntValue())) + return 0; + uint64_t Val = Node->getZExtValue(); + return isUInt<32>(Val) ?
Val : -1; } - return false; -} + if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { + if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) + return 0; -const TargetRegisterClass *SITargetLowering::getRegClassForNode( - SelectionDAG &DAG, const SDValue &Op) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - - if (!Op->isMachineOpcode()) { - switch(Op->getOpcode()) { - case ISD::CopyFromReg: { - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { - return MRI.getRegClass(Reg); - } - return TRI.getPhysRegClass(Reg); - } - default: return nullptr; - } - } - const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode()); - int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass; - if (OpClassID != -1) { - return TRI.getRegClass(OpClassID); - } - switch(Op.getMachineOpcode()) { - case AMDGPU::COPY_TO_REGCLASS: - // Operand 1 is the register class id for COPY_TO_REGCLASS instructions. - OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue(); - - // If the COPY_TO_REGCLASS instruction is copying to a VSrc register - // class, then the register class for the value could be either a - // VReg or and SReg. In order to get a more accurate - if (isVSrc(OpClassID)) - return getRegClassForNode(DAG, Op.getOperand(0)); - - return TRI.getRegClass(OpClassID); - case AMDGPU::EXTRACT_SUBREG: { - int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - const TargetRegisterClass *SuperClass = - getRegClassForNode(DAG, Op.getOperand(0)); - return TRI.getSubClassWithSubReg(SuperClass, SubIdx); - } - case AMDGPU::REG_SEQUENCE: - // Operand 0 is the register class id for REG_SEQUENCE instructions. - return TRI.getRegClass( - cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()); - default: - return getRegClassFor(Op.getSimpleValueType()); - } -} + if (Node->getValueType(0) == MVT::f32) + return FloatToBits(Node->getValueAPF().convertToFloat()); -/// \brief Does "Op" fit into register class "RegClass" ? -bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op, - unsigned RegClass) const { - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const TargetRegisterClass *RC = getRegClassForNode(DAG, Op); - if (!RC) { - return false; + return -1; } - return TRI->getRegClass(RegClass)->hasSubClassEq(RC); -} -/// \returns true if \p Node's operands are different from the SDValue list -/// \p Ops -static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) { - for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) { - if (Ops[i].getNode() != Node->getOperand(i).getNode()) { - return true; - } - } - return false; -} - -/// TODO: This needs to be removed. It's current primary purpose is to fold -/// immediates into operands when legal. The legalization parts are redundant -/// with SIInstrInfo::legalizeOperands which is called in a post-isel hook. 
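Stepping back to the surviving analyzeImmediate above: its contract is unchanged, it just delegates the inline test to SIInstrInfo. Restated standalone for the integer case (isInlineInt models the integer half of TII->isInlineConstant; the -16..64 range appears in isInlineConstant later in this patch):

#include <cstdint>

// Integer half of TII->isInlineConstant: the hardware's inline range.
static bool isInlineInt(int64_t V) { return V >= -16 && V <= 64; }

// Same contract as analyzeImmediate: 0 for an inline immediate, the value
// itself for a 32-bit literal, and -1 if it cannot be encoded at all.
int32_t analyzeIntImmediate(int64_t V) {
  if (isInlineInt(V))
    return 0;
  if (static_cast<uint64_t>(V) <= UINT32_MAX) // isUInt<32>(Val)
    return static_cast<int32_t>(V);
  return -1;
}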
-SDNode *SITargetLowering::legalizeOperands(MachineSDNode *Node, - SelectionDAG &DAG) const { - // Original encoding (either e32 or e64) - int Opcode = Node->getMachineOpcode(); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); - const MCInstrDesc *Desc = &TII->get(Opcode); - - unsigned NumDefs = Desc->getNumDefs(); - unsigned NumOps = Desc->getNumOperands(); - - // Commuted opcode if available - int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1; - const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev); - - assert(!DescRev || DescRev->getNumDefs() == NumDefs); - assert(!DescRev || DescRev->getNumOperands() == NumOps); - - int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; - bool HaveVSrc = false, HaveSSrc = false; - - // First figure out what we already have in this instruction. - for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; - i != e && Op < NumOps; ++i, ++Op) { - - unsigned RegClass = Desc->OpInfo[Op].RegClass; - if (isVSrc(RegClass)) - HaveVSrc = true; - else if (isSSrc(RegClass)) - HaveSSrc = true; - else - continue; - - int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode()); - if (Imm != -1 && Imm != 0) { - // Literal immediate - Immediate = Imm; - } - } - - // If we neither have VSrc nor SSrc, it makes no sense to continue. - if (!HaveVSrc && !HaveSSrc) - return Node; - - // No scalar allowed when we have both VSrc and SSrc - bool ScalarSlotUsed = HaveVSrc && HaveSSrc; - - // If this instruction has an implicit use of VCC, then it can't use the - // constant bus. - for (unsigned i = 0, e = Desc->getNumImplicitUses(); i != e; ++i) { - if (Desc->ImplicitUses[i] == AMDGPU::VCC) { - ScalarSlotUsed = true; - break; - } - } - - // Second go over the operands and try to fold them - std::vector<SDValue> Ops; - for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; - i != e && Op < NumOps; ++i, ++Op) { - - const SDValue &Operand = Node->getOperand(i); - Ops.push_back(Operand); - - // Already folded immediate? - if (isa<ConstantSDNode>(Operand.getNode()) || - isa<ConstantFPSDNode>(Operand.getNode())) - continue; - - // Is this a VSrc or SSrc operand? - unsigned RegClass = Desc->OpInfo[Op].RegClass; - if (isVSrc(RegClass) || isSSrc(RegClass)) { - // Try to fold the immediates. If this ends up with multiple constant bus - // uses, it will be legalized later. - foldImm(Ops[i], Immediate, ScalarSlotUsed); - continue; - } - - if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) { - - unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass; - assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass)); - - // Test if it makes sense to swap operands - if (foldImm(Ops[1], Immediate, ScalarSlotUsed) || - (!fitsRegClass(DAG, Ops[1], RegClass) && - fitsRegClass(DAG, Ops[1], OtherRegClass))) { - - // Swap commutable operands - std::swap(Ops[0], Ops[1]); - - Desc = DescRev; - DescRev = nullptr; - continue; - } - } - } - - // Add optional chain and glue - for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i) - Ops.push_back(Node->getOperand(i)); - - // Nodes that have a glue result are not CSE'd by getMachineNode(), so in - // this case a brand new node is always be created, even if the operands - // are the same as before. So, manually check if anything has been changed. 
- if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) { - return Node; - } - - // Create a complete new instruction - return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops); + return -1; } /// \brief Helper function for adjustWritemask @@ -1904,14 +1839,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Adjust the writemask in the node std::vector<SDValue> Ops; Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32)); - for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) - Ops.push_back(Node->getOperand(i)); + Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); // If we only got one lane, replace it with a copy // (if NewDmask has only one bit set...) if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { - SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32); + SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, MVT::i32); SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), Users[Lane]->getValueType(0), SDValue(Node, 0), RC); @@ -1963,9 +1897,8 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, /// \brief Fold the instructions after selecting them. SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); - Node = AdjustRegClass(Node, DAG); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); if (TII->isMIMG(Node->getMachineOpcode())) adjustWritemask(Node, DAG); @@ -1975,17 +1908,17 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, legalizeTargetIndependentNode(Node, DAG); return Node; } - - return legalizeOperands(Node, DAG); + return Node; } /// \brief Assign the register class depending on the number of /// bits set in the writemask void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); TII->legalizeOperands(MI); if (TII->isMIMG(MI->getOpcode())) { @@ -1998,14 +1931,13 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, const TargetRegisterClass *RC; switch (BitsSet) { default: return; - case 1: RC = &AMDGPU::VReg_32RegClass; break; + case 1: RC = &AMDGPU::VGPR_32RegClass; break; case 2: RC = &AMDGPU::VReg_64RegClass; break; case 3: RC = &AMDGPU::VReg_96RegClass; break; } unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); MI->setDesc(TII->get(NewOpcode)); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); MRI.setRegClass(VReg, RC); return; } @@ -2030,6 +1962,8 @@ static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); #if 1 // XXX - Workaround for moveToVALU not handling different register class // inserts for REG_SEQUENCE. 
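Earlier in this hunk, both writemask hooks reduce to population counts on the 4-bit MIMG dmask: exactly one set bit means the result collapses to a single lane (hence the COPY_TO_REGCLASS into VGPR_32), and in general N set bits select an N-dword destination class. A standalone restatement, with strings standing in for the register-class objects:

#include <bitset>
#include <cstdint>
#include <string>

// The single-lane test used in adjustWritemask: nonzero with one bit set.
bool isSingleLane(uint32_t Dmask) {
  return Dmask != 0 && (Dmask & (Dmask - 1)) == 0;
}

// N set bits -> N-dword register class, as in
// AdjustInstrPostInstrSelection (other counts leave the class unchanged).
std::string regClassForDmask(uint32_t Dmask) {
  switch (std::bitset<32>(Dmask).count()) {
  case 1:  return "VGPR_32";
  case 2:  return "VReg_64";
  case 3:  return "VReg_96";
  default: return "unchanged";
  }
}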
@@ -2039,7 +1973,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32), buildSMovImm32(DAG, DL, 0), DAG.getTargetConstant(AMDGPU::sub0, MVT::i32), - buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), DAG.getTargetConstant(AMDGPU::sub1, MVT::i32) }; @@ -2063,7 +1997,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), buildSMovImm32(DAG, DL, 0), DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), - buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) }; @@ -2110,57 +2044,14 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const { - uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | 0xffffffff; // Size return buildRSRC(DAG, DL, Ptr, 0, Rsrc); } -MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N, - SelectionDAG &DAG) const { - - SDLoc DL(N); - unsigned NewOpcode = N->getMachineOpcode(); - - switch (N->getMachineOpcode()) { - default: return N; - case AMDGPU::S_LOAD_DWORD_IMM: - NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64; - // Fall-through - case AMDGPU::S_LOAD_DWORDX2_SGPR: - if (NewOpcode == N->getMachineOpcode()) { - NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; - } - // Fall-through - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: { - if (NewOpcode == N->getMachineOpcode()) { - NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; - } - if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) { - return N; - } - ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1)); - - const SDValue Zero64 = DAG.getTargetConstant(0, MVT::i64); - SDValue Ptr(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Zero64), 0); - MachineSDNode *RSrc = wrapAddr64Rsrc(DAG, DL, Ptr); - - SmallVector<SDValue, 8> Ops; - Ops.push_back(SDValue(RSrc, 0)); - Ops.push_back(N->getOperand(0)); - Ops.push_back(DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)); - - // Copy remaining operands so we keep any chain and glue nodes that follow - // the normal operands.
- for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I) - Ops.push_back(N->getOperand(I)); - - return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops); - } - } -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index 7bf406e..92f5847 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -42,27 +42,22 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - bool foldImm(SDValue &Operand, int32_t &Immediate, - bool &ScalarSlotUsed) const; - const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG, - const SDValue &Op) const; - bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op, - unsigned RegClass) const; - - SDNode *legalizeOperands(MachineSDNode *N, SelectionDAG &DAG) const; void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; - MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const; - static SDValue performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI); + SDValue performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) const; SDValue performSHLPtrCombine(SDNode *N, unsigned AS, DAGCombinerInfo &DCI) const; + SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; public: - SITargetLowering(TargetMachine &tm); + SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, EVT /*VT*/) const override; @@ -94,6 +89,7 @@ public: MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, MachineBasicBlock * BB) const override; + bool enableAggressiveFMAFusion(EVT VT) const override; EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; MVT getScalarShiftAmountTy(EVT VT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 712d97d..50f20ac 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -41,6 +41,12 @@ typedef union { } Counters; +typedef enum { + OTHER, + SMEM, + VMEM +} InstType; + typedef Counters RegCounters[512]; typedef std::pair<unsigned, unsigned> RegInterval; @@ -73,6 +79,11 @@ private: /// \brief Different export instruction types seen since last wait. unsigned ExpInstrTypesSeen; + /// \brief Type of the last opcode. + InstType LastOpcodeType; + + bool LastInstWritesM0; + /// \brief Get increment/decrement amount for this instruction. 
Counters getHwCounts(MachineInstr &MI); @@ -83,7 +94,8 @@ private: RegInterval getRegInterval(MachineOperand &Op); /// \brief Handle an instruction's async components - void pushInstruction(MachineInstr &MI); + void pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I); /// \brief Insert the actual wait instruction bool insertWait(MachineBasicBlock &MBB, @@ -96,6 +108,9 @@ private: /// \brief Resolve all operand dependencies to counter requirements Counters handleOperands(MachineInstr &MI); + /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. + void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + public: SIInsertWaits(TargetMachine &tm) : MachineFunctionPass(ID), @@ -176,6 +191,29 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { if (!MI.getDesc().mayStore()) return false; + // Check if this operand is the value being stored. + // Special case for DS instructions, since the address + // operand comes before the value operand and it may have + // multiple data operands. + + if (TII->isDS(MI.getOpcode())) { + MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); + if (Data && Op.isIdenticalTo(*Data)) + return true; + + MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + if (Data0 && Op.isIdenticalTo(*Data0)) + return true; + + MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); + if (Data1 && Op.isIdenticalTo(*Data1)) + return true; + + return false; + } + + // NOTE: This assumes that the value operand is before the + // address operand, and that there is only one value operand. for (MachineInstr::mop_iterator I = MI.operands_begin(), E = MI.operands_end(); I != E; ++I) { @@ -203,10 +241,11 @@ RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { return Result; } -void SIInsertWaits::pushInstruction(MachineInstr &MI) { +void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { // Get the hardware counter increments and sum them up - Counters Increment = getHwCounts(MI); + Counters Increment = getHwCounts(*I); unsigned Sum = 0; for (unsigned i = 0; i < 3; ++i) { @@ -215,17 +254,43 @@ void SIInsertWaits::pushInstruction(MachineInstr &MI) { } // If we don't increase anything then that's it - if (Sum == 0) + if (Sum == 0) { + LastOpcodeType = OTHER; return; + } + + if (TRI->ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM + // or SMEM clause, respectively. + // + // The temporary workaround is to break the clauses with S_NOP. + // + // The proper solution would be to allocate registers such that all source + // and destination registers don't overlap, e.g. this is illegal: + // r0 = load r2 + // r2 = load r0 + if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || + (LastOpcodeType == VMEM && Increment.Named.VM)) { + // Insert a NOP to break the clause. + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) + .addImm(0); + LastInstWritesM0 = false; + } + + if (TII->isSMRD(I->getOpcode())) + LastOpcodeType = SMEM; + else if (Increment.Named.VM) + LastOpcodeType = VMEM; + } // Remember which export instructions we have seen if (Increment.Named.EXP) { - ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2; + ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ?
1 : 2; } - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); + MachineOperand &Op = I->getOperand(i); if (!isOpRelevant(Op)) continue; @@ -302,6 +367,8 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, ((Counts.Named.EXP & 0x7) << 4) | ((Counts.Named.LGKM & 0x7) << 8)); + LastOpcodeType = OTHER; + LastInstWritesM0 = false; return true; } @@ -343,6 +410,30 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { return Result; } +void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + if (TRI->ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) + return; + + // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. + if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); + LastInstWritesM0 = false; + return; + } + + // Set whether this instruction sets M0 + LastInstWritesM0 = false; + + unsigned NumOperands = I->getNumOperands(); + for (unsigned i = 0; i < NumOperands; i++) { + const MachineOperand &Op = I->getOperand(i); + + if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) + LastInstWritesM0 = true; + } +} + // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" // around other non-memory instructions. bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { @@ -356,6 +447,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { WaitedOn = ZeroCounts; LastIssued = ZeroCounts; + LastOpcodeType = OTHER; + LastInstWritesM0 = false; memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); @@ -367,8 +460,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { - Changes |= insertWait(MBB, I, handleOperands(*I)); - pushInstruction(*I); + // Wait for everything before a barrier. + if (I->getOpcode() == AMDGPU::S_BARRIER) + Changes |= insertWait(MBB, I, LastIssued); + else + Changes |= insertWait(MBB, I, handleOperands(*I)); + + pushInstruction(MBB, I); + handleSendMsg(MBB, I); } // Wait for everything at the end of the MBB diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index 10e0a3f..c90c741 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -17,65 +17,109 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : field bits<1> VM_CNT = 0; field bits<1> EXP_CNT = 0; field bits<1> LGKM_CNT = 0; - field bits<1> MIMG = 0; - field bits<1> SMRD = 0; + + field bits<1> SALU = 0; + field bits<1> VALU = 0; + + field bits<1> SOP1 = 0; + field bits<1> SOP2 = 0; + field bits<1> SOPC = 0; + field bits<1> SOPK = 0; + field bits<1> SOPP = 0; + field bits<1> VOP1 = 0; field bits<1> VOP2 = 0; field bits<1> VOP3 = 0; field bits<1> VOPC = 0; - field bits<1> SALU = 0; + field bits<1> MUBUF = 0; field bits<1> MTBUF = 0; + field bits<1> SMRD = 0; + field bits<1> DS = 0; + field bits<1> MIMG = 0; field bits<1> FLAT = 0; + field bits<1> WQM = 0; // These need to be kept in sync with the enum in SIInstrFlags. 
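For reference, a sketch of the C++ side these bits must stay in sync with, assuming a one-for-one mirror of TSFlags{0} through TSFlags{20} as assigned below (the isDS()/isMIMG()/isVOP*() queries later in this patch test exactly these bits):

#include <cstdint>

// Assumed C++ mirror of the TSFlags layout: one bit per field, same order.
namespace SIInstrFlags {
enum : uint32_t {
  VM_CNT   = 1u << 0,
  EXP_CNT  = 1u << 1,
  LGKM_CNT = 1u << 2,
  SALU     = 1u << 3,
  VALU     = 1u << 4,
  SOP1     = 1u << 5,
  SOP2     = 1u << 6,
  SOPC     = 1u << 7,
  SOPK     = 1u << 8,
  SOPP     = 1u << 9,
  VOP1     = 1u << 10,
  VOP2     = 1u << 11,
  VOP3     = 1u << 12,
  VOPC     = 1u << 13,
  MUBUF    = 1u << 14,
  MTBUF    = 1u << 15,
  SMRD     = 1u << 16,
  DS       = 1u << 17,
  MIMG     = 1u << 18,
  FLAT     = 1u << 19,
  WQM      = 1u << 20
};
} // namespace SIInstrFlags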
let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; let TSFlags{2} = LGKM_CNT; - let TSFlags{3} = MIMG; - let TSFlags{4} = SMRD; - let TSFlags{5} = VOP1; - let TSFlags{6} = VOP2; - let TSFlags{7} = VOP3; - let TSFlags{8} = VOPC; - let TSFlags{9} = SALU; - let TSFlags{10} = MUBUF; - let TSFlags{11} = MTBUF; - let TSFlags{12} = FLAT; + + let TSFlags{3} = SALU; + let TSFlags{4} = VALU; + + let TSFlags{5} = SOP1; + let TSFlags{6} = SOP2; + let TSFlags{7} = SOPC; + let TSFlags{8} = SOPK; + let TSFlags{9} = SOPP; + + let TSFlags{10} = VOP1; + let TSFlags{11} = VOP2; + let TSFlags{12} = VOP3; + let TSFlags{13} = VOPC; + + let TSFlags{14} = MUBUF; + let TSFlags{15} = MTBUF; + let TSFlags{16} = SMRD; + let TSFlags{17} = DS; + let TSFlags{18} = MIMG; + let TSFlags{19} = FLAT; + let TSFlags{20} = WQM; // Most instructions require adjustments after selection to satisfy // operand requirements. let hasPostISelHook = 1; + let SchedRW = [Write32Bit]; } class Enc32 { - field bits<32> Inst; int Size = 4; } class Enc64 { - field bits<64> Inst; int Size = 8; } -class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : +let Uses = [EXEC] in { + +class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : InstSI <outs, ins, asm, pattern> { + let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let UseNamedOperandTable = 1; + let VALU = 1; +} + +class VOPCCommon <dag ins, string asm, list<dag> pattern> : + VOPAnyCommon <(outs VCCReg:$dst), ins, asm, pattern> { + + let DisableEncoding = "$dst"; + let VOPC = 1; + let Size = 4; +} + +class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : + VOPAnyCommon <outs, ins, asm, pattern> { + let VOP1 = 1; + let Size = 4; +} + +class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> : + VOPAnyCommon <outs, ins, asm, pattern> { + + let VOP2 = 1; + let Size = 4; } class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> { + VOPAnyCommon <outs, ins, asm, pattern> { - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; // Using complex patterns gives VOP3 patterns a very high complexity rating, // but standalone patterns are almost always preferred, so we need to adjust the // priority lower.
The goal is to use a high number to reduce complexity to @@ -83,63 +127,58 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : let AddedComplexity = -1000; let VOP3 = 1; - int Size = 8; - let Uses = [EXEC]; } +} // End Uses = [EXEC] + //===----------------------------------------------------------------------===// // Scalar operations //===----------------------------------------------------------------------===// class SOP1e <bits<8> op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; - bits<7> SDST; - bits<8> SSRC0; - - let Inst{7-0} = SSRC0; + let Inst{7-0} = ssrc0; let Inst{15-8} = op; - let Inst{22-16} = SDST; + let Inst{22-16} = sdst; let Inst{31-23} = 0x17d; //encoding; } class SOP2e <bits<7> op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; + bits<8> ssrc1; - bits<7> SDST; - bits<8> SSRC0; - bits<8> SSRC1; - - let Inst{7-0} = SSRC0; - let Inst{15-8} = SSRC1; - let Inst{22-16} = SDST; + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; + let Inst{22-16} = sdst; let Inst{29-23} = op; let Inst{31-30} = 0x2; // encoding } class SOPCe <bits<7> op> : Enc32 { + bits<8> ssrc0; + bits<8> ssrc1; - bits<8> SSRC0; - bits<8> SSRC1; - - let Inst{7-0} = SSRC0; - let Inst{15-8} = SSRC1; + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; let Inst{22-16} = op; let Inst{31-23} = 0x17e; } class SOPKe <bits<5> op> : Enc32 { + bits <7> sdst; + bits <16> simm16; - bits <7> SDST; - bits <16> SIMM16; - - let Inst{15-0} = SIMM16; - let Inst{22-16} = SDST; + let Inst{15-0} = simm16; + let Inst{22-16} = sdst; let Inst{27-23} = op; let Inst{31-28} = 0xb; //encoding } class SOPPe <bits<7> op> : Enc32 { - bits <16> simm16; let Inst{15-0} = simm16; @@ -148,35 +187,36 @@ class SOPPe <bits<7> op> : Enc32 { } class SMRDe <bits<5> op, bits<1> imm> : Enc32 { + bits<7> sdst; + bits<7> sbase; + bits<8> offset; - bits<7> SDST; - bits<7> SBASE; - bits<8> OFFSET; - - let Inst{7-0} = OFFSET; + let Inst{7-0} = offset; let Inst{8} = imm; - let Inst{14-9} = SBASE{6-1}; - let Inst{21-15} = SDST; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; let Inst{26-22} = op; let Inst{31-27} = 0x18; //encoding } -class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI<outs, ins, asm, pattern>, SOP1e <op> { - +let SchedRW = [WriteSALU] in { +class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let SALU = 1; + let SOP1 = 1; } -class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern>, SOP2e<op> { +class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let SALU = 1; + let SOP2 = 1; let UseNamedOperandTable = 1; } @@ -189,17 +229,19 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let mayStore = 0; let hasSideEffects = 0; let SALU = 1; + let SOPC = 1; let UseNamedOperandTable = 1; } -class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins , asm, pattern>, SOPKe<op> { +class SOPK <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins , asm, pattern> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let SALU = 1; + let SOPK = 1; let UseNamedOperandTable = 1; } @@ -210,12 +252,14 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let isCodeGenOnly = 0; let 
SALU = 1; + let SOPP = 1; let UseNamedOperandTable = 1; } +} // let SchedRW = [WriteSALU] + class SMRD <dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern> { @@ -225,6 +269,7 @@ class SMRD <dag outs, dag ins, string asm, list<dag> pattern> : let mayLoad = 1; let hasSideEffects = 0; let UseNamedOperandTable = 1; + let SchedRW = [WriteSMEM]; } //===----------------------------------------------------------------------===// @@ -232,32 +277,44 @@ class SMRD <dag outs, dag ins, string asm, list<dag> pattern> : //===----------------------------------------------------------------------===// class VOP1e <bits<8> op> : Enc32 { + bits<8> vdst; + bits<9> src0; - bits<8> VDST; - bits<9> SRC0; - - let Inst{8-0} = SRC0; + let Inst{8-0} = src0; let Inst{16-9} = op; - let Inst{24-17} = VDST; + let Inst{24-17} = vdst; let Inst{31-25} = 0x3f; //encoding } class VOP2e <bits<6> op> : Enc32 { + bits<8> vdst; + bits<9> src0; + bits<8> src1; - bits<8> VDST; - bits<9> SRC0; - bits<8> VSRC1; - - let Inst{8-0} = SRC0; - let Inst{16-9} = VSRC1; - let Inst{24-17} = VDST; + let Inst{8-0} = src0; + let Inst{16-9} = src1; + let Inst{24-17} = vdst; let Inst{30-25} = op; let Inst{31} = 0x0; //encoding } -class VOP3e <bits<9> op> : Enc64 { +class VOP2_MADKe <bits<6> op> : Enc64 { + + bits<8> vdst; + bits<9> src0; + bits<8> vsrc1; + bits<32> src2; - bits<8> dst; + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding + let Inst{63-32} = src2; +} + +class VOP3e <bits<9> op> : Enc64 { + bits<8> vdst; bits<2> src0_modifiers; bits<9> src0; bits<2> src1_modifiers; @@ -267,7 +324,7 @@ class VOP3e <bits<9> op> : Enc64 { bits<1> clamp; bits<2> omod; - let Inst{7-0} = dst; + let Inst{7-0} = vdst; let Inst{8} = src0_modifiers{1}; let Inst{9} = src1_modifiers{1}; let Inst{10} = src2_modifiers{1}; @@ -284,8 +341,7 @@ class VOP3e <bits<9> op> : Enc64 { } class VOP3be <bits<9> op> : Enc64 { - - bits<8> dst; + bits<8> vdst; bits<2> src0_modifiers; bits<9> src0; bits<2> src1_modifiers; @@ -295,7 +351,7 @@ class VOP3be <bits<9> op> : Enc64 { bits<7> sdst; bits<2> omod; - let Inst{7-0} = dst; + let Inst{7-0} = vdst; let Inst{14-8} = sdst; let Inst{25-17} = op; let Inst{31-26} = 0x34; //encoding @@ -309,33 +365,30 @@ class VOP3be <bits<9> op> : Enc64 { } class VOPCe <bits<8> op> : Enc32 { + bits<9> src0; + bits<8> vsrc1; - bits<9> SRC0; - bits<8> VSRC1; - - let Inst{8-0} = SRC0; - let Inst{16-9} = VSRC1; + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; let Inst{24-17} = op; let Inst{31-25} = 0x3e; } class VINTRPe <bits<2> op> : Enc32 { + bits<8> vdst; + bits<8> vsrc; + bits<2> attrchan; + bits<6> attr; - bits<8> VDST; - bits<8> VSRC; - bits<2> ATTRCHAN; - bits<6> ATTR; - - let Inst{7-0} = VSRC; - let Inst{9-8} = ATTRCHAN; - let Inst{15-10} = ATTR; + let Inst{7-0} = vsrc; + let Inst{9-8} = attrchan; + let Inst{15-10} = attr; let Inst{17-16} = op; - let Inst{25-18} = VDST; + let Inst{25-18} = vdst; let Inst{31-26} = 0x32; // encoding } class DSe <bits<8> op> : Enc64 { - bits<8> vdst; bits<1> gds; bits<8> addr; @@ -356,7 +409,6 @@ class DSe <bits<8> op> : Enc64 { } class MUBUFe <bits<7> op> : Enc64 { - bits<12> offset; bits<1> offen; bits<1> idxen; @@ -387,67 +439,65 @@ class MUBUFe <bits<7> op> : Enc64 { } class MTBUFe <bits<3> op> : Enc64 { + bits<8> vdata; + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<7> srsrc; + bits<1> slc; + bits<1> 
tfe; + bits<8> soffset; - bits<8> VDATA; - bits<12> OFFSET; - bits<1> OFFEN; - bits<1> IDXEN; - bits<1> GLC; - bits<1> ADDR64; - bits<4> DFMT; - bits<3> NFMT; - bits<8> VADDR; - bits<7> SRSRC; - bits<1> SLC; - bits<1> TFE; - bits<8> SOFFSET; - - let Inst{11-0} = OFFSET; - let Inst{12} = OFFEN; - let Inst{13} = IDXEN; - let Inst{14} = GLC; - let Inst{15} = ADDR64; + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; let Inst{18-16} = op; - let Inst{22-19} = DFMT; - let Inst{25-23} = NFMT; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = VADDR; - let Inst{47-40} = VDATA; - let Inst{52-48} = SRSRC{6-2}; - let Inst{54} = SLC; - let Inst{55} = TFE; - let Inst{63-56} = SOFFSET; + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; } class MIMGe <bits<7> op> : Enc64 { - - bits<8> VDATA; - bits<4> DMASK; - bits<1> UNORM; - bits<1> GLC; - bits<1> DA; - bits<1> R128; - bits<1> TFE; - bits<1> LWE; - bits<1> SLC; - bits<8> VADDR; - bits<7> SRSRC; - bits<7> SSAMP; - - let Inst{11-8} = DMASK; - let Inst{12} = UNORM; - let Inst{13} = GLC; - let Inst{14} = DA; - let Inst{15} = R128; - let Inst{16} = TFE; - let Inst{17} = LWE; + bits<8> vdata; + bits<4> dmask; + bits<1> unorm; + bits<1> glc; + bits<1> da; + bits<1> r128; + bits<1> tfe; + bits<1> lwe; + bits<1> slc; + bits<8> vaddr; + bits<7> srsrc; + bits<7> ssamp; + + let Inst{11-8} = dmask; + let Inst{12} = unorm; + let Inst{13} = glc; + let Inst{14} = da; + let Inst{15} = r128; + let Inst{16} = tfe; + let Inst{17} = lwe; let Inst{24-18} = op; - let Inst{25} = SLC; + let Inst{25} = slc; let Inst{31-26} = 0x3c; - let Inst{39-32} = VADDR; - let Inst{47-40} = VDATA; - let Inst{52-48} = SRSRC{6-2}; - let Inst{57-53} = SSAMP{6-2}; + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{57-53} = ssamp{6-2}; } class FLATe<bits<7> op> : Enc64 { @@ -471,26 +521,26 @@ class FLATe<bits<7> op> : Enc64 { } class EXPe : Enc64 { - bits<4> EN; - bits<6> TGT; - bits<1> COMPR; - bits<1> DONE; - bits<1> VM; - bits<8> VSRC0; - bits<8> VSRC1; - bits<8> VSRC2; - bits<8> VSRC3; - - let Inst{3-0} = EN; - let Inst{9-4} = TGT; - let Inst{10} = COMPR; - let Inst{11} = DONE; - let Inst{12} = VM; + bits<4> en; + bits<6> tgt; + bits<1> compr; + bits<1> done; + bits<1> vm; + bits<8> vsrc0; + bits<8> vsrc1; + bits<8> vsrc2; + bits<8> vsrc3; + + let Inst{3-0} = en; + let Inst{9-4} = tgt; + let Inst{10} = compr; + let Inst{11} = done; + let Inst{12} = vm; let Inst{31-26} = 0x3e; - let Inst{39-32} = VSRC0; - let Inst{47-40} = VSRC1; - let Inst{55-48} = VSRC2; - let Inst{63-56} = VSRC3; + let Inst{39-32} = vsrc0; + let Inst{47-40} = vsrc1; + let Inst{55-48} = vsrc2; + let Inst{63-56} = vsrc3; } let Uses = [EXEC] in { @@ -500,34 +550,13 @@ class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : VOP1e<op>; class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern>, VOP2e<op> { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOP2 = 1; -} - -class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : - VOP3Common <outs, ins, asm, pattern>, VOP3e<op>; - -class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : - VOP3Common <outs, ins, asm, pattern>, VOP3be<op>; + 
VOP2Common <outs, ins, asm, pattern>, VOP2e<op>; class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : - InstSI <(outs VCCReg:$dst), ins, asm, pattern>, VOPCe <op> { - - let DisableEncoding = "$dst"; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOPC = 1; -} + VOPCCommon <ins, asm, pattern>, VOPCe <op>; -class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern>, VINTRPe<op> { +class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -541,15 +570,18 @@ class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> : let Uses = [EXEC] in { -class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> , DSe<op> { +class DS <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { let LGKM_CNT = 1; + let DS = 1; let UseNamedOperandTable = 1; + let DisableEncoding = "$m0"; + let SchedRW = [WriteLDS]; } -class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI<outs, ins, asm, pattern>, MUBUFe <op> { +class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { let VM_CNT = 1; let EXP_CNT = 1; @@ -557,6 +589,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let hasSideEffects = 0; let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; } class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : @@ -566,8 +599,9 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : let EXP_CNT = 1; let MTBUF = 1; - let neverHasSideEffects = 1; + let hasSideEffects = 0; let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; } class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : @@ -596,5 +630,4 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : } - } // End Uses = [EXEC] diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 8343362..4f1e5ad 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -28,8 +28,7 @@ using namespace llvm; SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), - RI(st) { } + : AMDGPUInstrInfo(st), RI(st) {} //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -326,26 +325,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Opcode; const int16_t *SubIndices; - if (AMDGPU::M0 == DestReg) { - // Check if M0 isn't already set to this value - for (MachineBasicBlock::reverse_iterator E = MBB.rend(), - I = MachineBasicBlock::reverse_iterator(MI); I != E; ++I) { - - if (!I->definesRegister(AMDGPU::M0)) - continue; - - unsigned Opc = I->getOpcode(); - if (Opc != TargetOpcode::COPY && Opc != AMDGPU::S_MOV_B32) - break; - - if (!I->readsRegister(SrcReg)) - break; - - // The copy isn't necessary - return; - } - } - if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) @@ -353,6 +332,21 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { + if (DestReg == AMDGPU::VCC) { + if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) + 
.addReg(SrcReg, getKillRegState(KillSrc)); + } else { + // FIXME: Hack until VReg_1 removed. + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) + .addImm(0) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + + return; + } + assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -373,8 +367,8 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opcode = AMDGPU::S_MOV_B32; SubIndices = Sub0_15; - } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_32RegClass.contains(SrcReg) || + } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || AMDGPU::SReg_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -428,27 +422,30 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const { int NewOpc; // Try to map original to commuted opcode - if ((NewOpc = AMDGPU::getCommuteRev(Opcode)) != -1) + NewOpc = AMDGPU::getCommuteRev(Opcode); + // Check if the commuted (REV) opcode exists on the target. + if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) return NewOpc; // Try to map commuted to original opcode - if ((NewOpc = AMDGPU::getCommuteOrig(Opcode)) != -1) + NewOpc = AMDGPU::getCommuteOrig(Opcode); + // Check if the original (non-REV) opcode exists on the target. + if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) return NewOpc; return Opcode; } -static bool shouldTryToSpillVGPRs(MachineFunction *MF) { - - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const TargetMachine &TM = MF->getTarget(); - - // FIXME: Even though it can cause problems, we need to enable - // spilling at -O0, since the fast register allocator always - // spills registers that are live at the end of blocks. - return MFI->getShaderType() == ShaderType::COMPUTE && - TM.getOptLevel() == CodeGenOpt::None; +unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { + if (DstRC->getSize() == 4) { + return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; + } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { + return AMDGPU::S_MOV_B64; + } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { + return AMDGPU::V_MOV_B64_PSEUDO; + } + return AMDGPU::COPY; } void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, @@ -458,6 +455,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); int Opcode = -1; @@ -473,7 +471,9 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; } - } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) { + } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { + MFI->setHasSpilledVGPRs(); + switch(RC->getSize() * 8) { case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; @@ -488,12 +488,16 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, FrameInfo->setObjectAlignment(FrameIndex, 4); BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg) - .addFrameIndex(FrameIndex); + .addFrameIndex(FrameIndex) + // Place-holder registers, these will be filled in by + // SIPrepareScratchRegs. + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); } else { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" " spill register"); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0) + BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) .addReg(SrcReg); } } @@ -504,6 +508,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); int Opcode = -1; @@ -516,7 +521,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; } - } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) { + } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { switch(RC->getSize() * 8) { case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; @@ -530,13 +535,17 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (Opcode != -1) { FrameInfo->setObjectAlignment(FrameIndex, 4); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex); + .addFrameIndex(FrameIndex) + // Place-holder registers, these will be filled in by + // SIPrepareScratchRegs. 
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } else { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" " restore register"); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) - .addReg(AMDGPU::VGPR0); + BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); } } @@ -548,7 +557,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, unsigned Size) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const AMDGPUSubtarget &ST = MF->getTarget().getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(ST.getRegisterInfo()); DebugLoc DL = MBB.findDebugLoc(MI); @@ -561,7 +570,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, MachineBasicBlock::iterator Insert = Entry.front(); DebugLoc DL = Insert->getDebugLoc(); - TIDReg = RI.findUnusedVGPR(MF->getRegInfo()); + TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); if (TIDReg == AMDGPU::NoRegister) return TIDReg; @@ -616,7 +625,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, .addImm(-1) .addImm(0); - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32), + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), TIDReg) .addImm(-1) .addReg(TIDReg); @@ -682,12 +691,42 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { // This is just a placeholder for register allocation. MI->eraseFromParent(); break; + + case AMDGPU::V_MOV_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + + const MachineOperand &SrcOp = MI->getOperand(1); + // FIXME: Will this work for 64-bit floating point immediates? + assert(!SrcOp.isFPImm()); + if (SrcOp.isImm()) { + APInt Imm(64, SrcOp.getImm()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addImm(Imm.getLoBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addImm(Imm.getHiBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + } else { + assert(SrcOp.isReg()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) + .addReg(Dst, RegState::Implicit); + } + MI->eraseFromParent(); + break; + } } return true; } MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { + if (MI->getNumOperands() < 3) return nullptr; @@ -709,12 +748,13 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, // Make sure it's legal to commute operands for VOP2. if (isVOP2(MI->getOpcode()) && (!isOperandLegal(MI, Src0Idx, &Src1) || - !isOperandLegal(MI, Src1Idx, &Src0))) + !isOperandLegal(MI, Src1Idx, &Src0))) { return nullptr; + } if (!Src1.isReg()) { - // Allow commuting instructions with Imm or FPImm operands. - if (NewMI || (!Src1.isImm() && !Src1.isFPImm()) || + // Allow commuting instructions with Imm operands. 
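Two related pieces from just above, restated standalone: getMovOpcode picks a move by destination size and register bank, and V_MOV_B64_PSEUDO exists because there is no single 64-bit VALU move, so expandPostRAPseudo splits it into two 32-bit halves. Strings stand in for the opcode enums:

#include <cstdint>
#include <string>
#include <utility>

// getMovOpcode in scalar form: size and bank select the move.
std::string movOpcodeFor(unsigned SizeBytes, bool IsSGPR) {
  if (SizeBytes == 4)
    return IsSGPR ? "S_MOV_B32" : "V_MOV_B32_e32";
  if (SizeBytes == 8)
    return IsSGPR ? "S_MOV_B64" : "V_MOV_B64_PSEUDO";
  return "COPY"; // wider classes fall back to a generic copy
}

// The immediate split performed when expanding V_MOV_B64_PSEUDO:
// sub0 receives the low 32 bits, sub1 the high 32 bits.
std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
  uint32_t Lo = static_cast<uint32_t>(Imm);       // Imm.getLoBits(32)
  uint32_t Hi = static_cast<uint32_t>(Imm >> 32); // Imm.getHiBits(32)
  return {Lo, Hi};
}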
+ if (NewMI || !Src1.isImm() || (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { return nullptr; } @@ -742,8 +782,6 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, unsigned SubReg = Src0.getSubReg(); if (Src1.isImm()) Src0.ChangeToImmediate(Src1.getImm()); - else if (Src1.isFPImm()) - Src0.ChangeToFPImmediate(Src1.getFPImm()); else llvm_unreachable("Should only have immediates"); @@ -821,6 +859,131 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { return RC != &AMDGPU::EXECRegRegClass; } +static void removeModOperands(MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src0_modifiers); + int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src1_modifiers); + int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src2_modifiers); + + MI.RemoveOperand(Src2ModIdx); + MI.RemoveOperand(Src1ModIdx); + MI.RemoveOperand(Src0ModIdx); +} + +bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const { + if (!MRI->hasOneNonDBGUse(Reg)) + return false; + + unsigned Opc = UseMI->getOpcode(); + if (Opc == AMDGPU::V_MAD_F32) { + // Don't fold if we are using source modifiers. The new VOP2 instructions + // don't have them. + if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { + return false; + } + + MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); + MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); + MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); + + // Multiplied part is the constant: Use v_madmk_f32 + // We should only expect these to be on src0 due to canonicalizations. + if (Src0->isReg() && Src0->getReg() == Reg) { + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + if (!Src2->isReg() || + (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))) + return false; + + // We need to do some weird looking operand shuffling since the madmk + // operands are out of the normal expected order with the multiplied + // constant as the last operand. + // + // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1 + // src0 -> src2 K + // src1 -> src0 + // src2 -> src1 + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. + + // Remove these first since they are at the end. + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::clamp)); + + unsigned Src1Reg = Src1->getReg(); + unsigned Src1SubReg = Src1->getSubReg(); + unsigned Src2Reg = Src2->getReg(); + unsigned Src2SubReg = Src2->getSubReg(); + Src0->setReg(Src1Reg); + Src0->setSubReg(Src1SubReg); + Src1->setReg(Src2Reg); + Src1->setSubReg(Src2SubReg); + + Src2->ChangeToImmediate(Imm); + + removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + + // Added part is the constant: Use v_madak_f32 + if (Src2->isReg() && Src2->getReg() == Reg) { + // Not allowed to use constant bus for another operand. 
+ // We can however allow an inline immediate as src0. + if (!Src0->isImm() && + (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) + return false; + + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. + + // Remove these first since they are at the end. + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::clamp)); + + Src2->ChangeToImmediate(Imm); + + // These come before src2. + removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + } + + return false; +} + bool SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, AliasAnalysis *AA) const { @@ -915,63 +1078,24 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, return false; } -namespace llvm { -namespace AMDGPU { -// Helper function generated by tablegen. We are wrapping this with -// an SIInstrInfo function that returns bool rather than int. -int isDS(uint16_t Opcode); -} -} - -bool SIInstrInfo::isDS(uint16_t Opcode) const { - return ::AMDGPU::isDS(Opcode) != -1; -} - -bool SIInstrInfo::isMIMG(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::MIMG; -} - -bool SIInstrInfo::isSMRD(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SMRD; -} - -bool SIInstrInfo::isMUBUF(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::MUBUF; -} - -bool SIInstrInfo::isMTBUF(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::MTBUF; -} - -bool SIInstrInfo::isFLAT(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::FLAT; -} - -bool SIInstrInfo::isVOP1(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP1; -} - -bool SIInstrInfo::isVOP2(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP2; -} - -bool SIInstrInfo::isVOP3(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP3; -} - -bool SIInstrInfo::isVOPC(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOPC; -} - -bool SIInstrInfo::isSALUInstr(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & SIInstrFlags::SALU; -} - bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { - int32_t Val = Imm.getSExtValue(); - if (Val >= -16 && Val <= 64) + int64_t SVal = Imm.getSExtValue(); + if (SVal >= -16 && SVal <= 64) return true; + if (Imm.getBitWidth() == 64) { + uint64_t Val = Imm.getZExtValue(); + return (DoubleToBits(0.0) == Val) || + (DoubleToBits(1.0) == Val) || + (DoubleToBits(-1.0) == Val) || + (DoubleToBits(0.5) == Val) || + (DoubleToBits(-0.5) == Val) || + (DoubleToBits(2.0) == Val) || + (DoubleToBits(-2.0) == Val) || + (DoubleToBits(4.0) == Val) || + (DoubleToBits(-4.0) == Val); + } + // The actual type of the operand does not seem to matter as long // as the bits match one of the inline immediate values. For example: // @@ -980,32 +1104,38 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { // // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in // floating-point, so it is a legal inline immediate. 
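// By contrast, a pattern such as 0x3f800001 matches none of the inline // values and would have to be encoded as a 32-bit literal instead.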
- - return (APInt::floatToBits(0.0f) == Imm) || - (APInt::floatToBits(1.0f) == Imm) || - (APInt::floatToBits(-1.0f) == Imm) || - (APInt::floatToBits(0.5f) == Imm) || - (APInt::floatToBits(-0.5f) == Imm) || - (APInt::floatToBits(2.0f) == Imm) || - (APInt::floatToBits(-2.0f) == Imm) || - (APInt::floatToBits(4.0f) == Imm) || - (APInt::floatToBits(-4.0f) == Imm); -} - -bool SIInstrInfo::isInlineConstant(const MachineOperand &MO) const { - if (MO.isImm()) - return isInlineConstant(APInt(32, MO.getImm(), true)); - - if (MO.isFPImm()) { - APFloat FpImm = MO.getFPImm()->getValueAPF(); - return isInlineConstant(FpImm.bitcastToAPInt()); + uint32_t Val = Imm.getZExtValue(); + + return (FloatToBits(0.0f) == Val) || + (FloatToBits(1.0f) == Val) || + (FloatToBits(-1.0f) == Val) || + (FloatToBits(0.5f) == Val) || + (FloatToBits(-0.5f) == Val) || + (FloatToBits(2.0f) == Val) || + (FloatToBits(-2.0f) == Val) || + (FloatToBits(4.0f) == Val) || + (FloatToBits(-4.0f) == Val); +} + +bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, + unsigned OpSize) const { + if (MO.isImm()) { + // MachineOperand provides no way to tell the true operand size, since it + // only records a 64-bit value. We need to know the size to determine if a + // 32-bit floating point immediate bit pattern is legal for an integer + // immediate. It would be for any 32-bit integer operand, but would not be + // for a 64-bit one. + + unsigned BitSize = 8 * OpSize; + return isInlineConstant(APInt(BitSize, MO.getImm(), true)); } return false; } -bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const { - return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO); +bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, + unsigned OpSize) const { + return MO.isImm() && !isInlineConstant(MO, OpSize); } static bool compareMachineOp(const MachineOperand &Op0, @@ -1018,8 +1148,6 @@ static bool compareMachineOp(const MachineOperand &Op0, return Op0.getReg() == Op1.getReg(); case MachineOperand::MO_Immediate: return Op0.getImm() == Op1.getImm(); - case MachineOperand::MO_FPImmediate: - return Op0.getFPImm() == Op1.getFPImm(); default: llvm_unreachable("Didn't expect to be comparing these operand types"); } @@ -1029,7 +1157,7 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO) const { const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; - assert(MO.isImm() || MO.isFPImm() || MO.isTargetIndex() || MO.isFI()); + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) return true; @@ -1037,21 +1165,26 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, if (OpInfo.RegClass < 0) return false; - if (isLiteralConstant(MO)) - return RI.regClassCanUseLiteralConstant(OpInfo.RegClass); + unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); + if (isLiteralConstant(MO, OpSize)) + return RI.opCanUseLiteralConstant(OpInfo.OperandType); - return RI.regClassCanUseInlineConstant(OpInfo.RegClass); + return RI.opCanUseInlineConstant(OpInfo.OperandType); } -bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) { +bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) const { switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: { // MUBUF instructions have a 12-bit offset in bytes. return isUInt<12>(OffsetSize); } case AMDGPUAS::CONSTANT_ADDRESS: { - // SMRD instructions have an 8-bit offset in dwords.
- return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); + // SMRD instructions have an 8-bit offset in dwords on SI and + // a 20-bit offset in bytes on VI. + if (RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return isUInt<20>(OffsetSize); + else + return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); } case AMDGPUAS::LOCAL_ADDRESS: case AMDGPUAS::REGION_ADDRESS: { @@ -1066,7 +1199,11 @@ bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) { } bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { - return AMDGPU::getVOPe32(Opcode) != -1; + int Op32 = AMDGPU::getVOPe32(Opcode); + if (Op32 == -1) + return false; + + return pseudoToMCOpcode(Op32) != -1; } bool SIInstrInfo::hasModifiers(unsigned Opcode) const { @@ -1084,9 +1221,10 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, } bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, - const MachineOperand &MO) const { + const MachineOperand &MO, + unsigned OpSize) const { // Literal constants use the constant bus. - if (isLiteralConstant(MO)) + if (isLiteralConstant(MO, OpSize)) return true; if (!MO.isReg() || !MO.isUse()) @@ -1132,21 +1270,35 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Make sure the register classes are correct for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).isFPImm()) { + ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " + "all fp values to integers."; + return false; + } + + int RegClass = Desc.OpInfo[i].RegClass; + switch (Desc.OpInfo[i].OperandType) { - case MCOI::OPERAND_REGISTER: { - if ((MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm()) && - !isImmOperandLegal(MI, i, MI->getOperand(i))) { - ErrInfo = "Illegal immediate value for operand."; - return false; - } + case MCOI::OPERAND_REGISTER: + if (MI->getOperand(i).isImm()) { + ErrInfo = "Illegal immediate value for operand."; + return false; + } + break; + case AMDGPU::OPERAND_REG_IMM32: + break; + case AMDGPU::OPERAND_REG_INLINE_C: + if (isLiteralConstant(MI->getOperand(i), + RI.getRegClass(RegClass)->getSize())) { + ErrInfo = "Illegal immediate value for operand."; + return false; } break; case MCOI::OPERAND_IMMEDIATE: // Check if this operand is an immediate. // FrameIndex operands will be replaced by immediates, so they are // allowed. - if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() && - !MI->getOperand(i).isFI()) { + if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { ErrInfo = "Expected immediate, but got non-immediate"; return false; } @@ -1158,7 +1310,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, if (!MI->getOperand(i).isReg()) continue; - int RegClass = Desc.OpInfo[i].RegClass; if (RegClass != -1) { unsigned Reg = MI->getOperand(i).getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) @@ -1175,11 +1326,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify VOP* if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { + // Only look at the true operands. Only a real operand can use the constant + // bus, and we don't want to check pseudo-operands like the source modifier + // flags. 
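+ // For example, in v_add_f32_e64 dst, -src0, src1 the negation lives in + // src0_modifiers; only a real source such as an SGPR or a literal can + // occupy the constant bus.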
+ const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + unsigned ConstantBusCount = 0; unsigned SGPRUsed = AMDGPU::NoRegister; - for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (usesConstantBus(MRI, MO)) { + for (int OpIdx : OpIndices) { + if (OpIdx == -1) + break; + const MachineOperand &MO = MI->getOperand(OpIdx); + if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { if (MO.isReg()) { if (MO.getReg() != SGPRUsed) ++ConstantBusCount; @@ -1195,31 +1353,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } } - // Verify SRC1 for VOP2 and VOPC - if (Src1Idx != -1 && (isVOP2(Opcode) || isVOPC(Opcode))) { - const MachineOperand &Src1 = MI->getOperand(Src1Idx); - if (Src1.isImm() || Src1.isFPImm()) { - ErrInfo = "VOP[2C] src1 cannot be an immediate."; - return false; - } - } - - // Verify VOP3 - if (isVOP3(Opcode)) { - if (Src0Idx != -1 && isLiteralConstant(MI->getOperand(Src0Idx))) { - ErrInfo = "VOP3 src0 cannot be a literal constant."; - return false; - } - if (Src1Idx != -1 && isLiteralConstant(MI->getOperand(Src1Idx))) { - ErrInfo = "VOP3 src1 cannot be a literal constant."; - return false; - } - if (Src2Idx != -1 && isLiteralConstant(MI->getOperand(Src2Idx))) { - ErrInfo = "VOP3 src2 cannot be a literal constant."; - return false; - } - } - // Verify misc. restrictions on specific instructions. if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { @@ -1287,7 +1420,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; - case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32; + case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; } @@ -1302,8 +1435,13 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); const MCInstrDesc &Desc = get(MI.getOpcode()); if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || - Desc.OpInfo[OpNo].RegClass == -1) - return MRI.getRegClass(MI.getOperand(OpNo).getReg()); + Desc.OpInfo[OpNo].RegClass == -1) { + unsigned Reg = MI.getOperand(OpNo).getReg(); + + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI.getRegClass(Reg); + return RI.getPhysRegClass(Reg); + } unsigned RCID = Desc.OpInfo[OpNo].RegClass; return RI.getRegClass(RCID); @@ -1339,7 +1477,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) VRC = &AMDGPU::VReg_64RegClass; else - VRC = &AMDGPU::VReg_32RegClass; + VRC = &AMDGPU::VGPR_32RegClass; unsigned Reg = MRI.createVirtualRegister(VRC); DebugLoc DL = MBB->findDebugLoc(I); @@ -1428,6 +1566,14 @@ unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, return Dst; } +// Change the order of operands from (0, 1, 2) to (0, 2, 1) +void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { + assert(Inst->getNumExplicitOperands() == 3); + MachineOperand Op1 = Inst->getOperand(1); + Inst->RemoveOperand(1); + Inst->addOperand(Op1); +} + bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineRegisterInfo &MRI 
= MI->getParent()->getParent()->getRegInfo(); @@ -1438,14 +1584,16 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (!MO) MO = &MI->getOperand(OpIdx); - if (usesConstantBus(MRI, *MO)) { + if (isVALU(InstDesc.Opcode) && + usesConstantBus(MRI, *MO, DefinedRC->getSize())) { unsigned SGPRUsed = MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { if (i == OpIdx) continue; - if (usesConstantBus(MRI, MI->getOperand(i)) && - MI->getOperand(i).isReg() && MI->getOperand(i).getReg() != SGPRUsed) { + const MachineOperand &Op = MI->getOperand(i); + if (Op.isReg() && Op.getReg() != SGPRUsed && + usesConstantBus(MRI, Op, getOpSize(*MI, i))) { return false; } } @@ -1463,12 +1611,13 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, // // s_sendmsg 0, s0 ; Operand defined as m0reg // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; } // Handle non-register types that are treated like immediates. - assert(MO->isImm() || MO->isFPImm() || MO->isTargetIndex() || MO->isFI()); + assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); if (!DefinedRC) { // This operand expects an immediate. @@ -1537,7 +1686,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // We can use one SGPR in each VOP3 instruction. continue; } - } else if (!isLiteralConstant(MO)) { + } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { // If it is not a register and not a literal constant, then it must be // an inline constant which is always legal. continue; @@ -1641,17 +1790,18 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // SRsrcPtrLo = srsrc:sub0 unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass); + &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass); // SRsrcPtrHi = srsrc:sub1 unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass); + &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass); // Create an empty resource descriptor unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); // Zero64 = 0 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), @@ -1661,12 +1811,12 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo) - .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); + .addImm(RsrcDataFormat & 0xFFFFFFFF); // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi) - .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); + .addImm(RsrcDataFormat >> 32); // NewSRsrc = {Zero64, SRsrcFormat} BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), @@ -1685,8 +1835,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { if (VAddr) { // This is already an ADDR64 instruction so we need to add the pointer // extracted from the resource descriptor to the current value of VAddr. 
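// In other words NewVAddr = {SRsrcPtrHi, SRsrcPtrLo} + VAddr, computed as // two 32-bit adds, with the carry of the low half feeding the high half.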
- NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); - NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // NewVAddrLo = SRsrcPtrLo + VAddr:sub0 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), @@ -1709,9 +1859,6 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); - assert(SOffset->isImm() && SOffset->getImm() == 0 && "Legalizing MUBUF " - "with non-zero soffset is not implemented"); - (void)SOffset; // Create the new instruction. unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); @@ -1722,6 +1869,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. // This will be replaced later // with the new value of vaddr. + .addOperand(*SOffset) .addOperand(*Offset); MI->removeFromParent(); @@ -1764,27 +1912,30 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI, getNamedOperand(*MI, AMDGPU::OpName::offset); const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); + // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes + // on VI. if (OffOp) { + bool isVI = RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; + unsigned OffScale = isVI ? 1 : 4; // Handle the _IMM variant - unsigned LoOffset = OffOp->getImm(); - unsigned HiOffset = LoOffset + (HalfSize / 4); + unsigned LoOffset = OffOp->getImm() * OffScale; + unsigned HiOffset = LoOffset + HalfSize; Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) .addOperand(*SBase) - .addImm(LoOffset); + .addImm(LoOffset / OffScale); - if (!isUInt<8>(HiOffset)) { + if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) - .addImm(HiOffset << 2); // The immediate offset is in dwords, - // but offset in register is in bytes. + .addImm(HiOffset); // The offset in register is in bytes. Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) .addOperand(*SBase) .addReg(OffsetSGPR); } else { Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) .addOperand(*SBase) - .addImm(HiOffset); + .addImm(HiOffset / OffScale); } } else { // Handle the _SGPR variant @@ -1849,10 +2000,13 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con ImmOffset = 0; } else { assert(MI->getOperand(2).isImm()); - // SMRD instructions take a dword offsets and MUBUF instructions - // take a byte offset. - ImmOffset = MI->getOperand(2).getImm() << 2; + // SMRD instructions take a dword offset on SI and a byte offset on VI, + // and MUBUF instructions always take a byte offset.
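+ // For example, an SI SMRD immediate of 4 (dwords) addresses byte 16, + // hence the <<= 2 below on SI/CI.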
+ ImmOffset = MI->getOperand(2).getImm(); + if (RI.ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) + ImmOffset <<= 2; RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + if (isUInt<12>(ImmOffset)) { BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), RegOffset) @@ -1870,13 +2024,14 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) .addImm(0); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) - .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); + .addImm(RsrcDataFormat & 0xFFFFFFFF); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) - .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); + .addImm(RsrcDataFormat >> 32); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) .addReg(DWord0) .addImm(AMDGPU::sub0) @@ -1893,6 +2048,7 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); } MI->getOperand(1).setReg(SRsrc); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); const TargetRegisterClass *NewDstRC = @@ -2001,6 +2157,43 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { continue; } + case AMDGPU::S_LSHL_B32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHLREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_ASHRREV_I32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHR_B32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHRREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHL_B64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHLREV_B64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_ASHRREV_I64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHR_B64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHRREV_B64; + swapOperands(Inst); + } + break; + case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); @@ -2107,7 +2300,7 @@ unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, } const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::VReg_32RegClass; + return &AMDGPU::VGPR_32RegClass; } void SIInstrInfo::splitScalar64BitUnaryOp( @@ -2237,7 +2430,7 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist MachineOperand &Dest = Inst->getOperand(0); MachineOperand &Src = Inst->getOperand(1); - const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32); + const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); const TargetRegisterClass *SrcRC = Src.isReg() ? 
MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass; @@ -2419,7 +2612,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectWrite( unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister( + unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( getIndirectIndexBegin(*MBB->getParent())); return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) @@ -2437,7 +2630,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead( unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister( + unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( getIndirectIndexBegin(*MBB->getParent())); return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) @@ -2459,7 +2652,7 @@ void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, for (int Index = Begin; Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_32RegClass.getRegister(Index)); + Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); @@ -2485,3 +2678,11 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, return &MI.getOperand(Idx); } + +uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { + uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; + if (ST.isAmdHsaOS()) + RsrcDataFormat |= (1ULL << 56); + + return RsrcDataFormat; +} diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 3bdbc9b..12dc3f3 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -17,6 +17,7 @@ #define LLVM_LIB_TARGET_R600_SIINSTRINFO_H #include "AMDGPUInstrInfo.h" +#include "SIDefines.h" #include "SIRegisterInfo.h" namespace llvm { @@ -44,6 +45,8 @@ private: const TargetRegisterClass *RC, const MachineOperand &Op) const; + void swapOperands(MachineBasicBlock::iterator Inst) const; + void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, unsigned Opcode) const; @@ -107,6 +110,10 @@ public: bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + // \brief Returns an opcode that can be used to move a value to a \p DstRC + // register. If there is no hardware instruction that can store to \p + // DstRC, then AMDGPU::COPY is returned. 
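+ // For example, a 32-bit VGPR class would map to V_MOV_B32_e32 and a + // 32-bit SGPR class to S_MOV_B32 (illustrative mapping).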
+ unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; unsigned commuteOpcode(unsigned Opcode) const; MachineInstr *commuteInstruction(MachineInstr *MI, @@ -128,27 +135,92 @@ public: bool isMov(unsigned Opcode) const override; bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - bool isDS(uint16_t Opcode) const; - bool isMIMG(uint16_t Opcode) const; - bool isSMRD(uint16_t Opcode) const; - bool isMUBUF(uint16_t Opcode) const; - bool isMTBUF(uint16_t Opcode) const; - bool isFLAT(uint16_t Opcode) const; - bool isVOP1(uint16_t Opcode) const; - bool isVOP2(uint16_t Opcode) const; - bool isVOP3(uint16_t Opcode) const; - bool isVOPC(uint16_t Opcode) const; + + bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const final; + + bool isSALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SALU; + } + + bool isVALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VALU; + } + + bool isSOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP1; + } + + bool isSOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP2; + } + + bool isSOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPC; + } + + bool isSOPK(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPK; + } + + bool isSOPP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPP; + } + + bool isVOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP1; + } + + bool isVOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP2; + } + + bool isVOP3(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP3; + } + + bool isVOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOPC; + } + + bool isMUBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MUBUF; + } + + bool isMTBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MTBUF; + } + + bool isSMRD(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SMRD; + } + + bool isDS(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DS; + } + + bool isMIMG(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MIMG; + } + + bool isFLAT(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FLAT; + } + + bool isWQM(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::WQM; + } bool isInlineConstant(const APInt &Imm) const; - bool isInlineConstant(const MachineOperand &MO) const; - bool isLiteralConstant(const MachineOperand &MO) const; + bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; + bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO) const; /// \brief Return true if the given offset Size in bytes can be folded into /// the immediate offsets of a memory instruction for the given address space. - static bool canFoldOffset(unsigned OffsetSize, unsigned AS) LLVM_READNONE; + bool canFoldOffset(unsigned OffsetSize, unsigned AS) const; /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. /// This function will return false if you pass it a 32-bit instruction. @@ -156,7 +228,8 @@ public: /// \brief Returns true if this operand uses the constant bus. 
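/// e.g. SGPRs and 32-bit literal constants are read over the constant bus, /// and a VALU instruction may use at most one such value among its sources.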
bool usesConstantBus(const MachineRegisterInfo &MRI, - const MachineOperand &MO) const; + const MachineOperand &MO, + unsigned OpSize) const; /// \brief Return true if this instruction has any modifiers. /// e.g. src[012]_mod, omod, clamp. @@ -168,7 +241,6 @@ public: bool verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const override; - bool isSALUInstr(const MachineInstr &MI) const; static unsigned getVALUOp(const MachineInstr &MI); bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const; @@ -179,7 +251,27 @@ public: /// the register class of its machine operand. /// to infer the correct register class base on the other operands. const TargetRegisterClass *getOpRegClass(const MachineInstr &MI, - unsigned OpNo) const;\ + unsigned OpNo) const; + + /// \brief Return the size in bytes of the operand OpNo on the given + // instruction opcode. + unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const { + const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo]; + + if (OpInfo.RegClass == -1) { + // If this is an immediate operand, this must be a 32-bit literal. + assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE); + return 4; + } + + return RI.getRegClass(OpInfo.RegClass)->getSize(); + } + + /// \brief This form should usually be preferred since it handles operands + /// with unknown register classes. + unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { + return getOpRegClass(MI, OpNo)->getSize(); + } /// \returns true if it is legal for the operand at index \p OpNo /// to read a VGPR. @@ -250,6 +342,9 @@ public: unsigned OpName) const { return getNamedOperand(const_cast<MachineInstr &>(MI), OpName); } + + uint64_t getDefaultRsrcDataFormat() const; + }; namespace AMDGPU { @@ -258,7 +353,6 @@ namespace AMDGPU { int getVOPe32(uint16_t Opcode); int getCommuteRev(uint16_t Opcode); int getCommuteOrig(uint16_t Opcode); - int getMCOpcode(uint16_t Opcode, unsigned Gen); int getAddr64Inst(uint16_t Opcode); int getAtomicRetOp(uint16_t Opcode); int getAtomicNoRetOp(uint16_t Opcode); diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 713e84e..e2747dc 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -9,35 +9,65 @@ class vop { field bits<9> SI3; + field bits<10> VI3; } -class vopc <bits<8> si> : vop { +class vopc <bits<8> si, bits<8> vi = !add(0x40, si)> : vop { field bits<8> SI = si; + field bits<8> VI = vi; - field bits<9> SI3 = {0, si{7-0}}; + field bits<9> SI3 = {0, si{7-0}}; + field bits<10> VI3 = {0, 0, vi{7-0}}; } -class vop1 <bits<8> si> : vop { - field bits<8> SI = si; +class vop1 <bits<8> si, bits<8> vi = si> : vop { + field bits<8> SI = si; + field bits<8> VI = vi; - field bits<9> SI3 = {1, 1, si{6-0}}; + field bits<9> SI3 = {1, 1, si{6-0}}; + field bits<10> VI3 = !add(0x140, vi); } -class vop2 <bits<6> si> : vop { +class vop2 <bits<6> si, bits<6> vi = si> : vop { field bits<6> SI = si; + field bits<6> VI = vi; + + field bits<9> SI3 = {1, 0, 0, si{5-0}}; + field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}}; +} - field bits<9> SI3 = {1, 0, 0, si{5-0}}; +// Specify a VOP2 opcode for SI and VOP3 opcode for VI +// that doesn't have VOP2 encoding on VI +class vop23 <bits<6> si, bits<10> vi> : vop2 <si> { + let VI3 = vi; } -class vop3 <bits<9> si> : vop { - field bits<9> SI3 = si; +class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop { + let SI3 = si; + let VI3 = vi; +} + +class sop1 <bits<8> si, bits<8> vi = si> { + field bits<8> SI = si; + field bits<8> VI = vi; +} + +class sop2 <bits<7> si, bits<7> vi = si> { 
+ field bits<7> SI = si; + field bits<7> VI = vi; +} + +class sopk <bits<5> si, bits<5> vi = si> { + field bits<5> SI = si; + field bits<5> VI = vi; +} // Except for the NONE field, this must be kept in sync with the SISubtarget enum -// in AMDGPUMCInstLower.h +// in AMDGPUInstrInfo.cpp def SISubtarget { int NONE = -1; int SI = 0; + int VI = 1; } //===----------------------------------------------------------------------===// @@ -131,6 +161,22 @@ def as_i32imm: SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32); }]>; +def as_i64imm: SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i64); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64); +}]>; + def IMM8bit : PatLeaf <(imm), [{return isUInt<8>(N->getZExtValue());}] >; @@ -143,6 +189,10 @@ def IMM16bit : PatLeaf <(imm), [{return isUInt<16>(N->getZExtValue());}] >; +def IMM20bit : PatLeaf <(imm), + [{return isUInt<20>(N->getZExtValue());}] +>; + def IMM32bit : PatLeaf <(imm), [{return isUInt<32>(N->getZExtValue());}] >; @@ -156,13 +206,16 @@ class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ return isInlineImmediate(N); }]>; +class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{ + return isInlineImmediate(N); +}]>; + class SGPRImm <dag frag> : PatLeaf<frag, [{ - if (TM.getSubtarget<AMDGPUSubtarget>().getGeneration() < - AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { return false; } const SIRegisterInfo *SIRI = - static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo()); + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); U != E; ++U) { if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { @@ -186,6 +239,7 @@ def sopp_brtarget : Operand<OtherVT> { } include "SIInstrFormats.td" +include "VIInstrFormats.td" let OperandType = "OPERAND_IMMEDIATE" in { @@ -238,14 +292,15 @@ def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; -def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">; -def MUBUFAddr64Atomic : ComplexPattern<i64, 4, "SelectMUBUFAddr64">; +def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">; +def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; +def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">; def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; //===----------------------------------------------------------------------===// @@ -298,7 +353,7 @@ class SIMCInstr <string pseudo, int subtarget> { class EXPCommon : InstSI< (outs), (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, -
VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), + VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3), "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", [] > { @@ -308,60 +363,157 @@ class EXPCommon : InstSI< multiclass EXP_m { - let isPseudo = 1 in { + let isPseudo = 1, isCodeGenOnly = 1 in { def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; } def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; + + def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi; } //===----------------------------------------------------------------------===// // Scalar classes //===----------------------------------------------------------------------===// -class SOP1_32 <bits<8> op, string opName, list<dag> pattern> : SOP1 < - op, (outs SReg_32:$dst), (ins SSrc_32:$src0), - opName#" $dst, $src0", pattern +class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SOP1 <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> : + SOP1 <outs, ins, asm, []>, + SOP1e <op.SI>, + SIMCInstr<opName, SISubtarget.SI>; + +class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> : + SOP1 <outs, ins, asm, []>, + SOP1e <op.VI>, + SIMCInstr<opName, SISubtarget.VI>; + +multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : SOP1_Pseudo <opName, outs, ins, pattern>; + + def _si : SOP1_Real_si <op, opName, outs, ins, asm>; + + def _vi : SOP1_Real_vi <op, opName, outs, ins, asm>; + +} + +multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), + opName#" $dst, $src0", pattern >; -class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 < - op, (outs SReg_64:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern +multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern >; +// no input, 64-bit output. +multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> { + def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>; + + def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins), + opName#" $dst"> { + let ssrc0 = 0; + } + + def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins), + opName#" $dst"> { + let ssrc0 = 0; + } +} + +// 64-bit input, no output +multiclass SOP1_1 <sop1 op, string opName, list<dag> pattern> { + def "" : SOP1_Pseudo <opName, (outs), (ins SReg_64:$src0), pattern>; + + def _si : SOP1_Real_si <op, opName, (outs), (ins SReg_64:$src0), + opName#" $src0"> { + let sdst = 0; + } + + def _vi : SOP1_Real_vi <op, opName, (outs), (ins SReg_64:$src0), + opName#" $src0"> { + let sdst = 0; + } +} + // 64-bit input, 32-bit output. 
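// e.g. s_bcnt1_i32_b64 reads a 64-bit source and defines a 32-bit result.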
-class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 < - op, (outs SReg_32:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern +multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern >; -class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 < - op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; +class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> : + SOP2<outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; + let Size = 4; -class SOP2_SELECT_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 < - op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]", pattern ->; + // Pseudo instructions have no encodings, but adding this field here allows + // us to do: + // let sdst = xxx in { + // for multiclasses that include both real and pseudo instructions. + field bits<7> sdst = 0; +} -class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 < - op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), - opName#" $dst, $src0, $src1", pattern ->; +class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> : + SOP2<outs, ins, asm, []>, + SOP2e<op.SI>, + SIMCInstr<opName, SISubtarget.SI>; + +class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : + SOP2<outs, ins, asm, []>, + SOP2e<op.VI>, + SIMCInstr<opName, SISubtarget.VI>; + +multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> { + def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>; + + def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), + opName#" $dst, $src0, $src1 [$scc]">; + + def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), + opName#" $dst, $src0, $src1 [$scc]">; +} + +multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : SOP2_Pseudo <opName, outs, ins, pattern>; + + def _si : SOP2_Real_si <op, opName, outs, ins, asm>; -class SOP2_64_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 < - op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern + def _vi : SOP2_Real_vi <op, opName, outs, ins, asm>; + +} + +multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern >; -class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 < - op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern +multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $dst, $src0, $src1", pattern >; +multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; -class SOPC_Helper <bits<7> op, RegisterClass rc, ValueType vt, +class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, string opName, PatLeaf cond> : SOPC < op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), opName#" 
$dst, $src0, $src1", []>; @@ -372,15 +524,44 @@ class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper<op, SSrc_64, i64, opName, cond>; -class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK < - op, (outs SReg_32:$dst), (ins u16imm:$src0), - opName#" $dst, $src0", pattern ->; +class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SOPK <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} -class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK < - op, (outs SReg_64:$dst), (ins u16imm:$src0), - opName#" $dst, $src0", pattern ->; +class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> : + SOPK <outs, ins, asm, []>, + SOPKe <op.SI>, + SIMCInstr<opName, SISubtarget.SI>; + +class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> : + SOPK <outs, ins, asm, []>, + SOPKe <op.VI>, + SIMCInstr<opName, SISubtarget.VI>; + +multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { + def "" : SOPK_Pseudo <opName, (outs SReg_32:$dst), (ins u16imm:$src0), + pattern>; + + def _si : SOPK_Real_si <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0), + opName#" $dst, $src0">; + + def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0), + opName#" $dst, $src0">; +} + +multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { + def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst), + (ins SReg_32:$src0, u16imm:$src1), pattern>; + + def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst), + (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0">; + + def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst), + (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0">; +} //===----------------------------------------------------------------------===// // SMRD classes @@ -390,6 +571,7 @@ class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : SMRD <outs, ins, "", pattern>, SIMCInstr<opName, SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 1; } class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, @@ -398,6 +580,12 @@ class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, SMRDe <op, imm>, SIMCInstr<opName, SISubtarget.SI>; +class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins, + string asm> : + SMRD <outs, ins, asm, []>, + SMEMe_vi <op, imm>, + SIMCInstr<opName, SISubtarget.VI>; + multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, string asm, list<dag> pattern> { @@ -405,6 +593,11 @@ multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>; + // glc is only applicable to scalar stores, which are not yet + // implemented. + let glc = 0 in { + def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; + } } multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass, @@ -444,44 +637,27 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> { // Returns the register class to use for the destination of VOP[123C] // instructions for the given VT. 
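// e.g. f32 -> VGPR_32, f64/i64 -> VReg_64, i1 -> SReg_64.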
class getVALUDstForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 32), VReg_32, VReg_64); + RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, + !if(!eq(VT.Size, 64), VReg_64, + SReg_64)); // else VT == i1 } // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); + RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); } // Returns the register class to use for source 1 of VOP[12C] for the // given VT. class getVOPSrc1ForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 32), VReg_32, VReg_64); -} - -// Returns the register classes for the source arguments of a VOP[12C] -// instruction for the given SrcVTs. -class getInRC32 <list<ValueType> SrcVT> { - list<RegisterClass> ret = [ - getVOPSrc0ForVT<SrcVT[0]>.ret, - getVOPSrc1ForVT<SrcVT[1]>.ret - ]; + RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); } // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); -} - -// Returns the register classes for the source arguments of a VOP3 -// instruction for the given SrcVTs. -class getInRC64 <list<ValueType> SrcVT> { - list<RegisterClass> ret = [ - getVOP3SrcForVT<SrcVT[0]>.ret, - getVOP3SrcForVT<SrcVT[1]>.ret, - getVOP3SrcForVT<SrcVT[2]>.ret - ]; + RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); } // Returns 1 if the source arguments have modifiers, 0 if they do not. @@ -491,15 +667,15 @@ class hasModifiers<ValueType SrcVT> { } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. -class getIns32 <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs> { +class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 (ins))); } // Returns the input arguments for VOP3 instructions for the given SrcVT. -class getIns64 <RegisterClass Src0RC, RegisterClass Src1RC, - RegisterClass Src2RC, int NumSrcArgs, +class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, + RegisterOperand Src2RC, int NumSrcArgs, bit HasModifiers> { dag ret = @@ -549,7 +725,7 @@ class getAsm32 <int NumSrcArgs> { // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. 
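// e.g. with two sources and modifiers this produces // " $dst, $src0_modifiers, $src1_modifiers$clamp$omod".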
class getAsm64 <int NumSrcArgs, bit HasModifiers> { - string src0 = "$src0_modifiers,"; + string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); string src1 = !if(!eq(NumSrcArgs, 1), "", !if(!eq(NumSrcArgs, 2), " $src1_modifiers", " $src1_modifiers,")); @@ -570,11 +746,11 @@ class VOPProfile <list<ValueType> _ArgVT> { field ValueType Src1VT = ArgVT[2]; field ValueType Src2VT = ArgVT[3]; field RegisterClass DstRC = getVALUDstForVT<DstVT>.ret; - field RegisterClass Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; + field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret; - field RegisterClass Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; - field RegisterClass Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; - field RegisterClass Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; + field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; + field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; + field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret; field bit HasModifiers = hasModifiers<Src0VT>.ret; @@ -604,14 +780,31 @@ def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; +def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { let Src0RC32 = VCSrc_32; } + +def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { + let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = " $dst, $src0_modifiers, $src1"; +} + +def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { + let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = " $dst, $src0_modifiers, $src1"; +} + def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; +def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; +def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { + field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); + field string Asm = " $dst, $src0, $vsrc1, $src2"; +} def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; @@ -633,8 +826,13 @@ class AtomicNoRet <string noRetOp, bit isRet> { class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : VOP1Common <outs, ins, "", pattern>, - SIMCInstr<opName, SISubtarget.NONE> { + VOP <opName>, + SIMCInstr <opName#"_e32", SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 1; + + field bits<8> vdst; + field bits<9> src0; } multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, @@ -642,32 +840,99 @@ multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, def "" : VOP1_Pseudo <outs, ins, pattern, opName>; def _si : VOP1<op.SI, outs, ins, asm, []>, - SIMCInstr <opName, SISubtarget.SI>; + SIMCInstr <opName#"_e32", SISubtarget.SI>; + def _vi : VOP1<op.VI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI>; +} + +multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, + string 
opName> { + def "" : VOP1_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP1<op.SI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI>; + // No VI instruction. This class is for SI only. +} + +class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOP2Common <outs, ins, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e32", SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, string revOp> { + def "" : VOP2_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; + + def _si : VOP2 <op.SI, outs, ins, opName#asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI>; +} + +multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, string revOp> { + def "" : VOP2_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; + + def _si : VOP2 <op.SI, outs, ins, opName#asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI>; + def _vi : VOP2 <op.VI, outs, ins, opName#asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI>; } class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> { bits<2> src0_modifiers = !if(HasModifiers, ?, 0); bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); - bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ? ,0) ,0); + bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0); bits<2> omod = !if(HasModifiers, ?, 0); bits<1> clamp = !if(HasModifiers, ?, 0); bits<9> src1 = !if(HasSrc1, ?, 0); bits<9> src2 = !if(HasSrc2, ?, 0); } +class VOP3DisableModFields <bit HasSrc0Mods, + bit HasSrc1Mods = 0, + bit HasSrc2Mods = 0, + bit HasOutputMods = 0> { + bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0); + bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0); + bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0); + bits<2> omod = !if(HasOutputMods, ?, 0); + bits<1> clamp = !if(HasOutputMods, ?, 0); +} + class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : VOP3Common <outs, ins, "", pattern>, VOP <opName>, - SIMCInstr<opName, SISubtarget.NONE> { + SIMCInstr<opName#"_e64", SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 1; } class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : - VOP3 <op, outs, ins, asm, []>, - SIMCInstr<opName, SISubtarget.SI>; - -multiclass VOP3_m <vop3 op, dag outs, dag ins, string asm, list<dag> pattern, + VOP3Common <outs, ins, asm, []>, + VOP3e <op>, + SIMCInstr<opName#"_e64", SISubtarget.SI>; + +class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3e_vi <op>, + SIMCInstr <opName#"_e64", SISubtarget.VI>; + +class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3be <op>, + SIMCInstr<opName#"_e64", SISubtarget.SI>; + +class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3be_vi <op>, + SIMCInstr <opName#"_e64", SISubtarget.VI>; + +multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, int NumSrcArgs, bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>; @@ -676,7 +941,26 @@ multiclass VOP3_m <vop3 op, dag outs, dag ins, string asm, list<dag> pattern, VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), !if(!eq(NumSrcArgs, 2), 0, 1), HasMods>; + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, 
opName>, + VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), + !if(!eq(NumSrcArgs, 2), 0, 1), + HasMods>; +} + +// VOP3_m without source modifiers +multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, int NumSrcArgs, bit HasMods = 1> { + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + let src0_modifiers = 0, + src1_modifiers = 0, + src2_modifiers = 0, + clamp = 0, + omod = 0 in { + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>; + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>; + } } multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, @@ -686,6 +970,19 @@ multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, VOP3DisableFields<0, 0, HasMods>; + + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; +} + +multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, bit HasMods = 1> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; + // No VI instruction. This class is for SI only. } multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, @@ -695,12 +992,28 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, def "" : VOP3_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - def _si : VOP3_Real_si <op.SI3, - outs, ins, asm, opName>, - VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>, + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; +} + +multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods>; + + // No VI instruction. This class is for SI only. } +// XXX - Is v_div_scale_{f32|f64} only available in vop3b without +// option of implicit vcc use? multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, bit HasMods = 1, bit UseFullOp = 0> { @@ -711,13 +1024,27 @@ multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, // can write it into any SGPR. We currently don't use the carry out, // so for now hardcode it to VCC as well. 
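// e.g. even the VOP3-encoded v_add_i32 will define VCC here rather than // an arbitrary SGPR pair.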
let sdst = SIOperand.VCC, Defs = [VCC] in { - def _si : VOP3b <op.SI3, outs, ins, asm, pattern>, - VOP3DisableFields<1, 0, HasMods>, - SIMCInstr<opName, SISubtarget.SI>, - VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>; + def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods>; } // End sdst = SIOperand.VCC, Defs = [VCC] } +multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + + def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 1, HasMods>; + + def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 1, HasMods>; +} + multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, bit HasMods, bit defExec> { @@ -725,17 +1052,39 @@ multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, def "" : VOP3_Pseudo <outs, ins, pattern, opName>; def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 0, HasMods> { + VOP3DisableFields<1, 0, HasMods> { + let Defs = !if(defExec, [EXEC], []); + } + + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); } } +// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers. +multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, + string asm, list<dag> pattern = []> { + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : VOPAnyCommon <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE>; + } + + def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>, + SIMCInstr <opName, SISubtarget.SI>; + + def _vi : VOP3Common <outs, ins, asm, []>, + VOP3e_vi <op.VI3>, + VOP3DisableFields <1, 0, 0>, + SIMCInstr <opName, SISubtarget.VI>; +} + multiclass VOP1_Helper <vop1 op, string opName, dag outs, dag ins32, string asm32, list<dag> pat32, dag ins64, string asm64, list<dag> pat64, bit HasMods> { - def _e32 : VOP1 <op.SI, outs, ins32, opName#asm32, pat32>, VOP<opName>; + defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>; defm _e64 : VOP3_1_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, HasMods>; } @@ -752,17 +1101,24 @@ multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, P.HasModifiers >; -class VOP2_e32 <bits<6> op, string opName, dag outs, dag ins, string asm, - list<dag> pattern, string revOp> : - VOP2 <op, outs, ins, opName#asm, pattern>, - VOP <opName>, - VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; +multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> { + + defm _e32 : VOP1SI_m <op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>; + + defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), + opName, P.HasModifiers>; +} multiclass VOP2_Helper <vop2 op, string opName, dag outs, dag ins32, string asm32, list<dag> pat32, dag ins64, string asm64, list<dag> pat64, string revOp, bit HasMods> { - def _e32 : VOP2_e32 <op.SI, opName, outs, ins32, asm32, pat32, revOp>; + defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; defm _e64 : VOP3_2_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods @@ 
-784,12 +1140,27 @@ multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, revOp, P.HasModifiers >; +multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> { + defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>; + + defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#"_e64"#P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + opName, revOp, P.HasModifiers>; +} + multiclass VOP2b_Helper <vop2 op, string opName, dag outs, dag ins32, string asm32, list<dag> pat32, dag ins64, string asm64, list<dag> pat64, string revOp, bit HasMods> { - def _e32 : VOP2_e32 <op.SI, opName, outs, ins32, asm32, pat32, revOp>; + defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; defm _e64 : VOP3b_2_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods @@ -811,16 +1182,94 @@ multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, revOp, P.HasModifiers >; +// A VOP2 instruction that is VOP3-only on VI. +multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs, + dag ins32, string asm32, list<dag> pat32, + dag ins64, string asm64, list<dag> pat64, + string revOp, bit HasMods> { + defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>; + + defm _e64 : VOP3_2_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, + revOp, HasMods>; +} + +multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> + : VOP2_VI3_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers +>; + +multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> { + + def "" : VOP2_Pseudo <VOP_MADK.Outs, VOP_MADK.Ins, pattern, opName>; + +let isCodeGenOnly = 0 in { + def _si : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, + !strconcat(opName, VOP_MADK.Asm), []>, + SIMCInstr <opName#"_e32", SISubtarget.SI>, + VOP2_MADKe <op.SI>; + + def _vi : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, + !strconcat(opName, VOP_MADK.Asm), []>, + SIMCInstr <opName#"_e32", SISubtarget.VI>, + VOP2_MADKe <op.VI>; +} // End isCodeGenOnly = 0 +} + +class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOPCCommon <ins, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e32", SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, bit DefExec> { + def "" : VOPC_Pseudo <outs, ins, pattern, opName>; + + def _si : VOPC<op.SI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let Defs = !if(DefExec, [EXEC], []); + } + + def _vi : VOPC<op.VI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let Defs = !if(DefExec, [EXEC], []); + } +} + multiclass VOPC_Helper <vopc op, string opName, dag ins32, string asm32, list<dag> pat32, dag out64, dag ins64, string asm64, list<dag> pat64, bit HasMods, bit DefExec> { - def _e32 : VOPC <op.SI, ins32, opName#asm32, pat32>, VOP <opName> { - let Defs = 
!if(DefExec, [EXEC], []); - } + defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; + + defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64, + opName, HasMods, DefExec>; +} + +// Special case for class instructions which only have modifiers on +// the 1st source operand. +multiclass VOPC_Class_Helper <vopc op, string opName, + dag ins32, string asm32, list<dag> pat32, + dag out64, dag ins64, string asm64, list<dag> pat64, + bit HasMods, bit DefExec> { + defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; - defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64, opName, - HasMods, DefExec>; + defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64, + opName, HasMods, DefExec>, + VOP3DisableModFields<1, 0, 0>; } multiclass VOPCInst <vopc op, string opName, @@ -839,6 +1288,19 @@ multiclass VOPCInst <vopc op, string opName, P.HasModifiers, DefExec >; +multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, + bit DefExec = 0> : VOPC_Class_Helper < + op, opName, + P.Ins32, P.Asm32, [], + (outs SReg_64:$dst), P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set i1:$dst, + (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], + [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), + P.HasModifiers, DefExec +>; + + multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> : VOPCInst <op, opName, VOP_F32_F32_F32, cond>; @@ -873,6 +1335,18 @@ multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, op, outs, ins, opName#asm, pat, opName, NumSrcArgs, HasMods >; +multiclass VOPC_CLASS_F32 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>; + +multiclass VOPCX_CLASS_F32 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>; + +multiclass VOPC_CLASS_F64 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>; + +multiclass VOPCX_CLASS_F64 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>; + multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP3_Helper < op, opName, P.Outs, P.Ins64, P.Asm64, @@ -901,9 +1375,31 @@ multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, P.NumSrcArgs, P.HasModifiers >; -multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterClass arc, +// Special case for v_div_fmas_{f32|f64}, since it seems to be the +// only VOP instruction that implicitly reads VCC. 
+multiclass VOP3_VCC_Inst <vop3 op, string opName, + VOPProfile P, + SDPatternOperator node = null_frag> : VOP3_Helper < + op, opName, + P.Outs, + (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, + InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, + InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, + ClampMod:$clamp, + omod:$omod), + " $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), + (i1 VCC)))], + 3, 1 +>; + +multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc, string opName, list<dag> pattern> : - VOP3b_2_m < + VOP3b_3_m < op, (outs vrc:$vdst, SReg_64:$sdst), (ins InputModsNoDefault:$src0_modifiers, arc:$src0, InputModsNoDefault:$src1_modifiers, arc:$src1, @@ -917,7 +1413,7 @@ multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> : VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>; multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>; + VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>; class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< @@ -931,124 +1427,259 @@ class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< i32:$omod)>; //===----------------------------------------------------------------------===// +// Interpolation opcodes +//===----------------------------------------------------------------------===// + +class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + VINTRPCommon <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins, + string asm> : + VINTRPCommon <outs, ins, asm, []>, + VINTRPe <op>, + SIMCInstr<opName, SISubtarget.SI>; + +class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins, + string asm> : + VINTRPCommon <outs, ins, asm, []>, + VINTRPe_vi <op>, + SIMCInstr<opName, SISubtarget.VI>; + +multiclass VINTRP_m <bits <2> op, string opName, dag outs, dag ins, string asm, + string disableEncoding = "", string constraints = "", + list<dag> pattern = []> { + let DisableEncoding = disableEncoding, + Constraints = constraints in { + def "" : VINTRP_Pseudo <opName, outs, ins, pattern>; + + def _si : VINTRP_Real_si <op, opName, outs, ins, asm>; + + def _vi : VINTRP_Real_vi <op, opName, outs, ins, asm>; + } +} + +//===----------------------------------------------------------------------===// // Vector I/O classes //===----------------------------------------------------------------------===// -class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> : - DS <op, outs, ins, asm, pat> { +class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + DS <outs, ins, "", pattern>, + SIMCInstr <opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS <outs, ins, asm, []>, + DSe <op>, + SIMCInstr <opName, SISubtarget.SI>; + +class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS <outs, ins, asm, []>, + DSe_vi <op>, + SIMCInstr <opName, SISubtarget.VI>; + +class DS_1A_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : + 
DS <outs, ins, asm, []>, + DSe <op>, + SIMCInstr <opName, SISubtarget.SI> { + + // Single load interpret the 2 i8imm operands as a single i16 offset. bits<16> offset; + let offset0 = offset{7-0}; + let offset1 = offset{15-8}; +} + +class DS_1A_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS <outs, ins, asm, []>, + DSe_vi <op>, + SIMCInstr <opName, SISubtarget.VI> { // Single load interpret the 2 i8imm operands as a single i16 offset. + bits<16> offset; let offset0 = offset{7-0}; let offset1 = offset{15-8}; +} + +multiclass DS_1A_Load_m <bits<8> op, string opName, dag outs, dag ins, string asm, + list<dag> pat> { + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def "" : DS_Pseudo <opName, outs, ins, pat>; - let hasSideEffects = 0; + let data0 = 0, data1 = 0 in { + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } + } } -class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < +multiclass DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> + : DS_1A_Load_m < op, + asm, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, ds_offset:$offset), - asm#" $vdst, $addr"#"$offset"#" [M0]", - []> { - let data0 = 0; - let data1 = 0; - let mayLoad = 1; - let mayStore = 0; + (ins i1imm:$gds, VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0), + asm#" $vdst, $addr"#"$offset", + []>; + +multiclass DS_Load2_m <bits<8> op, string opName, dag outs, dag ins, string asm, + list<dag> pat> { + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def "" : DS_Pseudo <opName, outs, ins, pat>; + + let data0 = 0, data1 = 0 in { + def _si : DS_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Real_vi <op, opName, outs, ins, asm>; + } + } } -class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS < +multiclass DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> + : DS_Load2_m < op, + asm, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1), - asm#" $vdst, $addr"#"$offset0"#"$offset1 [M0]", - []> { - let data0 = 0; - let data1 = 0; - let mayLoad = 1; - let mayStore = 0; - let hasSideEffects = 0; + (ins i1imm:$gds, VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, + M0Reg:$m0), + asm#" $vdst, $addr"#"$offset0"#"$offset1", + []>; + +multiclass DS_1A_Store_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat> { + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def "" : DS_Pseudo <opName, outs, ins, pat>; + + let data1 = 0, vdst = 0 in { + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } + } } -class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < +multiclass DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> + : DS_1A_Store_m < op, + asm, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, ds_offset:$offset), - asm#" $addr, $data0"#"$offset"#" [M0]", - []> { - let data1 = 0; - let mayStore = 1; - let mayLoad = 0; - let vdst = 0; + (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, ds_offset:$offset, M0Reg:$m0), + asm#" $addr, $data0"#"$offset", + []>; + +multiclass DS_Store_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat> { + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def "" : DS_Pseudo <opName, outs, ins, pat>; + + let vdst = 0 in { + def _si : DS_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Real_vi <op, 
opName, outs, ins, asm>; + } + } } -class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS < +multiclass DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> + : DS_Store_m < op, + asm, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, regClass:$data1, - ds_offset0:$offset0, ds_offset1:$offset1), - asm#" $addr, $data0, $data1"#"$offset0"#"$offset1 [M0]", - []> { - let mayStore = 1; - let mayLoad = 0; - let hasSideEffects = 0; - let vdst = 0; -} + (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, regClass:$data1, + ds_offset0:$offset0, ds_offset1:$offset1, M0Reg:$m0), + asm#" $addr, $data0, $data1"#"$offset0"#"$offset1", + []>; // 1 address, 1 data. -class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A < - op, - (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset), - asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", []>, - AtomicNoRet<noRetOp, 1> { +multiclass DS_1A1D_RET_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat, string noRetOp> { + let mayLoad = 1, mayStore = 1, + hasPostISelHook = 1 // Adjusted to no return version. + in { + def "" : DS_Pseudo <opName, outs, ins, pat>, + AtomicNoRet<noRetOp, 1>; + + let data1 = 0 in { + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } + } +} - let data1 = 0; - let mayStore = 1; - let mayLoad = 1; +multiclass DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, + string noRetOp = ""> : DS_1A1D_RET_m < + op, asm, + (outs rc:$vdst), + (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), + asm#" $vdst, $addr, $data0"#"$offset", [], noRetOp>; - let hasPostISelHook = 1; // Adjusted to no return version. +// 1 address, 2 data. +multiclass DS_1A2D_RET_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat, string noRetOp> { + let mayLoad = 1, mayStore = 1, + hasPostISelHook = 1 // Adjusted to no return version. + in { + def "" : DS_Pseudo <opName, outs, ins, pat>, + AtomicNoRet<noRetOp, 1>; + + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } } -// 1 address, 2 data. -class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A < - op, +multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, + string noRetOp = ""> : DS_1A2D_RET_m < + op, asm, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset), - asm#" $vdst, $addr, $data0, $data1"#"$offset"#" [M0]", - []>, - AtomicNoRet<noRetOp, 1> { - let mayStore = 1; - let mayLoad = 1; - let hasPostISelHook = 1; // Adjusted to no return version. -} + (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0), + asm#" $vdst, $addr, $data0, $data1"#"$offset", + [], noRetOp>; // 1 address, 2 data. 
-class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A < - op, - (outs), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset), - asm#" $addr, $data0, $data1"#"$offset"#" [M0]", - []>, - AtomicNoRet<noRetOp, 0> { - let mayStore = 1; - let mayLoad = 1; +multiclass DS_1A2D_NORET_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat, string noRetOp> { + let mayLoad = 1, mayStore = 1 in { + def "" : DS_Pseudo <opName, outs, ins, pat>, + AtomicNoRet<noRetOp, 0>; + + let vdst = 0 in { + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } + } } -// 1 address, 1 data. -class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A < - op, +multiclass DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, + string noRetOp = asm> : DS_1A2D_NORET_m < + op, asm, (outs), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset), - asm#" $addr, $data0"#"$offset"#" [M0]", - []>, - AtomicNoRet<noRetOp, 0> { + (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0), + asm#" $addr, $data0, $data1"#"$offset", + [], noRetOp>; - let data1 = 0; - let mayStore = 1; - let mayLoad = 1; +// 1 address, 1 data. +multiclass DS_1A1D_NORET_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat, string noRetOp> { + let mayLoad = 1, mayStore = 1 in { + def "" : DS_Pseudo <opName, outs, ins, pat>, + AtomicNoRet<noRetOp, 0>; + + let data1 = 0, vdst = 0 in { + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } + } } +multiclass DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, + string noRetOp = asm> : DS_1A1D_NORET_m < + op, asm, + (outs), + (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), + asm#" $addr, $data0"#"$offset", + [], noRetOp>; + //===----------------------------------------------------------------------===// // MTBUF classes //===----------------------------------------------------------------------===// @@ -1057,6 +1688,7 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : MTBUF <outs, ins, "", pattern>, SIMCInstr<opName, SISubtarget.NONE> { let isPseudo = 1; + let isCodeGenOnly = 1; } class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins, @@ -1065,6 +1697,11 @@ class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins, MTBUFe <op>, SIMCInstr<opName, SISubtarget.SI>; +class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> : + MTBUF <outs, ins, asm, []>, + MTBUFe_vi <op>, + SIMCInstr <opName, SISubtarget.VI>; + multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm, list<dag> pattern> { @@ -1072,6 +1709,8 @@ multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm, def _si : MTBUF_Real_si <op, opName, outs, ins, asm>; + def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>; + } let mayStore = 1, mayLoad = 0 in { @@ -1080,8 +1719,8 @@ multiclass MTBUF_Store_Helper <bits<3> op, string opName, RegisterClass regClass> : MTBUF_m < op, opName, (outs), (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), + i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, + SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), 
opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] >; @@ -1094,43 +1733,124 @@ multiclass MTBUF_Load_Helper <bits<3> op, string opName, RegisterClass regClass> : MTBUF_m < op, opName, (outs regClass:$dst), (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), + i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, + i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] >; } // mayLoad = 1, mayStore = 0 -class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { +//===----------------------------------------------------------------------===// +// MUBUF classes +//===----------------------------------------------------------------------===// +class mubuf <bits<7> si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { bit IsAddr64 = is_addr64; string OpName = NAME # suffix; } -class MUBUFAtomicAddr64 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> - : MUBUF <op, outs, ins, asm, pattern> { +class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + MUBUF <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; + + // dummy fields, so that we can use let statements around multiclasses + bits<1> offen; + bits<1> idxen; + bits<8> vaddr; + bits<1> glc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class MUBUF_Real_si <mubuf op, string opName, dag outs, dag ins, + string asm> : + MUBUF <outs, ins, asm, []>, + MUBUFe <op.SI>, + SIMCInstr<opName, SISubtarget.SI> { + let lds = 0; +} - let offen = 0; - let idxen = 0; - let addr64 = 1; - let tfe = 0; +class MUBUF_Real_vi <mubuf op, string opName, dag outs, dag ins, + string asm> : + MUBUF <outs, ins, asm, []>, + MUBUFe_vi <op.VI>, + SIMCInstr<opName, SISubtarget.VI> { let lds = 0; - let soffset = 128; } -class MUBUFAtomicOffset <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> - : MUBUF <op, outs, ins, asm, pattern> { +multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <0>; - let offen = 0; - let idxen = 0; - let addr64 = 0; - let tfe = 0; + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; +} + +multiclass MUBUFAddr64_m <mubuf op, string opName, dag outs, + dag ins, string asm, list<dag> pattern> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <1>; + + let addr64 = 1 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. 
+} + +class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + MUBUF <outs, ins, asm, pattern>, MUBUFe <op> { let lds = 0; - let vaddr = 0; } -multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc, +multiclass MUBUFAtomicOffset_m <mubuf op, string opName, dag outs, dag ins, + string asm, list<dag> pattern, bit is_return> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>, + AtomicNoRet<NAME#"_OFFSET", is_return>; + + let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins, + string asm, list<dag> pattern, bit is_return> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, + AtomicNoRet<NAME#"_ADDR64", is_return>; + + let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. +} + +multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, ValueType vt, SDPatternOperator atomic> { let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { @@ -1138,174 +1858,149 @@ multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc, // No return variants let glc = 0 in { - def _ADDR64 : MUBUFAtomicAddr64 < - op, (outs), + defm _ADDR64 : MUBUFAtomicAddr64_m < + op, name#"_addr64", (outs), (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, - mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#"$slc", [] - >, MUBUFAddr64Table<1>, AtomicNoRet<NAME#"_ADDR64", 0>; + mbuf_offset:$offset, SCSrc_32:$soffset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 + >; - def _OFFSET : MUBUFAtomicOffset < - op, (outs), + defm _OFFSET : MUBUFAtomicOffset_m < + op, name#"_offset", (outs), (ins rc:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, - SSrc_32:$soffset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [] - >, MUBUFAddr64Table<0>, AtomicNoRet<NAME#"_OFFSET", 0>; + SCSrc_32:$soffset, slc:$slc), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 + >; } // glc = 0 // Variant that return values let glc = 1, Constraints = "$vdata = $vdata_in", DisableEncoding = "$vdata_in" in { - def _RTN_ADDR64 : MUBUFAtomicAddr64 < - op, (outs rc:$vdata), + defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < + op, name#"_rtn_addr64", (outs rc:$vdata), (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, - mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#" glc"#"$slc", + mbuf_offset:$offset, SSrc_32:$soffset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", [(set vt:$vdata, - (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i16:$offset, - i1:$slc), vt:$vdata_in))] - >, MUBUFAddr64Table<1, "_RTN">, AtomicNoRet<NAME#"_ADDR64", 1>; + (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$slc), vt:$vdata_in))], 1 + >; - def _RTN_OFFSET : MUBUFAtomicOffset < - op, (outs rc:$vdata), + defm _RTN_OFFSET : MUBUFAtomicOffset_m < + op, name#"_rtn_offset", (outs rc:$vdata), (ins rc:$vdata_in, SReg_128:$srsrc, mbuf_offset:$offset, - SSrc_32:$soffset, slc:$slc), + SCSrc_32:$soffset, slc:$slc), name#" $vdata, $srsrc, 
$soffset"#"$offset"#" glc $slc", [(set vt:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, - i1:$slc), vt:$vdata_in))] - >, MUBUFAddr64Table<0, "_RTN">, AtomicNoRet<NAME#"_OFFSET", 1>; + i1:$slc), vt:$vdata_in))], 1 + >; } // glc = 1 } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 } -multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass, +multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { - let lds = 0, mayLoad = 1 in { + let mayLoad = 1, mayStore = 0 in { + let offen = 0, idxen = 0, vaddr = 0 in { + defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata), + (ins SReg_128:$srsrc, + mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, + slc:$slc, tfe:$tfe), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, + i32:$soffset, i16:$offset, + i1:$glc, i1:$slc, i1:$tfe)))]>; + } - let addr64 = 0 in { + let offen = 1, idxen = 0 in { + defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata), + (ins SReg_128:$srsrc, VGPR_32:$vaddr, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, + tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata), + (ins SReg_128:$srsrc, VGPR_32:$vaddr, + mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, + slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } - let offen = 0, idxen = 0, vaddr = 0 in { - def _OFFSET : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, - mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc, - slc:$slc, tfe:$tfe), - asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", - [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, - i32:$soffset, i16:$offset, - i1:$glc, i1:$slc, i1:$tfe)))]>, - MUBUFAddr64Table<0>; - } - - let offen = 1, idxen = 0 in { - def _OFFEN : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_32:$vaddr, - SSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, - tfe:$tfe), - asm#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; - } - - let offen = 0, idxen = 1 in { - def _IDXEN : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_32:$vaddr, - mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc, - slc:$slc, tfe:$tfe), - asm#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; - } - - let offen = 1, idxen = 1 in { - def _BOTHEN : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_64:$vaddr, - SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), - asm#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>; - } + let offen = 1, idxen = 1 in { + defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata), + (ins SReg_128:$srsrc, VReg_64:$vaddr, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; } - let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in { - def _ADDR64 : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), - asm#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", + let offen = 0, idxen = 0, glc = 0, slc = 0, tfe = 0 in { + defm _ADDR64 : MUBUFAddr64_m <op, 
name#"_addr64", (outs regClass:$vdata), + (ins SReg_128:$srsrc, VReg_64:$vaddr, + SCSrc_32:$soffset, mbuf_offset:$offset), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset", [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, - i64:$vaddr, i16:$offset)))]>, MUBUFAddr64Table<1>; + i64:$vaddr, i32:$soffset, + i16:$offset)))]>; } } } -multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass, +multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, ValueType store_vt, SDPatternOperator st> { - - let addr64 = 0, lds = 0 in { - - def "" : MUBUF < - op, (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset, - mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc, - tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"# - "$glc"#"$slc"#"$tfe", - [] - >; + let mayLoad = 0, mayStore = 1 in { + defm : MUBUF_m <op, name, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset, + mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc, + tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"# + "$glc"#"$slc"#"$tfe", []>; let offen = 0, idxen = 0, vaddr = 0 in { - def _OFFSET : MUBUF < - op, (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, - SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", - [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, - i1:$tfe))] - >, MUBUFAddr64Table<0>; + defm _OFFSET : MUBUF_m <op, name#"_offset",(outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, + SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>; } // offen = 0, idxen = 0, vaddr = 0 let offen = 1, idxen = 0 in { - def _OFFEN : MUBUF < - op, (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset, - mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"# - "$glc"#"$slc"#"$tfe", - [] - >; + defm _OFFEN : MUBUF_m <op, name#"_offen", (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"# + "$glc"#"$slc"#"$tfe", []>; } // end offen = 1, idxen = 0 - } // End addr64 = 0, lds = 0 - - def _ADDR64 : MUBUF < - op, (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), - name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", - [(st store_vt:$vdata, - (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>, MUBUFAddr64Table<1> - { - - let mayLoad = 0; - let mayStore = 1; - - // Encoding - let offen = 0; - let idxen = 0; - let glc = 0; - let addr64 = 1; - let lds = 0; - let slc = 0; - let tfe = 0; - let soffset = 128; // ZERO - } + let offen = 0, idxen = 0, glc = 0, slc = 0, tfe = 0 in { + defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, + VReg_64:$vaddr, SCSrc_32:$soffset, + mbuf_offset:$offset), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset", + [(st store_vt:$vdata, + (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, + i32:$soffset, i16:$offset))]>; + } + } // End mayLoad = 0, mayStore = 1 } class 
FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : - FLAT <op, (outs regClass:$data), + FLAT <op, (outs regClass:$vdst), (ins VReg_64:$addr), - asm#" $data, $addr, [M0, FLAT_SCRATCH]", []> { + asm#" $vdst, $addr, [M0, FLAT_SCRATCH]", []> { let glc = 0; let slc = 0; let tfe = 0; + let data = 0; let mayLoad = 1; } @@ -1321,6 +2016,7 @@ class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : let glc = 0; let slc = 0; let tfe = 0; + let vdst = 0; } class MIMG_Mask <string op, int channels> { @@ -1339,7 +2035,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm, asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," #" $tfe, $lwe, $slc, $vaddr, $srsrc", []> { - let SSAMP = 0; + let ssamp = 0; let mayLoad = 1; let mayStore = 0; let hasPostISelHook = 1; @@ -1348,7 +2044,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm, multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, RegisterClass dst_rc, int channels> { - def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_32>, + def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32>, MIMG_Mask<asm#"_V1", channels>; def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>, MIMG_Mask<asm#"_V2", channels>; @@ -1357,7 +2053,7 @@ multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, } multiclass MIMG_NoSampler <bits<7> op, string asm> { - defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VReg_32, 1>; + defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>; defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>; defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>; defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>; @@ -1365,7 +2061,7 @@ multiclass MIMG_NoSampler <bits<7> op, string asm> { class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc, - RegisterClass src_rc> : MIMG < + RegisterClass src_rc, int wqm> : MIMG < op, (outs dst_rc:$vdata), (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, @@ -1377,33 +2073,41 @@ class MIMG_Sampler_Helper <bits<7> op, string asm, let mayLoad = 1; let mayStore = 0; let hasPostISelHook = 1; + let WQM = wqm; } multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, RegisterClass dst_rc, - int channels> { - def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_32>, + int channels, int wqm> { + def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm>, MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64>, + def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>, MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128>, + def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>, MIMG_Mask<asm#"_V4", channels>; - def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256>, + def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>, MIMG_Mask<asm#"_V8", channels>; - def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512>, + def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>, MIMG_Mask<asm#"_V16", channels>; } multiclass MIMG_Sampler <bits<7> op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VReg_32, 1>; - defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2>; - defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3>; - defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>; + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 0>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 0>; + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 0>; + defm _V4 : 
MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 0>; +} + +multiclass MIMG_Sampler_WQM <bits<7> op, string asm> { + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 1>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 1>; + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 1>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 1>; } class MIMG_Gather_Helper <bits<7> op, string asm, RegisterClass dst_rc, - RegisterClass src_rc> : MIMG < + RegisterClass src_rc, int wqm> : MIMG < op, (outs dst_rc:$vdata), (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, @@ -1424,28 +2128,36 @@ class MIMG_Gather_Helper <bits<7> op, string asm, // Therefore, disable all code which updates DMASK by setting these two: let MIMG = 0; let hasPostISelHook = 0; + let WQM = wqm; } multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, RegisterClass dst_rc, - int channels> { - def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_32>, + int channels, int wqm> { + def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>, MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>, + def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>, MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128>, + def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>, MIMG_Mask<asm#"_V4", channels>; - def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256>, + def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>, MIMG_Mask<asm#"_V8", channels>; - def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512>, + def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>, MIMG_Mask<asm#"_V16", channels>; } multiclass MIMG_Gather <bits<7> op, string asm> { - defm _V1 : MIMG_Gather_Src_Helper<op, asm, VReg_32, 1>; - defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>; - defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>; - defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>; + defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 0>; + defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 0>; + defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 0>; + defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 0>; +} + +multiclass MIMG_Gather_WQM <bits<7> op, string asm> { + defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 1>; + defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 1>; + defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 1>; + defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 1>; } //===----------------------------------------------------------------------===// @@ -1496,20 +2208,12 @@ def getCommuteOrig : InstrMapping { let ValueCols = [["1"]]; } -def isDS : InstrMapping { - let FilterClass = "DS"; - let RowFields = ["Inst"]; - let ColFields = ["Size"]; - let KeyCol = ["8"]; - let ValueCols = [["8"]]; -} - -def getMCOpcode : InstrMapping { +def getMCOpcodeGen : InstrMapping { let FilterClass = "SIMCInstr"; let RowFields = ["PseudoInstr"]; let ColFields = ["Subtarget"]; let KeyCol = [!cast<string>(SISubtarget.NONE)]; - let ValueCols = [[!cast<string>(SISubtarget.SI)]]; + let ValueCols = [[!cast<string>(SISubtarget.SI)],[!cast<string>(SISubtarget.VI)]]; } def getAddr64Inst : InstrMapping { @@ -1539,3 +2243,5 @@ def getAtomicNoRetOp : InstrMapping { } include "SIInstructions.td" +include "CIInstructions.td" +include "VIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 90da7a9..4f72e99 100644 --- 
a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -26,11 +26,18 @@ def SendMsgImm : Operand<i32> { let PrintMethod = "printSendMsg"; } -def isSI : Predicate<"Subtarget.getGeneration() " +def isGCN : Predicate<"Subtarget->getGeneration() " ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">; - -def isCI : Predicate<"Subtarget.getGeneration() " +def isSICI : Predicate< + "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" +>; +def isCI : Predicate<"Subtarget->getGeneration() " ">= AMDGPUSubtarget::SEA_ISLANDS">; +def isVI : Predicate < + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS" +>; + def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">; def SWaitMatchClass : AsmOperandClass { @@ -43,7 +50,7 @@ def WAIT_FLAG : InstFlag<"printWaitFlag"> { let ParserMatchClass = SWaitMatchClass; } -let SubtargetPredicate = isSI in { +let SubtargetPredicate = isGCN in { //===----------------------------------------------------------------------===// // EXP Instructions @@ -96,90 +103,99 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < //===----------------------------------------------------------------------===// let isMoveImm = 1 in { -def S_MOV_B32 : SOP1_32 <0x00000003, "s_mov_b32", []>; -def S_MOV_B64 : SOP1_64 <0x00000004, "s_mov_b64", []>; -def S_CMOV_B32 : SOP1_32 <0x00000005, "s_cmov_b32", []>; -def S_CMOV_B64 : SOP1_64 <0x00000006, "s_cmov_b64", []>; + let isReMaterializable = 1 in { + defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>; + defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>; + } // let isRematerializeable = 1 + + let Uses = [SCC] in { + defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>; + defm S_CMOV_B64 : SOP1_64 <sop1<0x06, 0x03>, "s_cmov_b64", []>; + } // End Uses = [SCC] } // End isMoveImm = 1 -def S_NOT_B32 : SOP1_32 <0x00000007, "s_not_b32", - [(set i32:$dst, (not i32:$src0))] ->; +let Defs = [SCC] in { + defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32", + [(set i32:$dst, (not i32:$src0))] + >; -def S_NOT_B64 : SOP1_64 <0x00000008, "s_not_b64", - [(set i64:$dst, (not i64:$src0))] ->; -def S_WQM_B32 : SOP1_32 <0x00000009, "s_wqm_b32", []>; -def S_WQM_B64 : SOP1_64 <0x0000000a, "s_wqm_b64", []>; -def S_BREV_B32 : SOP1_32 <0x0000000b, "s_brev_b32", + defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64", + [(set i64:$dst, (not i64:$src0))] + >; + defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>; + defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>; +} // End Defs = [SCC] + + +defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32", [(set i32:$dst, (AMDGPUbrev i32:$src0))] >; -def S_BREV_B64 : SOP1_64 <0x0000000c, "s_brev_b64", []>; +defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>; -////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "s_bcnt0_i32_b32", []>; -////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "s_bcnt0_i32_b64", []>; -def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "s_bcnt1_i32_b32", - [(set i32:$dst, (ctpop i32:$src0))] ->; -def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "s_bcnt1_i32_b64", []>; +let Defs = [SCC] in { + defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>; + defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>; + defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32", + [(set i32:$dst, (ctpop i32:$src0))] + >; + defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>; +} // 
End Defs = [SCC] -////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "s_ff0_i32_b32", []>; -////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "s_ff0_i32_b64", []>; -def S_FF1_I32_B32 : SOP1_32 <0x00000013, "s_ff1_i32_b32", +defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>; +defm S_FF0_I32_B64 : SOP1_32_64 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>; +defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32", [(set i32:$dst, (cttz_zero_undef i32:$src0))] >; -////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "s_ff1_i32_b64", []>; +defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; -def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "s_flbit_i32_b32", +defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", [(set i32:$dst, (ctlz_zero_undef i32:$src0))] >; -//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "s_flbit_i32_b64", []>; -def S_FLBIT_I32 : SOP1_32 <0x00000017, "s_flbit_i32", []>; -//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "s_flbit_i32_i64", []>; -def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "s_sext_i32_i8", +defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>; +defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", []>; +defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>; +defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8", [(set i32:$dst, (sext_inreg i32:$src0, i8))] >; -def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "s_sext_i32_i16", +defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16", [(set i32:$dst, (sext_inreg i32:$src0, i16))] >; -////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "s_bitset0_b32", []>; -////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "s_bitset0_b64", []>; -////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "s_bitset1_b32", []>; -////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "s_bitset1_b64", []>; -def S_GETPC_B64 : SOP1 < - 0x0000001f, (outs SReg_64:$dst), (ins), "s_getpc_b64 $dst", [] -> { - let SSRC0 = 0; -} -def S_SETPC_B64 : SOP1_64 <0x00000020, "s_setpc_b64", []>; -def S_SWAPPC_B64 : SOP1_64 <0x00000021, "s_swappc_b64", []>; -def S_RFE_B64 : SOP1_64 <0x00000022, "s_rfe_b64", []>; - -let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in { - -def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "s_and_saveexec_b64", []>; -def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "s_or_saveexec_b64", []>; -def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "s_xor_saveexec_b64", []>; -def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "s_andn2_saveexec_b64", []>; -def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "s_orn2_saveexec_b64", []>; -def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "s_nand_saveexec_b64", []>; -def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "s_nor_saveexec_b64", []>; -def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "s_xnor_saveexec_b64", []>; - -} // End hasSideEffects = 1 - -def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "s_quadmask_b32", []>; -def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "s_quadmask_b64", []>; -def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "s_movrels_b32", []>; -def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "s_movrels_b64", []>; -def S_MOVRELD_B32 : SOP1_32 <0x00000030, "s_movreld_b32", []>; -def S_MOVRELD_B64 : SOP1_64 <0x00000031, "s_movreld_b64", []>; -//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "s_cbranch_join", []>; -def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "s_mov_regrd_b32", []>; -def S_ABS_I32 : SOP1_32 <0x00000034, "s_abs_i32", []>; -def S_MOV_FED_B32 : SOP1_32 <0x00000035, "s_mov_fed_b32", []>; +defm S_BITSET0_B32 : SOP1_32 <sop1<0x1b, 0x18>, 
"s_bitset0_b32", []>; +defm S_BITSET0_B64 : SOP1_64 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>; +defm S_BITSET1_B32 : SOP1_32 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>; +defm S_BITSET1_B64 : SOP1_64 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>; +defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>; +defm S_SETPC_B64 : SOP1_64 <sop1<0x20, 0x1d>, "s_setpc_b64", []>; +defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>; +defm S_RFE_B64 : SOP1_64 <sop1<0x22, 0x1f>, "s_rfe_b64", []>; + +let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { + +defm S_AND_SAVEEXEC_B64 : SOP1_64 <sop1<0x24, 0x20>, "s_and_saveexec_b64", []>; +defm S_OR_SAVEEXEC_B64 : SOP1_64 <sop1<0x25, 0x21>, "s_or_saveexec_b64", []>; +defm S_XOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x26, 0x22>, "s_xor_saveexec_b64", []>; +defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x27, 0x23>, "s_andn2_saveexec_b64", []>; +defm S_ORN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x28, 0x24>, "s_orn2_saveexec_b64", []>; +defm S_NAND_SAVEEXEC_B64 : SOP1_64 <sop1<0x29, 0x25>, "s_nand_saveexec_b64", []>; +defm S_NOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2a, 0x26>, "s_nor_saveexec_b64", []>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []>; + +} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] + +defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>; +defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>; +defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>; +defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>; +defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>; +defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>; +defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>; +defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>; +let Defs = [SCC] in { + defm S_ABS_I32 : SOP1_32 <sop1<0x34, 0x30>, "s_abs_i32", []>; +} // End Defs = [SCC] +defm S_MOV_FED_B32 : SOP1_32 <sop1<0x35, 0x31>, "s_mov_fed_b32", []>; //===----------------------------------------------------------------------===// // SOP2 Instructions @@ -187,119 +203,132 @@ def S_MOV_FED_B32 : SOP1_32 <0x00000035, "s_mov_fed_b32", []>; let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { -def S_ADD_U32 : SOP2_32 <0x00000000, "s_add_u32", []>; -def S_ADD_I32 : SOP2_32 <0x00000002, "s_add_i32", +defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>; +defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32", [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] >; } // End isCommutable = 1 -def S_SUB_U32 : SOP2_32 <0x00000001, "s_sub_u32", []>; -def S_SUB_I32 : SOP2_32 <0x00000003, "s_sub_i32", +defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>; +defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32", [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] >; let Uses = [SCC] in { // Carry in comes from SCC let isCommutable = 1 in { -def S_ADDC_U32 : SOP2_32 <0x00000004, "s_addc_u32", +defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32", [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End isCommutable = 1 -def S_SUBB_U32 : SOP2_32 <0x00000005, "s_subb_u32", +defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32", [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End Uses = [SCC] -} // End Defs = [SCC] -def S_MIN_I32 : SOP2_32 <0x00000006, "s_min_i32", +defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32", [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))] >; -def S_MIN_U32 
: SOP2_32 <0x00000007, "s_min_u32", +defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32", [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))] >; -def S_MAX_I32 : SOP2_32 <0x00000008, "s_max_i32", +defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32", [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))] >; -def S_MAX_U32 : SOP2_32 <0x00000009, "s_max_u32", +defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32", [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))] >; +} // End Defs = [SCC] -def S_CSELECT_B32 : SOP2_SELECT_32 < - 0x0000000a, "s_cselect_b32", - [] ->; +defm S_CSELECT_B32 : SOP2_SELECT_32 <sop2<0x0a>, "s_cselect_b32", []>; -def S_CSELECT_B64 : SOP2_64 <0x0000000b, "s_cselect_b64", []>; +let Uses = [SCC] in { + defm S_CSELECT_B64 : SOP2_64 <sop2<0x0b>, "s_cselect_b64", []>; +} // End Uses = [SCC] -def S_AND_B32 : SOP2_32 <0x0000000e, "s_and_b32", +let Defs = [SCC] in { +defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32", [(set i32:$dst, (and i32:$src0, i32:$src1))] >; -def S_AND_B64 : SOP2_64 <0x0000000f, "s_and_b64", +defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64", [(set i64:$dst, (and i64:$src0, i64:$src1))] >; -def S_OR_B32 : SOP2_32 <0x00000010, "s_or_b32", +defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32", [(set i32:$dst, (or i32:$src0, i32:$src1))] >; -def S_OR_B64 : SOP2_64 <0x00000011, "s_or_b64", +defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64", [(set i64:$dst, (or i64:$src0, i64:$src1))] >; -def S_XOR_B32 : SOP2_32 <0x00000012, "s_xor_b32", +defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32", [(set i32:$dst, (xor i32:$src0, i32:$src1))] >; -def S_XOR_B64 : SOP2_64 <0x00000013, "s_xor_b64", +defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64", [(set i64:$dst, (xor i64:$src0, i64:$src1))] >; -def S_ANDN2_B32 : SOP2_32 <0x00000014, "s_andn2_b32", []>; -def S_ANDN2_B64 : SOP2_64 <0x00000015, "s_andn2_b64", []>; -def S_ORN2_B32 : SOP2_32 <0x00000016, "s_orn2_b32", []>; -def S_ORN2_B64 : SOP2_64 <0x00000017, "s_orn2_b64", []>; -def S_NAND_B32 : SOP2_32 <0x00000018, "s_nand_b32", []>; -def S_NAND_B64 : SOP2_64 <0x00000019, "s_nand_b64", []>; -def S_NOR_B32 : SOP2_32 <0x0000001a, "s_nor_b32", []>; -def S_NOR_B64 : SOP2_64 <0x0000001b, "s_nor_b64", []>; -def S_XNOR_B32 : SOP2_32 <0x0000001c, "s_xnor_b32", []>; -def S_XNOR_B64 : SOP2_64 <0x0000001d, "s_xnor_b64", []>; +defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>; +defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>; +defm S_ORN2_B32 : SOP2_32 <sop2<0x16, 0x14>, "s_orn2_b32", []>; +defm S_ORN2_B64 : SOP2_64 <sop2<0x17, 0x15>, "s_orn2_b64", []>; +defm S_NAND_B32 : SOP2_32 <sop2<0x18, 0x16>, "s_nand_b32", []>; +defm S_NAND_B64 : SOP2_64 <sop2<0x19, 0x17>, "s_nand_b64", []>; +defm S_NOR_B32 : SOP2_32 <sop2<0x1a, 0x18>, "s_nor_b32", []>; +defm S_NOR_B64 : SOP2_64 <sop2<0x1b, 0x19>, "s_nor_b64", []>; +defm S_XNOR_B32 : SOP2_32 <sop2<0x1c, 0x1a>, "s_xnor_b32", []>; +defm S_XNOR_B64 : SOP2_64 <sop2<0x1d, 0x1b>, "s_xnor_b64", []>; +} // End Defs = [SCC] // Use added complexity so these patterns are preferred to the VALU patterns. 
let AddedComplexity = 1 in { +let Defs = [SCC] in { -def S_LSHL_B32 : SOP2_32 <0x0000001e, "s_lshl_b32", +defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32", [(set i32:$dst, (shl i32:$src0, i32:$src1))] >; -def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "s_lshl_b64", +defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64", [(set i64:$dst, (shl i64:$src0, i32:$src1))] >; -def S_LSHR_B32 : SOP2_32 <0x00000020, "s_lshr_b32", +defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32", [(set i32:$dst, (srl i32:$src0, i32:$src1))] >; -def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "s_lshr_b64", +defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64", [(set i64:$dst, (srl i64:$src0, i32:$src1))] >; -def S_ASHR_I32 : SOP2_32 <0x00000022, "s_ashr_i32", +defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32", [(set i32:$dst, (sra i32:$src0, i32:$src1))] >; -def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "s_ashr_i64", +defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64", [(set i64:$dst, (sra i64:$src0, i32:$src1))] >; +} // End Defs = [SCC] - -def S_BFM_B32 : SOP2_32 <0x00000024, "s_bfm_b32", []>; -def S_BFM_B64 : SOP2_64 <0x00000025, "s_bfm_b64", []>; -def S_MUL_I32 : SOP2_32 <0x00000026, "s_mul_i32", +defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32", []>; +defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>; +defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32", [(set i32:$dst, (mul i32:$src0, i32:$src1))] >; } // End AddedComplexity = 1 -def S_BFE_U32 : SOP2_32 <0x00000027, "s_bfe_u32", []>; -def S_BFE_I32 : SOP2_32 <0x00000028, "s_bfe_i32", []>; -def S_BFE_U64 : SOP2_64 <0x00000029, "s_bfe_u64", []>; -def S_BFE_I64 : SOP2_64_32 <0x0000002a, "s_bfe_i64", []>; -//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "s_cbranch_g_fork", []>; -def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "s_absdiff_i32", []>; +let Defs = [SCC] in { +defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>; +defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>; +defm S_BFE_U64 : SOP2_64 <sop2<0x29, 0x27>, "s_bfe_u64", []>; +defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>; +} // End Defs = [SCC] + +let sdst = 0 in { +defm S_CBRANCH_G_FORK : SOP2_m < + sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs), + (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", [] +>; +} + +let Defs = [SCC] in { +defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>; +} // End Defs = [SCC] //===----------------------------------------------------------------------===// // SOPC Instructions @@ -328,9 +357,13 @@ def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; //===----------------------------------------------------------------------===// let isReMaterializable = 1 in { -def S_MOVK_I32 : SOPK_32 <0x00000000, "s_movk_i32", []>; +defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>; } // End isReMaterializable = 1 -def S_CMOVK_I32 : SOPK_32 <0x00000002, "s_cmovk_i32", []>; +let Uses = [SCC] in { + defm S_CMOVK_I32 : SOPK_32 <sopk<0x02, 0x01>, "s_cmovk_i32", []>; +} + +let isCompare = 1 in { /* This instruction is disabled for now until we can figure out how to teach @@ -344,38 +377,36 @@ SCC = S_CMPK_EQ_I32 SGPR0, imm VCC = COPY SCC VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 -def S_CMPK_EQ_I32 : SOPK < - 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1), - "s_cmpk_eq_i32", +defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32", [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] >; */ -let isCompare = 1, Defs = [SCC] in { -def 
S_CMPK_LG_I32 : SOPK_32 <0x00000004, "s_cmpk_lg_i32", []>; -def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "s_cmpk_gt_i32", []>; -def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "s_cmpk_ge_i32", []>; -def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "s_cmpk_lt_i32", []>; -def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "s_cmpk_le_i32", []>; -def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "s_cmpk_eq_u32", []>; -def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "s_cmpk_lg_u32", []>; -def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "s_cmpk_gt_u32", []>; -def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "s_cmpk_ge_u32", []>; -def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "s_cmpk_lt_u32", []>; -def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "s_cmpk_le_u32", []>; -} // End isCompare = 1, Defs = [SCC] - -let Defs = [SCC], isCommutable = 1 in { - def S_ADDK_I32 : SOPK_32 <0x0000000f, "s_addk_i32", []>; - def S_MULK_I32 : SOPK_32 <0x00000010, "s_mulk_i32", []>; +defm S_CMPK_LG_I32 : SOPK_SCC <sopk<0x04, 0x03>, "s_cmpk_lg_i32", []>; +defm S_CMPK_GT_I32 : SOPK_SCC <sopk<0x05, 0x04>, "s_cmpk_gt_i32", []>; +defm S_CMPK_GE_I32 : SOPK_SCC <sopk<0x06, 0x05>, "s_cmpk_ge_i32", []>; +defm S_CMPK_LT_I32 : SOPK_SCC <sopk<0x07, 0x06>, "s_cmpk_lt_i32", []>; +defm S_CMPK_LE_I32 : SOPK_SCC <sopk<0x08, 0x07>, "s_cmpk_le_i32", []>; +defm S_CMPK_EQ_U32 : SOPK_SCC <sopk<0x09, 0x08>, "s_cmpk_eq_u32", []>; +defm S_CMPK_LG_U32 : SOPK_SCC <sopk<0x0a, 0x09>, "s_cmpk_lg_u32", []>; +defm S_CMPK_GT_U32 : SOPK_SCC <sopk<0x0b, 0x0a>, "s_cmpk_gt_u32", []>; +defm S_CMPK_GE_U32 : SOPK_SCC <sopk<0x0c, 0x0b>, "s_cmpk_ge_u32", []>; +defm S_CMPK_LT_U32 : SOPK_SCC <sopk<0x0d, 0x0c>, "s_cmpk_lt_u32", []>; +defm S_CMPK_LE_U32 : SOPK_SCC <sopk<0x0e, 0x0d>, "s_cmpk_le_u32", []>; +} // End isCompare = 1 + +let isCommutable = 1 in { + let Defs = [SCC], isCommutable = 1 in { + defm S_ADDK_I32 : SOPK_32 <sopk<0x0f, 0x0e>, "s_addk_i32", []>; + } + defm S_MULK_I32 : SOPK_32 <sopk<0x10, 0x0f>, "s_mulk_i32", []>; } -//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "s_cbranch_i_fork", []>; -def S_GETREG_B32 : SOPK_32 <0x00000012, "s_getreg_b32", []>; -def S_SETREG_B32 : SOPK_32 <0x00000013, "s_setreg_b32", []>; -def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "s_getreg_regrd_b32", []>; -//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "s_setreg_imm32_b32", []>; -//def EXP : EXP_ <0x00000000, "exp", []>; +//defm S_CBRANCH_I_FORK : SOPK_ <sopk<0x11, 0x10>, "s_cbranch_i_fork", []>; +defm S_GETREG_B32 : SOPK_32 <sopk<0x12, 0x11>, "s_getreg_b32", []>; +defm S_SETREG_B32 : SOPK_32 <sopk<0x13, 0x12>, "s_setreg_b32", []>; +defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>; +//defm S_SETREG_IMM32_B32 : SOPK_32 <sopk<0x15, 0x14>, "s_setreg_imm32_b32", []>; //===----------------------------------------------------------------------===// // SOPP Instructions @@ -476,82 +507,84 @@ def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { let isCompare = 1 in { -defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0>, "v_cmp_f_f32">; -defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1>, "v_cmp_lt_f32", COND_OLT>; -defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2>, "v_cmp_eq_f32", COND_OEQ>; -defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3>, "v_cmp_le_f32", COND_OLE>; -defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4>, "v_cmp_gt_f32", COND_OGT>; -defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5>, "v_cmp_lg_f32">; -defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6>, "v_cmp_ge_f32", COND_OGE>; -defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7>, "v_cmp_o_f32", COND_O>; -defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8>, "v_cmp_u_f32", COND_UO>; -defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9>, "v_cmp_nge_f32">; 
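The sop1/sop2/sopk wrappers threaded through the hunks above (and the vopc/mubuf wrappers below) are the heart of this patch: each carries a Southern/Sea Islands encoding plus an optional distinct Volcanic Islands encoding, so one defm can emit an instruction for both generations. A minimal standalone sketch of the scheme follows; the field widths, record shapes, and _si/_vi suffixes are illustrative assumptions, not the real SIInstrInfo.td definitions.

// Editor's sketch, not part of the patch. Processable standalone with
// llvm-tblgen; models only the dual-opcode idea.
class sopk_sk<bits<5> si, bits<5> vi = si> {
  bits<5> SI = si;  // encoding on SI/CI
  bits<5> VI = vi;  // encoding on VI; defaults to SI when unchanged
}

class SOPK_Real_sk<string mnemonic, bits<5> op> {
  string Mnemonic = mnemonic;
  bits<5> Opcode = op;
}

// One defm yields a record per generation, each with its own opcode.
multiclass SOPK_32_sk<sopk_sk op, string mnemonic> {
  def _si : SOPK_Real_sk<mnemonic, op.SI>;
  def _vi : SOPK_Real_sk<mnemonic, op.VI>;
}

defm S_MOVK_I32_SK  : SOPK_32_sk<sopk_sk<0x00>, "s_movk_i32">;        // same opcode on VI
defm S_CMOVK_I32_SK : SOPK_32_sk<sopk_sk<0x02, 0x01>, "s_cmovk_i32">; // renumbered on VI

Defaulting the second template argument to the first is what lets the many single-opcode defms in this file (for example sop2<0x00>) stay terse while renumbered instructions spell out both values.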
 //===----------------------------------------------------------------------===//
 // SOPP Instructions
@@ -476,82 +507,84 @@ def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {

 let isCompare = 1 in {

-defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0>, "v_cmp_f_f32">;
-defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1>, "v_cmp_lt_f32", COND_OLT>;
-defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2>, "v_cmp_eq_f32", COND_OEQ>;
-defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3>, "v_cmp_le_f32", COND_OLE>;
-defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4>, "v_cmp_gt_f32", COND_OGT>;
-defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5>, "v_cmp_lg_f32">;
-defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6>, "v_cmp_ge_f32", COND_OGE>;
-defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7>, "v_cmp_o_f32", COND_O>;
-defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8>, "v_cmp_u_f32", COND_UO>;
-defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9>, "v_cmp_nge_f32">;
-defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa>, "v_cmp_nlg_f32">;
-defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb>, "v_cmp_ngt_f32">;
-defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc>, "v_cmp_nle_f32">;
-defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd>, "v_cmp_neq_f32", COND_UNE>;
-defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe>, "v_cmp_nlt_f32">;
-defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf>, "v_cmp_tru_f32">;
+defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0, 0x40>, "v_cmp_f_f32">;
+defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT>;
+defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2, 0x42>, "v_cmp_eq_f32", COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE>;
+defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4, 0x44>, "v_cmp_gt_f32", COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5, 0x45>, "v_cmp_lg_f32", COND_ONE>;
+defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6, 0x46>, "v_cmp_ge_f32", COND_OGE>;
+defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7, 0x47>, "v_cmp_o_f32", COND_O>;
+defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8, 0x48>, "v_cmp_u_f32", COND_UO>;
+defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT>;
+defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa, 0x4a>, "v_cmp_nlg_f32", COND_UEQ>;
+defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE>;
+defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc, 0x4c>, "v_cmp_nle_f32", COND_UGT>;
+defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd, 0x4d>, "v_cmp_neq_f32", COND_UNE>;
+defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe, 0x4e>, "v_cmp_nlt_f32", COND_UGE>;
+defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf, 0x4f>, "v_cmp_tru_f32">;

 let hasSideEffects = 1 in {

-defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10>, "v_cmpx_f_f32">;
-defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11>, "v_cmpx_lt_f32">;
-defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12>, "v_cmpx_eq_f32">;
-defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13>, "v_cmpx_le_f32">;
-defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14>, "v_cmpx_gt_f32">;
-defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15>, "v_cmpx_lg_f32">;
-defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16>, "v_cmpx_ge_f32">;
-defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17>, "v_cmpx_o_f32">;
-defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18>, "v_cmpx_u_f32">;
-defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19>, "v_cmpx_nge_f32">;
-defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a>, "v_cmpx_nlg_f32">;
-defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b>, "v_cmpx_ngt_f32">;
-defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c>, "v_cmpx_nle_f32">;
-defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d>, "v_cmpx_neq_f32">;
-defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e>, "v_cmpx_nlt_f32">;
-defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f>, "v_cmpx_tru_f32">;
+defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10, 0x50>, "v_cmpx_f_f32">;
+defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32">;
+defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12, 0x52>, "v_cmpx_eq_f32">;
+defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32">;
+defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14, 0x54>, "v_cmpx_gt_f32">;
+defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15, 0x55>, "v_cmpx_lg_f32">;
+defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16, 0x56>, "v_cmpx_ge_f32">;
+defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17, 0x57>, "v_cmpx_o_f32">;
+defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18, 0x58>, "v_cmpx_u_f32">;
+defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19, 0x59>, "v_cmpx_nge_f32">;
+defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a, 0x5a>, "v_cmpx_nlg_f32">;
+defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b, 0x5b>, "v_cmpx_ngt_f32">;
+defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c, 0x5c>, "v_cmpx_nle_f32">;
+defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d, 0x5d>, "v_cmpx_neq_f32">;
+defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e, 0x5e>, "v_cmpx_nlt_f32">;
+defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f, 0x5f>, "v_cmpx_tru_f32">;

 } // End hasSideEffects = 1
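Besides the VI opcodes, the f32 hunk above finally attaches selection condition codes to the "negated" compares: under IEEE rules a NaN operand makes every ordered compare false, so "not (a >= b)" is exactly "unordered, or a < b", which is COND_ULT. A standalone sketch of the mapping the patch introduces (the records are illustrative; the COND_* names are the ones used in the hunk):

// Editor's sketch, not part of the patch.
class CmpMap_sk<string mnemonic, string cond> {
  string Mnemonic = mnemonic;
  string CondCode = cond;
}
def MAP_LG_SK  : CmpMap_sk<"v_cmp_lg_f32",  "COND_ONE">; // ordered not-equal
def MAP_NGE_SK : CmpMap_sk<"v_cmp_nge_f32", "COND_ULT">;
def MAP_NLG_SK : CmpMap_sk<"v_cmp_nlg_f32", "COND_UEQ">;
def MAP_NGT_SK : CmpMap_sk<"v_cmp_ngt_f32", "COND_ULE">;
def MAP_NLE_SK : CmpMap_sk<"v_cmp_nle_f32", "COND_UGT">;
def MAP_NLT_SK : CmpMap_sk<"v_cmp_nlt_f32", "COND_UGE">;

The f64 hunk below applies the identical mapping at 64 bits.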

-defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20>, "v_cmp_f_f64">;
-defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21>, "v_cmp_lt_f64", COND_OLT>;
-defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22>, "v_cmp_eq_f64", COND_OEQ>;
-defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23>, "v_cmp_le_f64", COND_OLE>;
-defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24>, "v_cmp_gt_f64", COND_OGT>;
-defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25>, "v_cmp_lg_f64">;
-defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26>, "v_cmp_ge_f64", COND_OGE>;
-defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27>, "v_cmp_o_f64", COND_O>;
-defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28>, "v_cmp_u_f64", COND_UO>;
-defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29>, "v_cmp_nge_f64">;
-defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a>, "v_cmp_nlg_f64">;
-defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b>, "v_cmp_ngt_f64">;
-defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c>, "v_cmp_nle_f64">;
-defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d>, "v_cmp_neq_f64", COND_UNE>;
-defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e>, "v_cmp_nlt_f64">;
-defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f>, "v_cmp_tru_f64">;
+defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20, 0x60>, "v_cmp_f_f64">;
+defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT>;
+defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22, 0x62>, "v_cmp_eq_f64", COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE>;
+defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24, 0x64>, "v_cmp_gt_f64", COND_OGT>;
+defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25, 0x65>, "v_cmp_lg_f64", COND_ONE>;
+defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26, 0x66>, "v_cmp_ge_f64", COND_OGE>;
+defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27, 0x67>, "v_cmp_o_f64", COND_O>;
+defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28, 0x68>, "v_cmp_u_f64", COND_UO>;
+defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT>;
+defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a, 0x6a>, "v_cmp_nlg_f64", COND_UEQ>;
+defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE>;
+defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c, 0x6c>, "v_cmp_nle_f64", COND_UGT>;
+defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d, 0x6d>, "v_cmp_neq_f64", COND_UNE>;
+defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e, 0x6e>, "v_cmp_nlt_f64", COND_UGE>;
+defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f, 0x6f>, "v_cmp_tru_f64">;

 let hasSideEffects = 1 in {

-defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30>, "v_cmpx_f_f64">;
-defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31>, "v_cmpx_lt_f64">;
-defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32>, "v_cmpx_eq_f64">;
-defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33>, "v_cmpx_le_f64">;
-defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34>, "v_cmpx_gt_f64">;
-defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35>, "v_cmpx_lg_f64">;
-defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36>, "v_cmpx_ge_f64">;
-defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37>, "v_cmpx_o_f64">;
-defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38>, "v_cmpx_u_f64">;
-defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39>, "v_cmpx_nge_f64">;
-defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a>, "v_cmpx_nlg_f64">;
-defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b>, "v_cmpx_ngt_f64">;
-defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c>, "v_cmpx_nle_f64">;
-defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d>, "v_cmpx_neq_f64">;
-defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e>, "v_cmpx_nlt_f64">;
-defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f>, "v_cmpx_tru_f64">;
+defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30, 0x70>, "v_cmpx_f_f64">;
+defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64">;
+defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32, 0x72>, "v_cmpx_eq_f64">;
+defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64">;
+defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34, 0x74>, "v_cmpx_gt_f64">;
+defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35, 0x75>, "v_cmpx_lg_f64">;
+defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36, 0x76>, "v_cmpx_ge_f64">;
+defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37, 0x77>, "v_cmpx_o_f64">;
+defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38, 0x78>, "v_cmpx_u_f64">;
+defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64">;
+defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a, 0x7a>, "v_cmpx_nlg_f64">;
+defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64">;
+defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c, 0x7c>, "v_cmpx_nle_f64">;
+defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d, 0x7d>, "v_cmpx_neq_f64">;
+defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e, 0x7e>, "v_cmpx_nlt_f64">;
+defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f, 0x7f>, "v_cmpx_tru_f64">;

 } // End hasSideEffects = 1

+let SubtargetPredicate = isSICI in {
+
 defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">;
 defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32">;
 defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">;
@@ -628,104 +661,106 @@ defm V_CMPSX_TRU_F64 : VOPC_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">;

 } // End hasSideEffects = 1, Defs = [EXEC]

-defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80>, "v_cmp_f_i32">;
-defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81>, "v_cmp_lt_i32", COND_SLT>;
-defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82>, "v_cmp_eq_i32", COND_EQ>;
-defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83>, "v_cmp_le_i32", COND_SLE>;
-defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84>, "v_cmp_gt_i32", COND_SGT>;
-defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85>, "v_cmp_ne_i32", COND_NE>;
-defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86>, "v_cmp_ge_i32", COND_SGE>;
-defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87>, "v_cmp_t_i32">;
+} // End SubtargetPredicate = isSICI
+
+defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80, 0xc0>, "v_cmp_f_i32">;
+defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT>;
+defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82, 0xc2>, "v_cmp_eq_i32", COND_EQ>;
+defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE>;
+defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84, 0xc4>, "v_cmp_gt_i32", COND_SGT>;
+defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85, 0xc5>, "v_cmp_ne_i32", COND_NE>;
+defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86, 0xc6>, "v_cmp_ge_i32", COND_SGE>;
+defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87, 0xc7>, "v_cmp_t_i32">;

 let hasSideEffects = 1 in {

-defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90>, "v_cmpx_f_i32">;
-defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91>, "v_cmpx_lt_i32">;
-defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92>, "v_cmpx_eq_i32">;
-defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93>, "v_cmpx_le_i32">;
-defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94>, "v_cmpx_gt_i32">;
-defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95>, "v_cmpx_ne_i32">;
-defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96>, "v_cmpx_ge_i32">;
-defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97>, "v_cmpx_t_i32">;
+defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90, 0xd0>, "v_cmpx_f_i32">;
+defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32">;
+defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92, 0xd2>, "v_cmpx_eq_i32">;
+defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32">;
+defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94, 0xd4>, "v_cmpx_gt_i32">;
+defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95, 0xd5>, "v_cmpx_ne_i32">;
+defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96, 0xd6>, "v_cmpx_ge_i32">;
+defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97, 0xd7>, "v_cmpx_t_i32">;

 } // End hasSideEffects = 1

-defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0>, "v_cmp_f_i64">;
-defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1>, "v_cmp_lt_i64", COND_SLT>;
-defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2>, "v_cmp_eq_i64", COND_EQ>;
-defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3>, "v_cmp_le_i64", COND_SLE>;
-defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4>, "v_cmp_gt_i64", COND_SGT>;
-defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5>, "v_cmp_ne_i64", COND_NE>;
-defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6>, "v_cmp_ge_i64", COND_SGE>;
-defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7>, "v_cmp_t_i64">;
+defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0, 0xe0>, "v_cmp_f_i64">;
+defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT>;
+defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2, 0xe2>, "v_cmp_eq_i64", COND_EQ>;
+defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE>;
+defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4, 0xe4>, "v_cmp_gt_i64", COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5, 0xe5>, "v_cmp_ne_i64", COND_NE>;
+defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6, 0xe6>, "v_cmp_ge_i64", COND_SGE>;
+defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7, 0xe7>, "v_cmp_t_i64">;

 let hasSideEffects = 1 in {

-defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0>, "v_cmpx_f_i64">;
-defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1>, "v_cmpx_lt_i64">;
-defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2>, "v_cmpx_eq_i64">;
-defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3>, "v_cmpx_le_i64">;
-defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4>, "v_cmpx_gt_i64">;
-defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5>, "v_cmpx_ne_i64">;
-defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6>, "v_cmpx_ge_i64">;
-defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7>, "v_cmpx_t_i64">;
+defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0, 0xf0>, "v_cmpx_f_i64">;
+defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64">;
+defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2, 0xf2>, "v_cmpx_eq_i64">;
+defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64">;
+defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4, 0xf4>, "v_cmpx_gt_i64">;
+defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5, 0xf5>, "v_cmpx_ne_i64">;
+defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6, 0xf6>, "v_cmpx_ge_i64">;
+defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7, 0xf7>, "v_cmpx_t_i64">;

 } // End hasSideEffects = 1

-defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0>, "v_cmp_f_u32">;
-defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1>, "v_cmp_lt_u32", COND_ULT>;
-defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2>, "v_cmp_eq_u32", COND_EQ>;
-defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3>, "v_cmp_le_u32", COND_ULE>;
-defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4>, "v_cmp_gt_u32", COND_UGT>;
-defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5>, "v_cmp_ne_u32", COND_NE>;
-defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6>, "v_cmp_ge_u32", COND_UGE>;
-defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7>, "v_cmp_t_u32">;
+defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0, 0xc8>, "v_cmp_f_u32">;
+defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT>;
+defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2, 0xca>, "v_cmp_eq_u32", COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE>;
+defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4, 0xcc>, "v_cmp_gt_u32", COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5, 0xcd>, "v_cmp_ne_u32", COND_NE>;
+defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6, 0xce>, "v_cmp_ge_u32", COND_UGE>;
+defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7, 0xcf>, "v_cmp_t_u32">;

 let hasSideEffects = 1 in {

-defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0>, "v_cmpx_f_u32">;
-defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1>, "v_cmpx_lt_u32">;
-defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2>, "v_cmpx_eq_u32">;
-defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3>, "v_cmpx_le_u32">;
-defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4>, "v_cmpx_gt_u32">;
-defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5>, "v_cmpx_ne_u32">;
-defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6>, "v_cmpx_ge_u32">;
-defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7>, "v_cmpx_t_u32">;
+defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0, 0xd8>, "v_cmpx_f_u32">;
+defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32">;
+defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2, 0xda>, "v_cmpx_eq_u32">;
+defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32">;
+defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4, 0xdc>, "v_cmpx_gt_u32">;
+defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5, 0xdd>, "v_cmpx_ne_u32">;
+defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6, 0xde>, "v_cmpx_ge_u32">;
+defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7, 0xdf>, "v_cmpx_t_u32">;

 } // End hasSideEffects = 1

-defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0>, "v_cmp_f_u64">;
-defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1>, "v_cmp_lt_u64", COND_ULT>;
-defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2>, "v_cmp_eq_u64", COND_EQ>;
-defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3>, "v_cmp_le_u64", COND_ULE>;
-defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4>, "v_cmp_gt_u64", COND_UGT>;
-defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5>, "v_cmp_ne_u64", COND_NE>;
-defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6>, "v_cmp_ge_u64", COND_UGE>;
-defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7>, "v_cmp_t_u64">;
+defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0, 0xe8>, "v_cmp_f_u64">;
+defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT>;
+defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2, 0xea>, "v_cmp_eq_u64", COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE>;
+defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4, 0xec>, "v_cmp_gt_u64", COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5, 0xed>, "v_cmp_ne_u64", COND_NE>;
+defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6, 0xee>, "v_cmp_ge_u64", COND_UGE>;
+defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7, 0xef>, "v_cmp_t_u64">;

 let hasSideEffects = 1 in {

-defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0>, "v_cmpx_f_u64">;
-defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1>, "v_cmpx_lt_u64">;
-defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2>, "v_cmpx_eq_u64">;
-defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3>, "v_cmpx_le_u64">;
-defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4>, "v_cmpx_gt_u64">;
-defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5>, "v_cmpx_ne_u64">;
-defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6>, "v_cmpx_ge_u64">;
-defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7>, "v_cmpx_t_u64">;
+defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0, 0xf8>, "v_cmpx_f_u64">;
+defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64">;
+defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2, 0xfa>, "v_cmpx_eq_u64">;
+defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64">;
+defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4, 0xfc>, "v_cmpx_gt_u64">;
+defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5, 0xfd>, "v_cmpx_ne_u64">;
+defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6, 0xfe>, "v_cmpx_ge_u64">;
+defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7, 0xff>, "v_cmpx_t_u64">;

 } // End hasSideEffects = 1
"v_cmpx_class_f32">; } // End hasSideEffects = 1 -defm V_CMP_CLASS_F64 : VOPC_F64 <vopc<0xa8>, "v_cmp_class_f64">; +defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <vopc<0xa8, 0x12>, "v_cmp_class_f64">; let hasSideEffects = 1 in { -defm V_CMPX_CLASS_F64 : VOPCX_F64 <vopc<0xb8>, "v_cmpx_class_f64">; +defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">; } // End hasSideEffects = 1 } // End isCompare = 1 @@ -735,88 +770,88 @@ defm V_CMPX_CLASS_F64 : VOPCX_F64 <vopc<0xb8>, "v_cmpx_class_f64">; //===----------------------------------------------------------------------===// -def DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VReg_32>; -def DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VReg_32>; -def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VReg_32>; -def DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VReg_32>; -def DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VReg_32>; -def DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VReg_32>; -def DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VReg_32>; -def DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VReg_32>; -def DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VReg_32>; -def DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VReg_32>; -def DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VReg_32>; -def DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VReg_32>; -def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VReg_32>; -def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VReg_32>; -def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VReg_32>; -def DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VReg_32>; -def DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VReg_32>; - -def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VReg_32, "ds_add_u32">; -def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VReg_32, "ds_sub_u32">; -def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VReg_32, "ds_rsub_u32">; -def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VReg_32, "ds_inc_u32">; -def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VReg_32, "ds_dec_u32">; -def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VReg_32, "ds_min_i32">; -def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VReg_32, "ds_max_i32">; -def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VReg_32, "ds_min_u32">; -def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VReg_32, "ds_max_u32">; -def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VReg_32, "ds_and_b32">; -def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VReg_32, "ds_or_b32">; -def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VReg_32, "ds_xor_b32">; -def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VReg_32, "ds_mskor_b32">; -def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VReg_32>; -//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VReg_32, "ds_wrxchg2_b32">; -//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VReg_32, "ds_wrxchg2st64_b32">; -def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VReg_32, "ds_cmpst_b32">; -def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VReg_32, "ds_cmpst_f32">; -def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VReg_32, "ds_min_f32">; -def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VReg_32, "ds_max_f32">; +defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; +defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; +defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", 
VGPR_32>; +defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; +defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; +defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; +defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; +defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; +defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; +defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; +defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; +defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; +defm DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VGPR_32>; +defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; +defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; +defm DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VGPR_32>; +defm DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VGPR_32>; + +defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; +defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; +defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; +defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; +defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; +defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; +defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; +defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; +defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; +defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; +defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; +defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; +defm DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; +defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; +//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2_b32">; +//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2st64_b32">; +defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; +defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; +defm DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +defm DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; let SubtargetPredicate = isCI in { -def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VReg_32, "ds_wrap_f32">; +defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; } // End isCI -def DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; -def DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; -def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; -def DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; -def DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; -def DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; -def DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; -def DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; -def DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; -def DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; -def DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; -def 
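Each DS atomic above comes in a no-return form and a _RTN_ form that hands back the pre-op value; the trailing string on a _RTN defm names its no-return twin. A plausible reading is that pairing them this way lets the backend fall back to the cheaper non-returning form when the loaded value is dead, but that intent is an assumption based only on the operand visible in this diff. A minimal standalone sketch of the pairing:

// Editor's sketch, not part of the patch; the real multiclasses also
// build encodings and selection patterns.
class DSAtomic_sk<string mnemonic, bit returnsOldValue, string noRetTwin = ""> {
  string Mnemonic = mnemonic;
  bit ReturnsOldValue = returnsOldValue;
  string NoRetOp = noRetTwin;  // empty when this is the no-return form
}
def DS_ADD_U32_SK     : DSAtomic_sk<"ds_add_u32", 0>;
def DS_ADD_RTN_U32_SK : DSAtomic_sk<"ds_add_rtn_u32", 1, "ds_add_u32">;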

-def DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>;
-def DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>;
-def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>;
-def DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>;
-def DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>;
-def DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>;
-def DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>;
-def DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>;
-def DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>;
-def DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>;
-def DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>;
-def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
-def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
-def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
-def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
-def DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>;
-def DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>;
-
-def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">;
-def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
-def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
-def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
-def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
-def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">;
-def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">;
-def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">;
-def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">;
-def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">;
-def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">;
-def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
-def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
-def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">;
+defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>;
+defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>;
+defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>;
+defm DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>;
+defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>;
+defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>;
+defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>;
+defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>;
+defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>;
+defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>;
+defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>;
+defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
+defm DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
+defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
+defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
+defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>;
+defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>;
+
+defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">;
+defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
+defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
+defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
+defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
+defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">;
+defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">;
+defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">;
+defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">;
+defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">;
+defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">;
+defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
+defm DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
+defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">;
 //def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2_b64">;
 //def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2st64_b64">;
-def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
-def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
-def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_f64", VReg_64, "ds_min_f64">;
-def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_f64", VReg_64, "ds_max_f64">;
+defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
+defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
+defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">;
+defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">;

 //let SubtargetPredicate = isCI in {
 // DS_CONDXCHG32_RTN_B64
@@ -825,139 +860,140 @@ def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_f64", VReg_64, "ds_max_f64">;

 // TODO: _SRC2_* forms

-def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VReg_32>;
-def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "ds_write_b8", VReg_32>;
-def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "ds_write_b16", VReg_32>;
-def DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "ds_write_b64", VReg_64>;
+defm DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VGPR_32>;
+defm DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "ds_write_b8", VGPR_32>;
+defm DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "ds_write_b16", VGPR_32>;
+defm DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "ds_write_b64", VReg_64>;

-def DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VReg_32>;
-def DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VReg_32>;
-def DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VReg_32>;
-def DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VReg_32>;
-def DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VReg_32>;
-def DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>;
+defm DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VGPR_32>;
+defm DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VGPR_32>;
+defm DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VGPR_32>;
+defm DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VGPR_32>;
+defm DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VGPR_32>;
+defm DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>;
 // 2 forms.
-def DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VReg_32>;
-def DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VReg_32>;
-def DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>;
-def DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>;
+defm DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VGPR_32>;
+defm DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VGPR_32>;
+defm DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>;
+defm DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>;

-def DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>;
-def DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>;
-def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "ds_read2_b64", VReg_128>;
-def DS_READ2ST64_B64 : DS_Load2_Helper <0x00000076, "ds_read2st64_b64", VReg_128>;
+defm DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>;
+defm DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>;
+defm DS_READ2_B64 : DS_Load2_Helper <0x00000075, "ds_read2_b64", VReg_128>;
+defm DS_READ2ST64_B64 : DS_Load2_Helper <0x00000076, "ds_read2st64_b64", VReg_128>;

 //===----------------------------------------------------------------------===//
 // MUBUF Instructions
 //===----------------------------------------------------------------------===//

-//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "buffer_load_format_x", []>;
-//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "buffer_load_format_xy", []>;
-//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "buffer_load_format_xyz", []>;
-defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "buffer_load_format_xyzw", VReg_128>;
-//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "buffer_store_format_x", []>;
-//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "buffer_store_format_xy", []>;
-//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "buffer_store_format_xyz", []>;
-//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "buffer_store_format_xyzw", []>;
+//def BUFFER_LOAD_FORMAT_X : MUBUF_ <mubuf<0x00>, "buffer_load_format_x", []>;
+//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <mubuf<0x01>, "buffer_load_format_xy", []>;
+//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <mubuf<0x02>, "buffer_load_format_xyz", []>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <mubuf<0x03>, "buffer_load_format_xyzw", VReg_128>;
+//def BUFFER_STORE_FORMAT_X : MUBUF_ <mubuf<0x04>, "buffer_store_format_x", []>;
+//def BUFFER_STORE_FORMAT_XY : MUBUF_ <mubuf<0x05>, "buffer_store_format_xy", []>;
+//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <mubuf<0x06>, "buffer_store_format_xyz", []>;
+//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <mubuf<0x07>, "buffer_store_format_xyzw", []>;

 defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
-  0x00000008, "buffer_load_ubyte", VReg_32, i32, az_extloadi8_global
+  mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global
 >;
 defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
-  0x00000009, "buffer_load_sbyte", VReg_32, i32, sextloadi8_global
+  mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global
 >;
 defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
-  0x0000000a, "buffer_load_ushort", VReg_32, i32, az_extloadi16_global
+  mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global
 >;
 defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
-  0x0000000b, "buffer_load_sshort", VReg_32, i32, sextloadi16_global
+  mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
 >;
 defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
-  0x0000000c, "buffer_load_dword", VReg_32, i32, global_load
+  mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load
 >;
 defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
-  0x0000000d, "buffer_load_dwordx2", VReg_64, v2i32, global_load
+  mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load
 >;
 defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
-  0x0000000e, "buffer_load_dwordx4", VReg_128, v4i32, global_load
+  mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load
 >;

 defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
-  0x00000018, "buffer_store_byte", VReg_32, i32, truncstorei8_global
+  mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global
 >;

 defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
-  0x0000001a, "buffer_store_short", VReg_32, i32, truncstorei16_global
+  mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global
 >;

 defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
-  0x0000001c, "buffer_store_dword", VReg_32, i32, global_store
+  mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store
 >;

 defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
-  0x0000001d, "buffer_store_dwordx2", VReg_64, v2i32, global_store
+  mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store
 >;

 defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
-  0x0000001e, "buffer_store_dwordx4", VReg_128, v4i32, global_store
+  mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store
 >;
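Each MUBUF helper above takes a register class, a value type, and the DAG load or store fragment (az_extloadi8_global, global_load, truncstorei8_global, ...) that drives pattern generation, and the helper fans one mnemonic out across the unit's addressing modes. A standalone sketch of that shape; the two mode suffixes here are an illustrative assumption, not the real multiclass output:

// Editor's sketch, not part of the patch.
class MubufLoad_sk<string mnemonic, string addrMode, string loadFrag> {
  string Mnemonic = mnemonic;
  string AddrMode = addrMode;
  string LoadFrag = loadFrag;  // DAG fragment the generated pattern matches
}
multiclass MubufLoadModes_sk<string mnemonic, string loadFrag> {
  def _OFFSET : MubufLoad_sk<mnemonic, "offset", loadFrag>;
  def _ADDR64 : MubufLoad_sk<mnemonic, "addr64", loadFrag>;
}
defm BUFFER_LOAD_UBYTE_SK : MubufLoadModes_sk<"buffer_load_ubyte", "az_extloadi8_global">;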
-//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "buffer_atomic_swap", []>;
+
 defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
-  0x00000030, "buffer_atomic_swap", VReg_32, i32, atomic_swap_global
+  mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
 >;
-//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "buffer_atomic_cmpswap", []>;
+//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", []>;
 defm BUFFER_ATOMIC_ADD : MUBUF_Atomic <
-  0x00000032, "buffer_atomic_add", VReg_32, i32, atomic_add_global
+  mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global
 >;
 defm BUFFER_ATOMIC_SUB : MUBUF_Atomic <
-  0x00000033, "buffer_atomic_sub", VReg_32, i32, atomic_sub_global
+  mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
 >;
-//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "buffer_atomic_rsub", []>;
+//def BUFFER_ATOMIC_RSUB : MUBUF_ <mubuf<0x34>, "buffer_atomic_rsub", []>; // isn't on CI & VI
 defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic <
-  0x00000035, "buffer_atomic_smin", VReg_32, i32, atomic_min_global
+  mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
 >;
 defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic <
-  0x00000036, "buffer_atomic_umin", VReg_32, i32, atomic_umin_global
+  mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
 >;
 defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic <
-  0x00000037, "buffer_atomic_smax", VReg_32, i32, atomic_max_global
+  mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
 >;
 defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic <
-  0x00000038, "buffer_atomic_umax", VReg_32, i32, atomic_umax_global
+  mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
 >;
 defm BUFFER_ATOMIC_AND : MUBUF_Atomic <
-  0x00000039, "buffer_atomic_and", VReg_32, i32, atomic_and_global
+  mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global
 >;
 defm BUFFER_ATOMIC_OR : MUBUF_Atomic <
-  0x0000003a, "buffer_atomic_or", VReg_32, i32, atomic_or_global
+  mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global
 >;
 defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
-  0x0000003b, "buffer_atomic_xor", VReg_32, i32, atomic_xor_global
->;
-//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "buffer_atomic_inc", []>;
-//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "buffer_atomic_dec", []>;
-//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "buffer_atomic_fcmpswap", []>;
-//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "buffer_atomic_fmin", []>;
-//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "buffer_atomic_fmax", []>;
-//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "buffer_atomic_swap_x2", []>;
-//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "buffer_atomic_cmpswap_x2", []>;
-//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "buffer_atomic_add_x2", []>;
-//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "buffer_atomic_sub_x2", []>;
-//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "buffer_atomic_rsub_x2", []>;
-//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "buffer_atomic_smin_x2", []>;
-//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "buffer_atomic_umin_x2", []>;
-//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "buffer_atomic_smax_x2", []>;
-//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "buffer_atomic_umax_x2", []>;
-//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "buffer_atomic_and_x2", []>;
-//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "buffer_atomic_or_x2", []>;
-//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "buffer_atomic_xor_x2", []>;
-//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "buffer_atomic_inc_x2", []>;
-//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "buffer_atomic_dec_x2", []>;
-//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "buffer_atomic_fcmpswap_x2", []>;
-//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "buffer_atomic_fmin_x2", []>;
-//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "buffer_atomic_fmax_x2", []>;
-//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "buffer_wbinvl1_sc", []>;
-//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "buffer_wbinvl1", []>;
+  mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
+>;
+//def BUFFER_ATOMIC_INC : MUBUF_ <mubuf<0x3c, 0x4b>, "buffer_atomic_inc", []>;
+//def BUFFER_ATOMIC_DEC : MUBUF_ <mubuf<0x3d, 0x4c>, "buffer_atomic_dec", []>;
+//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
+//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>;
+//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>;
+//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>;
+//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>;
+//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
+//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", []>;
+//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", []>;
+//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", []>;
+//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", []>;
+//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <mubuf<0x59, 0x68>, "buffer_atomic_and_x2", []>;
+//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", []>;
+//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", []>;
+//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", []>;
+//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", []>;
+//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI
+//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI
+//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI
+//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI
+//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>;

 //===----------------------------------------------------------------------===//
 // MTBUF Instructions
@@ -967,7 +1003,7 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
 //def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>;
 //def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>;
 defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VReg_32>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>;
 defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>;
 defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>;
 defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>;
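The TBUFFER_STORE_FORMAT_X change just above is one instance of a rename running through the whole patch: every single-register vector operand moves from VReg_32 to VGPR_32, plausibly to separate the class of individual 32-bit VGPRs from the VReg_64/96/128 tuple classes that keep the old naming. A standalone sketch of that split; the register names and tuple shapes are illustrative only, not the real SIRegisterInfo.td:

// Editor's sketch, not part of the patch.
class VRC_sk<string name, int bitWidth, int regsPerTuple> {
  string Name = name;
  int BitWidth = bitWidth;
  int RegsPerTuple = regsPerTuple;
}
def VGPR_32_SK  : VRC_sk<"VGPR_32", 32, 1>;    // v0, v1, ... individually
def VReg_64_SK  : VRC_sk<"VReg_64", 64, 2>;    // v[0:1], v[1:2], ... pairs
def VReg_128_SK : VRC_sk<"VReg_128", 128, 4>;  // v[0:3], ... quads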
@@ -1004,63 +1040,63 @@ defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
 //def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>;
 //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>;
 //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>;
-defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "image_sample">;
-defm IMAGE_SAMPLE_CL : MIMG_Sampler <0x00000021, "image_sample_cl">;
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
 defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
 defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
 defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">;
-defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "image_sample_b">;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler <0x00000026, "image_sample_b_cl">;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">;
 defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">;
-defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "image_sample_c">;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler <0x00000029, "image_sample_c_cl">;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">;
 defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
 defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
 defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "image_sample_c_b">;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler <0x0000002e, "image_sample_c_b_cl">;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">;
 defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
-defm IMAGE_SAMPLE_O : MIMG_Sampler <0x00000030, "image_sample_o">;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler <0x00000031, "image_sample_cl_o">;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">;
 defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">;
 defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
 defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler <0x00000035, "image_sample_b_o">;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler <0x00000036, "image_sample_b_cl_o">;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">;
 defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler <0x00000038, "image_sample_c_o">;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler <0x00000039, "image_sample_c_cl_o">;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">;
 defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
 defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
 defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler <0x0000003d, "image_sample_c_b_o">;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler <0x0000003e, "image_sample_c_b_cl_o">;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">;
 defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
-defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "image_gather4">;
-defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "image_gather4_cl">;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">;
 defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">;
-defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "image_gather4_b">;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "image_gather4_b_cl">;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">;
 defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">;
-defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "image_gather4_c">;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "image_gather4_c_cl">;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">;
 defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
-defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "image_gather4_c_b">;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "image_gather4_c_b_cl">;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">;
 defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
-defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "image_gather4_o">;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "image_gather4_cl_o">;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">;
 defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">;
-defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "image_gather4_b_o">;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">;
 defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
 defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
-defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "image_gather4_c_o">;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "image_gather4_c_cl_o">;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">;
 defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "image_gather4_c_b_o">;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "image_gather4_c_b_cl_o">;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
 defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
-defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "image_get_lod">;
+defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
 defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
 defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
 defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
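There is a pattern to which MIMG ops the hunk above moves into the _WQM multiclasses: samples and gathers that derive their LOD implicitly from pixel-quad derivatives (no _d, _l, or _lz modifier) need all four lanes of the quad alive even when some are logically inactive, so they are flagged for whole quad mode; explicit-derivative and explicit-LOD forms are not. A standalone sketch of that classification (illustrative records only):

// Editor's sketch, not part of the patch.
class Sample_sk<string mnemonic, bit needsWQM> {
  string Mnemonic = mnemonic;
  bit NeedsWQM = needsWQM;
}
def IMAGE_SAMPLE_SK    : Sample_sk<"image_sample",    1>; // implicit LOD
def IMAGE_SAMPLE_B_SK  : Sample_sk<"image_sample_b",  1>; // LOD bias, still implicit
def IMAGE_SAMPLE_D_SK  : Sample_sk<"image_sample_d",  0>; // explicit derivatives
def IMAGE_SAMPLE_L_SK  : Sample_sk<"image_sample_l",  0>; // explicit LOD
def IMAGE_SAMPLE_LZ_SK : Sample_sk<"image_sample_lz", 0>; // LOD forced to zero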
"flat_store_short", VGPR_32 >; def FLAT_STORE_DWORD : FLAT_Store_Helper < - 0x0000001c, "flat_store_dword", VReg_32 + 0x0000001c, "flat_store_dword", VGPR_32 >; def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < @@ -1150,7 +1186,9 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < // VOP1 Instructions //===----------------------------------------------------------------------===// -//def V_NOP : VOP1_ <0x00000000, "v_nop", []>; +let vdst = 0, src0 = 0 in { +defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">; +} let isMoveImm = 1 in { defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>; @@ -1158,16 +1196,20 @@ defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>; let Uses = [EXEC] in { +// FIXME: Specify SchedRW for READFIRSTLANE_B32 + def V_READFIRSTLANE_B32 : VOP1 < 0x00000002, (outs SReg_32:$vdst), - (ins VReg_32:$src0), + (ins VGPR_32:$src0), "v_readfirstlane_b32 $vdst, $src0", [] >; } +let SchedRW = [WriteQuarterRate32] in { + defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64", VOP_I32_F64, fp_to_sint >; @@ -1193,9 +1235,11 @@ defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32", defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16", VOP_F32_I32, f16_to_fp >; -//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "v_cvt_rpi_i32_f32", []>; -//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "v_cvt_flr_i32_f32", []>; -//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "v_cvt_off_f32_i4", []>; +defm V_CVT_RPI_I32_F32 : VOP1Inst <vop1<0xc>, "v_cvt_rpi_i32_f32", + VOP_I32_F32, cvt_rpi_i32_f32>; +defm V_CVT_FLR_I32_F32 : VOP1Inst <vop1<0xd>, "v_cvt_flr_i32_f32", + VOP_I32_F32, cvt_flr_i32_f32>; +defm V_CVT_OFF_F32_I4 : VOP1Inst <vop1<0x0e>, "v_cvt_off_f32_i4", VOP_F32_I32>; defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64", VOP_F32_F64, fround >; @@ -1221,493 +1265,580 @@ defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32", VOP_F64_I32, uint_to_fp >; -defm V_FRACT_F32 : VOP1Inst <vop1<0x20>, "v_fract_f32", +} // let SchedRW = [WriteQuarterRate32] + +defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32", VOP_F32_F32, AMDGPUfract >; -defm V_TRUNC_F32 : VOP1Inst <vop1<0x21>, "v_trunc_f32", +defm V_TRUNC_F32 : VOP1Inst <vop1<0x21, 0x1c>, "v_trunc_f32", VOP_F32_F32, ftrunc >; -defm V_CEIL_F32 : VOP1Inst <vop1<0x22>, "v_ceil_f32", +defm V_CEIL_F32 : VOP1Inst <vop1<0x22, 0x1d>, "v_ceil_f32", VOP_F32_F32, fceil >; -defm V_RNDNE_F32 : VOP1Inst <vop1<0x23>, "v_rndne_f32", +defm V_RNDNE_F32 : VOP1Inst <vop1<0x23, 0x1e>, "v_rndne_f32", VOP_F32_F32, frint >; -defm V_FLOOR_F32 : VOP1Inst <vop1<0x24>, "v_floor_f32", +defm V_FLOOR_F32 : VOP1Inst <vop1<0x24, 0x1f>, "v_floor_f32", VOP_F32_F32, ffloor >; -defm V_EXP_F32 : VOP1Inst <vop1<0x25>, "v_exp_f32", +defm V_EXP_F32 : VOP1Inst <vop1<0x25, 0x20>, "v_exp_f32", VOP_F32_F32, fexp2 >; -defm V_LOG_CLAMP_F32 : VOP1Inst <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>; -defm V_LOG_F32 : VOP1Inst <vop1<0x27>, "v_log_f32", + +let SchedRW = [WriteQuarterRate32] in { + +defm V_LOG_F32 : VOP1Inst <vop1<0x27, 0x21>, "v_log_f32", VOP_F32_F32, flog2 >; - -defm V_RCP_CLAMP_F32 : VOP1Inst <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>; -defm V_RCP_LEGACY_F32 : VOP1Inst <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>; -defm V_RCP_F32 : VOP1Inst <vop1<0x2a>, "v_rcp_f32", +defm V_RCP_F32 : VOP1Inst <vop1<0x2a, 0x22>, "v_rcp_f32", VOP_F32_F32, AMDGPUrcp >; -defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b>, "v_rcp_iflag_f32", VOP_F32_F32>; -defm V_RSQ_CLAMP_F32 : VOP1Inst <vop1<0x2c>, "v_rsq_clamp_f32", - VOP_F32_F32, 
AMDGPUrsq_clamped +defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b, 0x23>, "v_rcp_iflag_f32", + VOP_F32_F32 >; -defm V_RSQ_LEGACY_F32 : VOP1Inst <vop1<0x2d>, "v_rsq_legacy_f32", - VOP_F32_F32, AMDGPUrsq_legacy ->; -defm V_RSQ_F32 : VOP1Inst <vop1<0x2e>, "v_rsq_f32", +defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32", VOP_F32_F32, AMDGPUrsq >; -defm V_RCP_F64 : VOP1Inst <vop1<0x2f>, "v_rcp_f64", + +} //let SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { + +defm V_RCP_F64 : VOP1Inst <vop1<0x2f, 0x25>, "v_rcp_f64", VOP_F64_F64, AMDGPUrcp >; -defm V_RCP_CLAMP_F64 : VOP1Inst <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>; -defm V_RSQ_F64 : VOP1Inst <vop1<0x31>, "v_rsq_f64", +defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64", VOP_F64_F64, AMDGPUrsq >; -defm V_RSQ_CLAMP_F64 : VOP1Inst <vop1<0x32>, "v_rsq_clamp_f64", - VOP_F64_F64, AMDGPUrsq_clamped ->; -defm V_SQRT_F32 : VOP1Inst <vop1<0x33>, "v_sqrt_f32", + +} // let SchedRW = [WriteDouble]; + +defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32", VOP_F32_F32, fsqrt >; -defm V_SQRT_F64 : VOP1Inst <vop1<0x34>, "v_sqrt_f64", + +let SchedRW = [WriteDouble] in { + +defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64", VOP_F64_F64, fsqrt >; -defm V_SIN_F32 : VOP1Inst <vop1<0x35>, "v_sin_f32", + +} // let SchedRW = [WriteDouble] + +defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", VOP_F32_F32, AMDGPUsin >; -defm V_COS_F32 : VOP1Inst <vop1<0x36>, "v_cos_f32", +defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32", VOP_F32_F32, AMDGPUcos >; -defm V_NOT_B32 : VOP1Inst <vop1<0x37>, "v_not_b32", VOP_I32_I32>; -defm V_BFREV_B32 : VOP1Inst <vop1<0x38>, "v_bfrev_b32", VOP_I32_I32>; -defm V_FFBH_U32 : VOP1Inst <vop1<0x39>, "v_ffbh_u32", VOP_I32_I32>; -defm V_FFBL_B32 : VOP1Inst <vop1<0x3a>, "v_ffbl_b32", VOP_I32_I32>; -defm V_FFBH_I32 : VOP1Inst <vop1<0x3b>, "v_ffbh_i32", VOP_I32_I32>; -//defm V_FREXP_EXP_I32_F64 : VOPInst <0x0000003c, "v_frexp_exp_i32_f64", VOP_I32_F32>; -defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d>, "v_frexp_mant_f64", VOP_F64_F64>; -defm V_FRACT_F64 : VOP1Inst <vop1<0x3e>, "v_fract_f64", VOP_F64_F64>; -//defm V_FREXP_EXP_I32_F32 : VOPInst <0x0000003f, "v_frexp_exp_i32_f32", VOP_I32_F32>; -defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40>, "v_frexp_mant_f32", VOP_F32_F32>; -//def V_CLREXCP : VOP1_ <0x00000041, "v_clrexcp", []>; -defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42>, "v_movreld_b32", VOP_I32_I32>; -defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43>, "v_movrels_b32", VOP_I32_I32>; -defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44>, "v_movrelsd_b32", VOP_I32_I32>; +defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>; +defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>; +defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; +defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>; +defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>; +defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64", + VOP_I32_F64 +>; +defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", + VOP_F64_F64 +>; +defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>; +defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32", + VOP_I32_F32 +>; +defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", + VOP_F32_F32 +>; +let vdst = 0, src0 = 0 in { +defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [], + "v_clrexcp" +>; +} +defm 
V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>; +defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>; +defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>; + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +let SchedRW = [WriteQuarterRate32] in { + +defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>; +defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>; +defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>; +defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32", + VOP_F32_F32, AMDGPUrsq_clamped +>; +defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32", + VOP_F32_F32, AMDGPUrsq_legacy +>; + +} // End let SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { + +defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>; +defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", + VOP_F64_F64, AMDGPUrsq_clamped +>; + +} // End SchedRW = [WriteDouble] +} // End SubtargetPredicate = isSICI //===----------------------------------------------------------------------===// // VINTRP Instructions //===----------------------------------------------------------------------===// -def V_INTERP_P1_F32 : VINTRP < - 0x00000000, - (outs VReg_32:$dst), - (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), +// FIXME: Specify SchedRW for VINTRP instructions. +defm V_INTERP_P1_F32 : VINTRP_m < + 0x00000000, "v_interp_p1_f32", + (outs VGPR_32:$dst), + (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [$m0]", - []> { - let DisableEncoding = "$m0"; -} + "$m0">; -def V_INTERP_P2_F32 : VINTRP < - 0x00000001, - (outs VReg_32:$dst), - (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), +defm V_INTERP_P2_F32 : VINTRP_m < + 0x00000001, "v_interp_p2_f32", + (outs VGPR_32:$dst), + (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]", - []> { - - let Constraints = "$src0 = $dst"; - let DisableEncoding = "$src0,$m0"; + "$src0,$m0", + "$src0 = $dst">; -} - -def V_INTERP_MOV_F32 : VINTRP < - 0x00000002, - (outs VReg_32:$dst), +defm V_INTERP_MOV_F32 : VINTRP_m < + 0x00000002, "v_interp_mov_f32", + (outs VGPR_32:$dst), (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [$m0]", - []> { - let DisableEncoding = "$m0"; -} + "$m0">; //===----------------------------------------------------------------------===// // VOP2 Instructions //===----------------------------------------------------------------------===// -def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), - (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), - "v_cndmask_b32_e32 $dst, $src0, $src1, [$vcc]", - [] ->{ - let DisableEncoding = "$vcc"; -} - -def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), +defm V_CNDMASK_B32_e64 : VOP3_m_nomods <vop3<0x100>, (outs VGPR_32:$dst), (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2), "v_cndmask_b32_e64 $dst, $src0, $src1, $src2", - [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))] -> { - let src0_modifiers = 0; - let src1_modifiers = 0; - let src2_modifiers = 0; -} - -def V_READLANE_B32 : VOP2 < - 0x00000001, - (outs SReg_32:$vdst), - (ins VReg_32:$src0, SSrc_32:$vsrc1), - "v_readlane_b32 $vdst,
$src0, $vsrc1", - [] + [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))], + "v_cndmask_b32_e64", 3 >; -def V_WRITELANE_B32 : VOP2 < - 0x00000002, - (outs VReg_32:$vdst), - (ins SReg_32:$src0, SSrc_32:$vsrc1), - "v_writelane_b32 $vdst, $src0, $vsrc1", - [] ->; let isCommutable = 1 in { -defm V_ADD_F32 : VOP2Inst <vop2<0x3>, "v_add_f32", +defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32", VOP_F32_F32_F32, fadd >; -defm V_SUB_F32 : VOP2Inst <vop2<0x4>, "v_sub_f32", VOP_F32_F32_F32, fsub>; -defm V_SUBREV_F32 : VOP2Inst <vop2<0x5>, "v_subrev_f32", +defm V_SUB_F32 : VOP2Inst <vop2<0x4, 0x2>, "v_sub_f32", VOP_F32_F32_F32, fsub>; +defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32" >; } // End isCommutable = 1 let isCommutable = 1 in { -defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32", - VOP_F32_F32_F32 ->; - -defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7>, "v_mul_legacy_f32", +defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32", VOP_F32_F32_F32, int_AMDGPU_mul >; -defm V_MUL_F32 : VOP2Inst <vop2<0x8>, "v_mul_f32", +defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32", VOP_F32_F32_F32, fmul >; -defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9>, "v_mul_i32_i24", +defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9, 0x6>, "v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24 >; -//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "v_mul_hi_i32_i24", []>; -defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb>, "v_mul_u32_u24", - VOP_I32_I32_I32, AMDGPUmul_u24 ->; -//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "v_mul_hi_u32_u24", []>; - -defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32", - VOP_F32_F32_F32, AMDGPUfmin_legacy +defm V_MUL_HI_I32_I24 : VOP2Inst <vop2<0xa,0x7>, "v_mul_hi_i32_i24", + VOP_I32_I32_I32 >; -defm V_MAX_LEGACY_F32 : VOP2Inst <vop2<0xe>, "v_max_legacy_f32", - VOP_F32_F32_F32, AMDGPUfmax_legacy +defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb, 0x8>, "v_mul_u32_u24", + VOP_I32_I32_I32, AMDGPUmul_u24 >; -defm V_MIN_F32 : VOP2Inst <vop2<0xf>, "v_min_f32", VOP_F32_F32_F32, fminnum>; -defm V_MAX_F32 : VOP2Inst <vop2<0x10>, "v_max_f32", VOP_F32_F32_F32, fmaxnum>; -defm V_MIN_I32 : VOP2Inst <vop2<0x11>, "v_min_i32", VOP_I32_I32_I32, AMDGPUsmin>; -defm V_MAX_I32 : VOP2Inst <vop2<0x12>, "v_max_i32", VOP_I32_I32_I32, AMDGPUsmax>; -defm V_MIN_U32 : VOP2Inst <vop2<0x13>, "v_min_u32", VOP_I32_I32_I32, AMDGPUumin>; -defm V_MAX_U32 : VOP2Inst <vop2<0x14>, "v_max_u32", VOP_I32_I32_I32, AMDGPUumax>; +defm V_MUL_HI_U32_U24 : VOP2Inst <vop2<0xc,0x9>, "v_mul_hi_u32_u24", + VOP_I32_I32_I32 +>; -defm V_LSHR_B32 : VOP2Inst <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32, srl>; +defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32, + fminnum>; +defm V_MAX_F32 : VOP2Inst <vop2<0x10, 0xb>, "v_max_f32", VOP_F32_F32_F32, + fmaxnum>; +defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32>; +defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32>; +defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32>; +defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32>; defm V_LSHRREV_B32 : VOP2Inst < - vop2<0x16>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32" + vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, + "v_lshr_b32" >; -defm V_ASHR_I32 : VOP2Inst <vop2<0x17>, "v_ashr_i32", - VOP_I32_I32_I32, sra ->; defm V_ASHRREV_I32 : VOP2Inst < - vop2<0x18>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32" + vop2<0x18, 0x11>, "v_ashrrev_i32", 
VOP_I32_I32_I32, null_frag, + "v_ashr_i32" >; -let hasPostISelHook = 1 in { - -defm V_LSHL_B32 : VOP2Inst <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32, shl>; - -} defm V_LSHLREV_B32 : VOP2Inst < - vop2<0x1a>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32" + vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, + "v_lshl_b32" >; -defm V_AND_B32 : VOP2Inst <vop2<0x1b>, "v_and_b32", - VOP_I32_I32_I32, and>; -defm V_OR_B32 : VOP2Inst <vop2<0x1c>, "v_or_b32", - VOP_I32_I32_I32, or ->; -defm V_XOR_B32 : VOP2Inst <vop2<0x1d>, "v_xor_b32", - VOP_I32_I32_I32, xor ->; - -} // End isCommutable = 1 - -defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32", - VOP_I32_I32_I32, AMDGPUbfm>; +defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>; +defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>; +defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>; -let isCommutable = 1 in { -defm V_MAC_F32 : VOP2Inst <vop2<0x1f>, "v_mac_f32", VOP_F32_F32_F32>; +defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>; } // End isCommutable = 1 -defm V_MADMK_F32 : VOP2Inst <vop2<0x20>, "v_madmk_f32", VOP_F32_F32_F32>; +defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">; let isCommutable = 1 in { -defm V_MADAK_F32 : VOP2Inst <vop2<0x21>, "v_madak_f32", VOP_F32_F32_F32>; +defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">; } // End isCommutable = 1 - -defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>; -defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32", - - VOP_I32_I32_I32 ->; -defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32", - VOP_I32_I32_I32 ->; - let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. + +// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI, +// but the VI instructions behave the same as the SI versions.
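To make the carry chain these VOP2b definitions describe concrete (v_add_i32 writes its carry-out to VCC, v_addc_u32 consumes VCC as carry-in), here is a minimal standalone C++ sketch of how a 64-bit add decomposes into the pair defined just below; the struct and helper names are illustrative only, not backend API:

#include <cstdint>

// Split a 64-bit add into the 32-bit carry chain modeled by the VOP2b
// definitions: a low add producing a carry-out, a high add consuming it.
struct Add64Result { uint32_t Lo, Hi; bool CarryOut; };

Add64Result add64ViaCarryChain(uint64_t A, uint64_t B) {
  uint32_t Lo = uint32_t(A) + uint32_t(B);  // v_add_i32: carry-out -> VCC
  bool Vcc = Lo < uint32_t(A);              // unsigned overflow of the low half
  uint32_t Hi = uint32_t(A >> 32) + uint32_t(B >> 32) + Vcc; // v_addc_u32: VCC carry-in
  return {Lo, Hi, Vcc};
}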
+defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32", VOP_I32_I32_I32, add >; -defm V_SUB_I32 : VOP2bInst <vop2<0x26>, "v_sub_i32", - VOP_I32_I32_I32, sub ->; -defm V_SUBREV_I32 : VOP2bInst <vop2<0x27>, "v_subrev_i32", +defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>; + +defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", VOP_I32_I32_I32, null_frag, "v_sub_i32" >; let Uses = [VCC] in { // Carry-in comes from VCC -defm V_ADDC_U32 : VOP2bInst <vop2<0x28>, "v_addc_u32", - VOP_I32_I32_I32_VCC, adde +defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32", + VOP_I32_I32_I32_VCC >; -defm V_SUBB_U32 : VOP2bInst <vop2<0x29>, "v_subb_u32", - VOP_I32_I32_I32_VCC, sube +defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32", + VOP_I32_I32_I32_VCC >; -defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a>, "v_subbrev_u32", +defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" >; } // End Uses = [VCC] } // End isCommutable = 1, Defs = [VCC] -defm V_LDEXP_F32 : VOP2Inst <vop2<0x2b>, "v_ldexp_f32", +defm V_READLANE_B32 : VOP2SI_3VI_m < + vop3 <0x001, 0x289>, + "v_readlane_b32", + (outs SReg_32:$vdst), + (ins VGPR_32:$src0, SCSrc_32:$src1), + "v_readlane_b32 $vdst, $src0, $src1" +>; + +defm V_WRITELANE_B32 : VOP2SI_3VI_m < + vop3 <0x002, 0x28a>, + "v_writelane_b32", + (outs VGPR_32:$vdst), + (ins SReg_32:$src0, SCSrc_32:$src1), + "v_writelane_b32 $vdst, $src0, $src1" +>; + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +defm V_MIN_LEGACY_F32 : VOP2InstSI <vop2<0xd>, "v_min_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmin_legacy +>; +defm V_MAX_LEGACY_F32 : VOP2InstSI <vop2<0xe>, "v_max_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmax_legacy +>; + +let isCommutable = 1 in { +defm V_LSHR_B32 : VOP2InstSI <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32>; +defm V_ASHR_I32 : VOP2InstSI <vop2<0x17>, "v_ashr_i32", VOP_I32_I32_I32>; +defm V_LSHL_B32 : VOP2InstSI <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32>; +} // End isCommutable = 1 +} // End let SubtargetPredicate = SICI + +let isCommutable = 1 in { +defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst <vop23<0x6, 0x28e>, "v_mac_legacy_f32", + VOP_F32_F32_F32 +>; +} // End isCommutable = 1 + +defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", VOP_I32_I32_I32, + AMDGPUbfm +>; +defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32", + VOP_I32_I32_I32 +>; +defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32", + VOP_I32_I32_I32 +>; +defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32", + VOP_I32_I32_I32 +>; +defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp >; -////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "v_cvt_pkaccum_u8_f32", []>; -////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "v_cvt_pknorm_i16_f32", []>; -////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "v_cvt_pknorm_u16_f32", []>; -defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <vop2<0x2f>, "v_cvt_pkrtz_f16_f32", - VOP_I32_F32_F32, int_SI_packf16 + + +defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst <vop23<0x2c, 0x1f0>, "v_cvt_pkaccum_u8_f32", + VOP_I32_F32_I32>; // TODO: set "Uses = dst" + +defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst <vop23<0x2d, 0x294>, "v_cvt_pknorm_i16_f32", + VOP_I32_F32_F32 +>; +defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst <vop23<0x2e, 0x295>, "v_cvt_pknorm_u16_f32", + VOP_I32_F32_F32 +>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 
0x296>, "v_cvt_pkrtz_f16_f32", + VOP_I32_F32_F32, int_SI_packf16 +>; +defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst <vop23<0x30, 0x297>, "v_cvt_pk_u16_u32", + VOP_I32_I32_I32 +>; +defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst <vop23<0x31, 0x298>, "v_cvt_pk_i16_i32", + VOP_I32_I32_I32 >; -////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "v_cvt_pk_u16_u32", []>; -////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "v_cvt_pk_i16_i32", []>; //===----------------------------------------------------------------------===// // VOP3 Instructions //===----------------------------------------------------------------------===// let isCommutable = 1 in { -defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140>, "v_mad_legacy_f32", +defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140, 0x1c0>, "v_mad_legacy_f32", VOP_F32_F32_F32_F32 >; -defm V_MAD_F32 : VOP3Inst <vop3<0x141>, "v_mad_f32", +defm V_MAD_F32 : VOP3Inst <vop3<0x141, 0x1c1>, "v_mad_f32", VOP_F32_F32_F32_F32, fmad >; -defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142>, "v_mad_i32_i24", +defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142, 0x1c2>, "v_mad_i32_i24", VOP_I32_I32_I32_I32, AMDGPUmad_i24 >; -defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143>, "v_mad_u32_u24", +defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24", VOP_I32_I32_I32_I32, AMDGPUmad_u24 >; } // End isCommutable = 1 -defm V_CUBEID_F32 : VOP3Inst <vop3<0x144>, "v_cubeid_f32", +defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32", VOP_F32_F32_F32_F32 >; -defm V_CUBESC_F32 : VOP3Inst <vop3<0x145>, "v_cubesc_f32", +defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32", VOP_F32_F32_F32_F32 >; -defm V_CUBETC_F32 : VOP3Inst <vop3<0x146>, "v_cubetc_f32", +defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32", VOP_F32_F32_F32_F32 >; -defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147>, "v_cubema_f32", +defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32", VOP_F32_F32_F32_F32 >; -defm V_BFE_U32 : VOP3Inst <vop3<0x148>, "v_bfe_u32", + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32", VOP_I32_I32_I32_I32, AMDGPUbfe_u32 >; -defm V_BFE_I32 : VOP3Inst <vop3<0x149>, "v_bfe_i32", +defm V_BFE_I32 : VOP3Inst <vop3<0x149, 0x1c9>, "v_bfe_i32", VOP_I32_I32_I32_I32, AMDGPUbfe_i32 >; -defm V_BFI_B32 : VOP3Inst <vop3<0x14a>, "v_bfi_b32", +} + +defm V_BFI_B32 : VOP3Inst <vop3<0x14a, 0x1ca>, "v_bfi_b32", VOP_I32_I32_I32_I32, AMDGPUbfi >; let isCommutable = 1 in { -defm V_FMA_F32 : VOP3Inst <vop3<0x14b>, "v_fma_f32", +defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32", VOP_F32_F32_F32_F32, fma >; -defm V_FMA_F64 : VOP3Inst <vop3<0x14c>, "v_fma_f64", +defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 0x1cc>, "v_fma_f64", VOP_F64_F64_F64_F64, fma >; } // End isCommutable = 1 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; -defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "v_alignbit_b32", +defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e, 0x1ce>, "v_alignbit_b32", VOP_I32_I32_I32_I32 >; -defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f>, "v_alignbyte_b32", +defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32", VOP_I32_I32_I32_I32 >; -defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32", - VOP_F32_F32_F32_F32>; -defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "v_min3_f32", + +defm V_MIN3_F32 : VOP3Inst <vop3<0x151, 0x1d0>, "v_min3_f32", VOP_F32_F32_F32_F32, AMDGPUfmin3>; -defm V_MIN3_I32 : VOP3Inst <vop3<0x152>, "v_min3_i32", +defm V_MIN3_I32 : VOP3Inst <vop3<0x152, 0x1d1>, "v_min3_i32", VOP_I32_I32_I32_I32, AMDGPUsmin3 >; -defm V_MIN3_U32 : 
VOP3Inst <vop3<0x153>, "v_min3_u32", +defm V_MIN3_U32 : VOP3Inst <vop3<0x153, 0x1d2>, "v_min3_u32", VOP_I32_I32_I32_I32, AMDGPUumin3 >; -defm V_MAX3_F32 : VOP3Inst <vop3<0x154>, "v_max3_f32", +defm V_MAX3_F32 : VOP3Inst <vop3<0x154, 0x1d3>, "v_max3_f32", VOP_F32_F32_F32_F32, AMDGPUfmax3 >; -defm V_MAX3_I32 : VOP3Inst <vop3<0x155>, "v_max3_i32", +defm V_MAX3_I32 : VOP3Inst <vop3<0x155, 0x1d4>, "v_max3_i32", VOP_I32_I32_I32_I32, AMDGPUsmax3 >; -defm V_MAX3_U32 : VOP3Inst <vop3<0x156>, "v_max3_u32", +defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32", VOP_I32_I32_I32_I32, AMDGPUumax3 >; -//def V_MED3_F32 : VOP3_MED3 <0x00000157, "v_med3_f32", []>; -//def V_MED3_I32 : VOP3_MED3 <0x00000158, "v_med3_i32", []>; -//def V_MED3_U32 : VOP3_MED3 <0x00000159, "v_med3_u32", []>; +defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32", + VOP_F32_F32_F32_F32 +>; +defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32", + VOP_I32_I32_I32_I32 +>; +defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32", + VOP_I32_I32_I32_I32 +>; + //def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; //def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; //def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; -defm V_SAD_U32 : VOP3Inst <vop3<0x15d>, "v_sad_u32", +defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32", VOP_I32_I32_I32_I32 >; ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; defm V_DIV_FIXUP_F32 : VOP3Inst < - vop3<0x15f>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup + vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; + +let SchedRW = [WriteDouble] in { + defm V_DIV_FIXUP_F64 : VOP3Inst < - vop3<0x160>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup + vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup >; -defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", - VOP_I64_I64_I32, shl ->; -defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", - VOP_I64_I64_I32, srl ->; -defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", - VOP_I64_I64_I32, sra ->; +} // let SchedRW = [WriteDouble] +let SchedRW = [WriteDouble] in { let isCommutable = 1 in { -defm V_ADD_F64 : VOP3Inst <vop3<0x164>, "v_add_f64", +defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64", VOP_F64_F64_F64, fadd >; -defm V_MUL_F64 : VOP3Inst <vop3<0x165>, "v_mul_f64", +defm V_MUL_F64 : VOP3Inst <vop3<0x165, 0x281>, "v_mul_f64", VOP_F64_F64_F64, fmul >; -defm V_MIN_F64 : VOP3Inst <vop3<0x166>, "v_min_f64", +defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64", VOP_F64_F64_F64, fminnum >; -defm V_MAX_F64 : VOP3Inst <vop3<0x167>, "v_max_f64", +defm V_MAX_F64 : VOP3Inst <vop3<0x167, 0x283>, "v_max_f64", VOP_F64_F64_F64, fmaxnum >; } // isCommutable = 1 -defm V_LDEXP_F64 : VOP3Inst <vop3<0x168>, "v_ldexp_f64", +defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64", VOP_F64_F64_I32, AMDGPUldexp >; -let isCommutable = 1 in { +} // let SchedRW = [WriteDouble] + +let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { -defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169>, "v_mul_lo_u32", +defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32", VOP_I32_I32_I32 >; -defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a>, "v_mul_hi_u32", +defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32", VOP_I32_I32_I32 >; -defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b>, "v_mul_lo_i32", + +defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32", VOP_I32_I32_I32 >; -defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c>, 
"v_mul_hi_i32", +defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", VOP_I32_I32_I32 >; -} // isCommutable = 1 +} // isCommutable = 1, SchedRW = [WriteQuarterRate32] -defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d>, "v_div_scale_f32", []>; +let SchedRW = [WriteFloatFMA, WriteSALU] in { +defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>; +} +let SchedRW = [WriteDouble, WriteSALU] in { // Double precision division pre-scale. -defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "v_div_scale_f64", []>; +defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>; +} // let SchedRW = [WriteDouble] -let isCommutable = 1 in { -defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "v_div_fmas_f32", +let isCommutable = 1, Uses = [VCC] in { + +// v_div_fmas_f32: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^32 +// +defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; -defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "v_div_fmas_f64", + +let SchedRW = [WriteDouble] in { +// v_div_fmas_f64: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^64 +// +defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fmas >; + +} // End SchedRW = [WriteDouble] } // End isCommutable = 1 //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; +let SchedRW = [WriteDouble] in { defm V_TRIG_PREOP_F64 : VOP3Inst < - vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop + vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop >; -//===----------------------------------------------------------------------===// -// Pseudo Instructions -//===----------------------------------------------------------------------===// +} // let SchedRW = [WriteDouble] -let isCodeGenOnly = 1, isPseudo = 1 in { +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { -def V_MOV_I1 : InstSI < - (outs VReg_1:$dst), - (ins i1imm:$src), - "", [(set i1:$dst, (imm:$src))] ->; +defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", VOP_I64_I64_I32>; +defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", VOP_I64_I64_I32>; +defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", VOP_I64_I64_I32>; -def V_AND_I1 : InstSI < - (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", - [(set i1:$dst, (and i1:$src0, i1:$src1))] ->; +defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32", + VOP_F32_F32_F32_F32>; -def V_OR_I1 : InstSI < - (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", - [(set i1:$dst, (or i1:$src0, i1:$src1))] ->; +} // End SubtargetPredicate = isSICI + +let SubtargetPredicate = isVI in { -def V_XOR_I1 : InstSI < - (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", - [(set i1:$dst, (xor i1:$src0, i1:$src1))] +defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64", + VOP_I64_I32_I64 +>; +defm V_LSHRREV_B64 : VOP3Inst <vop3<0, 0x290>, "v_lshrrev_b64", + VOP_I64_I32_I64 +>; +defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64", + VOP_I64_I32_I64 >; +} // End SubtargetPredicate = isVI + +//===----------------------------------------------------------------------===// +// Pseudo Instructions +//===----------------------------------------------------------------------===// +let isCodeGenOnly = 1, isPseudo = 1 in { + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { 
+// 64-bit vector move instruction. This is mainly used by the SIFoldOperands +// pass to enable folding of inline immediates. +def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; +} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 + let hasSideEffects = 1 in { def SGPR_USE : InstSI <(outs),(ins), "", []>; } @@ -1785,12 +1916,12 @@ def SI_KILL : InstSI < let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { -//defm SI_ : RegisterLoadStore <VReg_32, FRAMEri, ADDRIndirect>; +//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>; let UseNamedOperandTable = 1 in { def SI_RegisterLoad : InstSI < - (outs VReg_32:$dst, SReg_64:$temp), + (outs VGPR_32:$dst, SReg_64:$temp), (ins FRAMEri32:$addr, i32imm:$chan), "", [] > { @@ -1800,7 +1931,7 @@ def SI_RegisterLoad : InstSI < class SIRegStore<dag outs> : InstSI < outs, - (ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan), + (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), "", [] > { let isRegisterStore = 1; @@ -1816,7 +1947,7 @@ def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; } // End UseNamedOperandTable = 1 def SI_INDIRECT_SRC : InstSI < - (outs VReg_32:$dst, SReg_64:$temp), + (outs VGPR_32:$dst, SReg_64:$temp), (ins unknown:$src, VSrc_32:$idx, i32imm:$off), "si_indirect_src $dst, $temp, $src, $idx, $off", [] @@ -1824,14 +1955,14 @@ def SI_INDIRECT_SRC : InstSI < class SI_INDIRECT_DST<RegisterClass rc> : InstSI < (outs rc:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val), + (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val), "si_indirect_dst $dst, $temp, $src, $idx, $off, $val", [] > { let Constraints = "$src = $dst"; } -def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VReg_32>; +def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; @@ -1839,31 +1970,22 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; } // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0] -let usesCustomInserter = 1 in { - -def V_SUB_F64 : InstSI < - (outs VReg_64:$dst), - (ins VReg_64:$src0, VReg_64:$src1), - "v_sub_f64 $dst, $src0, $src1", - [(set f64:$dst, (fsub f64:$src0, f64:$src1))] ->; - -} // end usesCustomInserter - multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { - def _SAVE : InstSI < - (outs), - (ins sgpr_class:$src, i32imm:$frame_idx), - "", [] - >; - - def _RESTORE : InstSI < - (outs sgpr_class:$dst), - (ins i32imm:$frame_idx), - "", [] - >; - + let UseNamedOperandTable = 1 in { + def _SAVE : InstSI < + (outs), + (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, + SReg_32:$scratch_offset), + "", [] + >; + + def _RESTORE : InstSI < + (outs sgpr_class:$dst), + (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + "", [] + >; + } // End UseNamedOperandTable = 1 } defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>; @@ -1873,20 +1995,23 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { - def _SAVE : InstSI < - (outs), - (ins vgpr_class:$src, i32imm:$frame_idx), - "", [] - >; - - def _RESTORE : InstSI < - (outs vgpr_class:$dst), - (ins i32imm:$frame_idx), - "", [] - >; + let UseNamedOperandTable = 1 in { + def _SAVE : InstSI < + (outs), + (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, + SReg_32:$scratch_offset), + "", [] + >; + + def _RESTORE : InstSI < + (outs vgpr_class:$dst), + (ins 
i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + "", [] + >; + } // End UseNamedOperandTable = 1 } -defm SI_SPILL_V32 : SI_SPILL_VGPR <VReg_32>; +defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; @@ -1905,9 +2030,9 @@ def SI_CONSTDATA_PTR : InstSI < } // end IsCodeGenOnly, isPseudo -} // end SubtargetPredicate = SI +} // end SubtargetPredicate = isGCN -let Predicates = [isSI] in { +let Predicates = [isGCN] in { def : Pat< (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), @@ -1941,7 +2066,7 @@ def : Pat < multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { - // 1. Offset as 8bit DWORD immediate + // 1. SI-CI: Offset as 8bit DWORD immediate def : Pat < (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) @@ -1960,6 +2085,28 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { >; } +multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { + + // 1. VI: Offset as 20bit immediate in bytes + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), + (vt (Instr_IMM $sbase, (as_i32imm $offset))) + >; + + // 2. Offset loaded in a 32bit SGPR + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), + (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + >; + + // 3. No offset at all + def : Pat < + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) + >; +} + +let Predicates = [isSICI] in { defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; @@ -1967,6 +2114,19 @@ defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; +} // End Predicates = [isSICI] + +let Predicates = [isVI] in { +defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; +} // End Predicates = [isVI] + +let Predicates = [isSICI] in { // 1. Offset as 8bit DWORD immediate def : Pat < @@ -1974,14 +2134,14 @@ def : Pat < (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) >; +} // End Predicates = [isSICI] + // 2.
Offset loaded in a 32bit SGPR def : Pat < (SIload_constant v4i32:$sbase, imm:$offset), (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) >; -} // Predicates = [isSI] in { - //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// @@ -2004,8 +2164,6 @@ def : Pat < (S_ADD_U32 $src0, $src1) >; -let Predicates = [isSI] in { - //===----------------------------------------------------------------------===// // SOPP Patterns //===----------------------------------------------------------------------===// @@ -2020,9 +2178,13 @@ def : Pat < //===----------------------------------------------------------------------===// let Predicates = [UnsafeFPMath] in { -def : RcpPat<V_RCP_F64_e32, f64>; -defm : RsqPat<V_RSQ_F64_e32, f64>; -defm : RsqPat<V_RSQ_F32_e32, f32>; + +//def : RcpPat<V_RCP_F64_e32, f64>; +//defm : RsqPat<V_RSQ_F64_e32, f64>; +//defm : RsqPat<V_RSQ_F32_e32, f32>; + +def : RsqPat<V_RSQ_F32_e32, f32>; +def : RsqPat<V_RSQ_F64_e32, f64>; } //===----------------------------------------------------------------------===// @@ -2369,10 +2531,10 @@ foreach Index = 0-15 in { } def : BitConvert <i32, f32, SReg_32>; -def : BitConvert <i32, f32, VReg_32>; +def : BitConvert <i32, f32, VGPR_32>; def : BitConvert <f32, i32, SReg_32>; -def : BitConvert <f32, i32, VReg_32>; +def : BitConvert <f32, i32, VGPR_32>; def : BitConvert <i64, f64, VReg_64>; @@ -2475,7 +2637,7 @@ def : Pat < def : Pat < (SGPRImm<(f32 fpimm)>:$imm), - (S_MOV_B32 fpimm:$imm) + (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) >; def : Pat < @@ -2485,7 +2647,7 @@ def : Pat < def : Pat < (f32 fpimm:$imm), - (V_MOV_B32_e32 fpimm:$imm) + (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) >; def : Pat < @@ -2493,21 +2655,38 @@ def : Pat < (S_MOV_B64 InlineImm<i64>:$imm) >; +// XXX - Should this use an s_cmp to set SCC? + +// Set to sign-extended 64-bit value (true = -1, false = 0) +def : Pat < + (i1 imm:$imm), + (S_MOV_B64 (i64 (as_i64imm $imm))) +>; + +def : Pat < + (f64 InlineFPImm<f64>:$imm), + (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm))) +>; + /********** ===================== **********/ /********** Interpolation Patterns **********/ /********** ===================== **********/ +// The value of $params is constant throughout the entire kernel. +// We need to use S_MOV_B32 $params, because CSE ignores copies, so +// without it we end up with a lot of redundant moves.
+ def : Pat < (int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params), - (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, $params) + (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)) >; def : Pat < - (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, v2i32:$ij), + (int_SI_fs_interp imm:$attr_chan, imm:$attr, i32:$params, v2i32:$ij), (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0), - imm:$attr_chan, imm:$attr, i32:$params), + imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)), (EXTRACT_SUBREG $ij, sub1), - imm:$attr_chan, imm:$attr, $params) + imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)) >; /********** ================== **********/ @@ -2522,13 +2701,6 @@ def : Pat < (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) >; -def : Pat< - (fdiv f64:$src0, f64:$src1), - (V_MUL_F64 0 /* src0_modifiers */, $src0, - 0 /* src1_modifiers */, (V_RCP_F64_e32 $src1), - 0 /* clamp */, 0 /* omod */) ->; - def : Pat < (int_AMDGPU_cube v4f32:$src), (REG_SEQUENCE VReg_128, @@ -2579,7 +2751,7 @@ def : Pat < def : Pat < (int_SI_tid), - (V_MBCNT_HI_U32_B32_e32 0xffffffff, + (V_MBCNT_HI_U32_B32_e64 0xffffffff, (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) >; @@ -2600,9 +2772,6 @@ def : Pat < (V_MUL_HI_I32 $src0, $src1) >; -def : Vop3ModPat<V_MAD_F32, VOP_F32_F32_F32_F32, AMDGPUmad>; - - defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; @@ -2612,7 +2781,7 @@ def : ROTRPattern <V_ALIGNBIT_B32>; class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat < (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst (i1 0), $ptr, (as_i16imm $offset)) + (inst (i1 0), $ptr, (as_i16imm $offset), (S_MOV_B32 -1)) >; def : DSReadPat <DS_READ_I8, i32, sextloadi8_local>; @@ -2630,12 +2799,12 @@ def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>; def : Pat < (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))), - (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1) + (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1, (S_MOV_B32 -1)) >; class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst (i1 0), $ptr, $value, (as_i16imm $offset)) + (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1)) >; def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>; @@ -2651,12 +2820,13 @@ def : Pat < (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)), (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0), - (EXTRACT_SUBREG $value, sub1), $offset0, $offset1) + (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, + (S_MOV_B32 -1)) >; class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst (i1 0), $ptr, $value, (as_i16imm $offset)) + (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1)) >; // Special case of DSAtomicRetPat for add / sub 1 -> inc / dec @@ -2672,13 +2842,13 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < class DSAtomicIncRetPat<DS inst, ValueType vt, Instruction LoadImm, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), - (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset)) + (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (S_MOV_B32 -1)) >; class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset)) + 
(inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset), (S_MOV_B32 -1)) >; @@ -2728,11 +2898,12 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>; multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, PatFrag constant_ld> { def : Pat < - (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))), - (Instr_ADDR64 $srsrc, $vaddr, $offset) + (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))), + (Instr_ADDR64 $srsrc, $vaddr, $soffset, $offset) >; } +let Predicates = [isSICI] in { defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; @@ -2740,6 +2911,7 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>; +} // End Predicates = [isSICI] class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, @@ -2785,9 +2957,9 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe def : Pat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, - imm, 1, 1, imm:$glc, imm:$slc, + imm:$offset, 1, 1, imm:$glc, imm:$slc, imm:$tfe)), - (bothen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc), + (bothen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; } @@ -2817,11 +2989,13 @@ class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat < (Instr $value, $srsrc, $vaddr, $offset) >; +let Predicates = [isSICI] in { def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>; def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>; def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>; def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>; def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>; +} // End Predicates = [isSICI] */ @@ -2848,20 +3022,6 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; let SubtargetPredicate = isCI in { -// Sea island new arithmetic instructinos -defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64", - VOP_F64_F64, ftrunc ->; -defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64", - VOP_F64_F64, fceil ->; -defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64", - VOP_F64_F64, ffloor ->; -defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64", - VOP_F64_F64, frint ->; - defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", VOP_I32_I32_I32 >; @@ -2890,8 +3050,6 @@ defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", // S_CBRANCH_CDBGSYS_OR_USER // S_CBRANCH_CDBGSYS_AND_USER // S_DCACHE_INV_VOL -// V_EXP_LEGACY_F32 -// V_LOG_LEGACY_F32 // DS_NOP // DS_GWS_SEMA_RELEASE_ALL // DS_WRAP_RTN_B32 @@ -2904,7 +3062,7 @@ defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", // BUFFER_LOAD_DWORDX3 // BUFFER_STORE_DWORDX3 -} // End iSCI +} // End isCI //===----------------------------------------------------------------------===// // Flat Patterns @@ -3038,6 +3196,27 @@ def : Pat < (V_CNDMASK_B32_e64 0, -1, $src), sub1) >; +// If we need to perform a logical 
operation on i1 values, we need to +// use vector comparisons since there is only one SCC register. Vector +// comparisons still write to a pair of SGPRs, so treat these as +// 64-bit comparisons. When legalizing SGPR copies, instructions +// resulting in the copies from SCC to these instructions will be +// moved to the VALU. +def : Pat < + (i1 (and i1:$src0, i1:$src1)), + (S_AND_B64 $src0, $src1) +>; + +def : Pat < + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B64 $src0, $src1) +>; + +def : Pat < + (i1 (xor i1:$src0, i1:$src1)), + (S_XOR_B64 $src0, $src1) +>; + def : Pat < (f32 (sint_to_fp i1:$src)), (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) @@ -3050,7 +3229,7 @@ def : Pat < def : Pat < (f64 (sint_to_fp i1:$src)), - (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) + (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) >; def : Pat < @@ -3073,16 +3252,27 @@ def : Pat < >; def : Pat < + (i1 (trunc i64:$a)), + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), + (EXTRACT_SUBREG $a, sub0)), 1) +>; + +def : Pat < (i32 (bswap i32:$a)), (V_BFI_B32 (S_MOV_B32 0x00ff00ff), (V_ALIGNBIT_B32 $a, $a, 24), (V_ALIGNBIT_B32 $a, $a, 8)) >; +def : Pat < + (f32 (select i1:$src2, f32:$src1, f32:$src0)), + (V_CNDMASK_B32_e64 $src0, $src1, $src2) +>; + //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; -} // End isSI predicate +} // End isGCN predicate diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp index 4140196..46630d0 100644 --- a/lib/Target/R600/SILoadStoreOptimizer.cpp +++ b/lib/Target/R600/SILoadStoreOptimizer.cpp @@ -55,7 +55,6 @@ namespace { class SILoadStoreOptimizer : public MachineFunctionPass { private: - const TargetMachine *TM; const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; @@ -86,20 +85,11 @@ private: public: static char ID; - SILoadStoreOptimizer() : - MachineFunctionPass(ID), - TM(nullptr), - TII(nullptr), - TRI(nullptr), - MRI(nullptr), - LIS(nullptr) { + SILoadStoreOptimizer() + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), + LIS(nullptr) {} - } - - SILoadStoreOptimizer(const TargetMachine &TM_) : - MachineFunctionPass(ID), - TM(&TM_), - TII(static_cast<const SIInstrInfo*>(TM->getSubtargetImpl()->getInstrInfo())) { + SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } @@ -222,6 +212,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( // Be careful, since the addresses could be subregisters themselves in weird // cases, like vectors of pointers.
const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0); unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); unsigned DestReg1 @@ -262,6 +253,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( .addOperand(*AddrReg) // addr .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 + .addOperand(*M0Reg) // M0 .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); @@ -280,6 +272,18 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); LIS->shrinkToUses(&AddrRegLI); + LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg()); + LIS->shrinkToUses(&M0RegLI); + + // Currently m0 is treated as a register class with one member instead of an + // implicit physical register. We are using the virtual register for the first + // one, but we still need to update the live range of the now unused second m0 + // virtual register to avoid verifier errors. + const MachineOperand *PairedM0Reg + = TII->getNamedOperand(*Paired, AMDGPU::OpName::m0); + LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg()); + LIS->shrinkToUses(&PairedM0RegLI); + LIS->getInterval(DestReg); // Create new LI DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); @@ -295,6 +299,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0); const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); const MachineOperand *Data1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); @@ -333,11 +338,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( .addOperand(*Data1) // data1 .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 + .addOperand(*M0Reg) // m0 .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); // XXX - How do we express subregisters here? 
- unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; + unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(), + M0Reg->getReg()}; LIS->RemoveMachineInstrFromMaps(I); LIS->RemoveMachineInstrFromMaps(Paired); @@ -397,9 +404,9 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { } bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { - const TargetSubtargetInfo *STM = MF.getTarget().getSubtargetImpl(); - TRI = static_cast<const SIRegisterInfo*>(STM->getRegisterInfo()); - TII = static_cast<const SIInstrInfo*>(STM->getInstrInfo()); + const TargetSubtargetInfo &STM = MF.getSubtarget(); + TRI = static_cast<const SIRegisterInfo *>(STM.getRegisterInfo()); + TII = static_cast<const SIInstrInfo *>(STM.getInstrInfo()); MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index 9702565..2e08c9f 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -88,7 +88,6 @@ private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); - void InitM0ForLDS(MachineBasicBlock::iterator MI); void LoadM0(MachineInstr &MI, MachineInstr *MovRel); void IndirectSrc(MachineInstr &MI); void IndirectDst(MachineInstr &MI); @@ -309,10 +308,9 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { #endif // Clear this thread from the exec mask if the operand is negative - if ((Op.isImm() || Op.isFPImm())) { + if ((Op.isImm())) { // Constant operand: Set exec mask to 0 or do nothing - if (Op.isImm() ? (Op.getImm() & 0x80000000) : - Op.getFPImm()->isNegative()) { + if (Op.getImm() & 0x80000000) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) .addImm(0); } @@ -325,14 +323,6 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { MI.eraseFromParent(); } -/// The m0 register stores the maximum allowable address for LDS reads and -/// writes. Its value must be at least the size in bytes of LDS allocated by -/// the shader. For simplicity, we set it to the maximum possible value. -void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) { - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), - AMDGPU::M0).addImm(0xffffffff); -} - void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { MachineBasicBlock &MBB = *MI.getParent(); @@ -349,7 +339,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { } else { assert(AMDGPU::SReg_64RegClass.contains(Save)); - assert(AMDGPU::VReg_32RegClass.contains(Idx)); + assert(AMDGPU::VGPR_32RegClass.contains(Idx)); // Save the EXEC mask BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) @@ -391,12 +381,6 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { .addReg(Save); } - // FIXME: Are there any values other than the LDS address clamp that need to - // be stored in the m0 register and may be live for more than a few - // instructions? If so, we should save the m0 register at the beginning - // of this function and restore it here. - // FIXME: Add support for LDS direct loads. 
- InitM0ForLDS(&MI); MI.eraseFromParent(); } @@ -450,7 +434,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); bool HaveKill = false; - bool NeedM0 = false; bool NeedWQM = false; bool NeedFlat = false; unsigned Depth = 0; @@ -464,16 +447,12 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isDS(MI.getOpcode())) { - NeedM0 = true; + if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) NeedWQM = true; - } // Flat uses m0 in case it needs to access LDS. - if (TII->isFLAT(MI.getOpcode())) { - NeedM0 = true; + if (TII->isFLAT(MI.getOpcode())) NeedFlat = true; - } switch (MI.getOpcode()) { default: break; @@ -534,23 +513,10 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_INDIRECT_DST_V16: IndirectDst(MI); break; - - case AMDGPU::V_INTERP_P1_F32: - case AMDGPU::V_INTERP_P2_F32: - case AMDGPU::V_INTERP_MOV_F32: - NeedWQM = true; - break; } } } - if (NeedM0) { - MachineBasicBlock &MBB = MF.front(); - // Initialize M0 to a value that won't cause LDS access to be discarded - // due to offset clamping - InitM0ForLDS(MBB.getFirstNonPHI()); - } - if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { MachineBasicBlock &MBB = MF.front(); BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp index 65b892c..67421e2 100644 --- a/lib/Target/R600/SILowerI1Copies.cpp +++ b/lib/Target/R600/SILowerI1Copies.cpp @@ -85,30 +85,6 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (MI.getOpcode() == AMDGPU::V_MOV_I1) { - I1Defs.push_back(MI.getOperand(0).getReg()); - MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32)); - continue; - } - - if (MI.getOpcode() == AMDGPU::V_AND_I1) { - I1Defs.push_back(MI.getOperand(0).getReg()); - MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32)); - continue; - } - - if (MI.getOpcode() == AMDGPU::V_OR_I1) { - I1Defs.push_back(MI.getOperand(0).getReg()); - MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32)); - continue; - } - - if (MI.getOpcode() == AMDGPU::V_XOR_I1) { - I1Defs.push_back(MI.getOperand(0).getReg()); - MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e32)); - continue; - } - if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) { unsigned Reg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(Reg); @@ -117,39 +93,59 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { continue; } - if (MI.getOpcode() != AMDGPU::COPY || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg())) + if (MI.getOpcode() != AMDGPU::COPY) continue; + const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Src = MI.getOperand(1); + + if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) || + !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + continue; - const TargetRegisterClass *DstRC = - MRI.getRegClass(MI.getOperand(0).getReg()); - const TargetRegisterClass *SrcRC = - MRI.getRegClass(MI.getOperand(1).getReg()); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg()); + const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg()); if (DstRC == &AMDGPU::VReg_1RegClass && TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { - I1Defs.push_back(MI.getOperand(0).getReg()); - 
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64)) - .addOperand(MI.getOperand(0)) - .addImm(0) - .addImm(-1) - .addOperand(MI.getOperand(1)); + I1Defs.push_back(Dst.getReg()); + DebugLoc DL = MI.getDebugLoc(); + + MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); + if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) { + if (DefInst->getOperand(1).isImm()) { + I1Defs.push_back(Dst.getReg()); + + int64_t Val = DefInst->getOperand(1).getImm(); + assert(Val == 0 || Val == -1); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) + .addOperand(Dst) + .addImm(Val); + MI.eraseFromParent(); + continue; + } + } + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) + .addOperand(Dst) + .addImm(0) + .addImm(-1) + .addOperand(Src); MI.eraseFromParent(); } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && SrcRC == &AMDGPU::VReg_1RegClass) { BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)) - .addImm(0); + .addOperand(Dst) + .addOperand(Src) + .addImm(0); MI.eraseFromParent(); } } } for (unsigned Reg : I1Defs) - MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass); + MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass); return false; } diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp index d58f31d..587ea63 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -29,6 +29,7 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), + HasSpilledVGPRs(false), PSInputAddr(0), NumUserSGPRs(0), LDSWaveSpillSize(0) { } @@ -38,8 +39,8 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( unsigned FrameIndex, unsigned SubIdx) { const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>( - MF->getTarget().getSubtarget<AMDGPUSubtarget>().getRegisterInfo()); + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( + MF->getSubtarget<AMDGPUSubtarget>().getRegisterInfo()); MachineRegisterInfo &MRI = MF->getRegInfo(); int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); Offset += SubIdx * 4; @@ -50,7 +51,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( struct SpilledReg Spill; if (!LaneVGPRs.count(LaneVGPRIdx)) { - unsigned LaneVGPR = TRI->findUnusedVGPR(MRI); + unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); LaneVGPRs[LaneVGPRIdx] = LaneVGPR; MRI.setPhysRegUsed(LaneVGPR); @@ -69,7 +70,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( const MachineFunction &MF) const { - const AMDGPUSubtarget &ST = MF.getTarget().getSubtarget<AMDGPUSubtarget>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); // FIXME: We should get this information from kernel attributes if it // is available. return getShaderType() == ShaderType::COMPUTE ? 
256 : ST.getWavefrontSize(); diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h index 6bb8f9d..667da4c 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -29,6 +29,7 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { void anchor() override; unsigned TIDReg; + bool HasSpilledVGPRs; public: @@ -49,9 +50,12 @@ public: unsigned NumUserSGPRs; std::map<unsigned, unsigned> LaneVGPRs; unsigned LDSWaveSpillSize; + unsigned ScratchOffsetReg; bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } + bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } + void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; diff --git a/lib/Target/R600/SIPrepareScratchRegs.cpp b/lib/Target/R600/SIPrepareScratchRegs.cpp new file mode 100644 index 0000000..0a57a5b --- /dev/null +++ b/lib/Target/R600/SIPrepareScratchRegs.cpp @@ -0,0 +1,208 @@ +//===-- SIPrepareScratchRegs.cpp - Prepare scratch registers --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This pass loads the scratch pointer and scratch offset into a register or a +/// frame index which can be used anywhere in the program. These values will +/// be used for spilling VGPRs. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +namespace { + +class SIPrepareScratchRegs : public MachineFunctionPass { + +private: + static char ID; + +public: + SIPrepareScratchRegs() : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI prepare scratch registers"; + } + +}; + +} // End anonymous namespace + +char SIPrepareScratchRegs::ID = 0; + +FunctionPass *llvm::createSIPrepareScratchRegs() { + return new SIPrepareScratchRegs(); +} + +bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + MachineBasicBlock *Entry = MF.begin(); + MachineBasicBlock::iterator I = Entry->begin(); + DebugLoc DL = I->getDebugLoc(); + + // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to + // run this pass. 
+ if (!MFI->hasSpilledVGPRs()) + return false; + + unsigned ScratchPtrPreloadReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); + unsigned ScratchOffsetPreloadReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + + if (!Entry->isLiveIn(ScratchPtrPreloadReg)) + Entry->addLiveIn(ScratchPtrPreloadReg); + + if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) + Entry->addLiveIn(ScratchOffsetPreloadReg); + + // Load the scratch offset. + unsigned ScratchOffsetReg = + TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); + int ScratchOffsetFI = -1; + + if (ScratchOffsetReg != AMDGPU::NoRegister) { + // Found an SGPR to use + MRI.setPhysRegUsed(ScratchOffsetReg); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) + .addReg(ScratchOffsetPreloadReg); + } else { + // No SGPR is available, we must spill. + ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) + .addReg(ScratchOffsetPreloadReg) + .addFrameIndex(ScratchOffsetFI) + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } + + + // Now that we have the scratch pointer and offset values, we need to + // add them to all the SI_SPILL_V* instructions. + + RegScavenger RS; + unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); + RS.addScavengingFrameIndex(ScratchRsrcFI); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + // Add the scratch offset reg as a live-in so that the register scavenger + // doesn't re-use it. + if (!MBB.isLiveIn(ScratchOffsetReg) && + ScratchOffsetReg != AMDGPU::NoRegister) + MBB.addLiveIn(ScratchOffsetReg); + RS.enterBasicBlock(&MBB); + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + RS.forward(I); + DebugLoc DL = MI.getDebugLoc(); + switch(MI.getOpcode()) { + default: break; + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V256_RESTORE: + case AMDGPU::SI_SPILL_V512_RESTORE: + + // Scratch resource + unsigned ScratchRsrcReg = + RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); + + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) + .addImm(Rsrc & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) + .addImm(Rsrc >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + // Scratch Offset + if (ScratchOffsetReg == AMDGPU::NoRegister) { + ScratchOffsetReg = 
RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), + ScratchOffsetReg) + .addFrameIndex(ScratchOffsetFI) + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } else if (!MBB.isLiveIn(ScratchOffsetReg)) { + MBB.addLiveIn(ScratchOffsetReg); + } + + if (ScratchRsrcReg == AMDGPU::NoRegister || + ScratchOffsetReg == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("ran out of SGPRs for spilling VGPRs"); + ScratchRsrcReg = AMDGPU::SGPR0; + ScratchOffsetReg = AMDGPU::SGPR0; + } + MI.getOperand(2).setReg(ScratchRsrcReg); + MI.getOperand(2).setIsKill(true); + MI.getOperand(2).setIsUndef(false); + MI.getOperand(3).setReg(ScratchOffsetReg); + MI.getOperand(3).setIsUndef(false); + MI.getOperand(3).setIsKill(false); + MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); + + break; + } + } + } + return true; +} diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index cffea12..9224e14 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -40,6 +40,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); Reserved.set(AMDGPU::FLAT_SCR); + Reserved.set(AMDGPU::FLAT_SCR_LO); + Reserved.set(AMDGPU::FLAT_SCR_HI); // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs Reserved.set(AMDGPU::VGPR255); @@ -48,9 +50,32 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } -unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const { - return RC->getNumRegs(); +unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const { + + // FIXME: We should adjust the max number of waves based on LDS size. 
+ unsigned SGPRLimit = getNumSGPRsAllowed(ST.getMaxWavesPerCU()); + unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU()); + + for (regclass_iterator I = regclass_begin(), E = regclass_end(); + I != E; ++I) { + + unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned Limit; + + if (isSGPRClass(*I)) { + Limit = SGPRLimit / NumSubRegs; + } else { + Limit = VGPRLimit / NumSubRegs; + } + + const int *Sets = getRegClassPressureSets(*I); + assert(Sets); + for (unsigned i = 0; Sets[i] != -1; ++i) { + if (Sets[i] == (int)Idx) + return Limit; + } + } + return 256; } bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { @@ -92,6 +117,60 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { } } +void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, + unsigned Value, + unsigned ScratchRsrcReg, + unsigned ScratchOffset, + int64_t Offset, + RegScavenger *RS) const { + + const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); + MachineBasicBlock *MBB = MI->getParent(); + const MachineFunction *MF = MI->getParent()->getParent(); + LLVMContext &Ctx = MF->getFunction()->getContext(); + DebugLoc DL = MI->getDebugLoc(); + bool IsLoad = TII->get(LoadStoreOp).mayLoad(); + + bool RanOutOfSGPRs = false; + unsigned SOffset = ScratchOffset; + + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned Size = NumSubRegs * 4; + + if (!isUInt<12>(Offset + Size)) { + SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); + if (SOffset == AMDGPU::NoRegister) { + RanOutOfSGPRs = true; + SOffset = AMDGPU::SGPR0; + } + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) + .addReg(ScratchOffset) + .addImm(Offset); + Offset = 0; + } + + if (RanOutOfSGPRs) + Ctx.emitError("Ran out of SGPRs for spilling VGPRs"); + + for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { + unsigned SubReg = NumSubRegs > 1 ? 
+ getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : + Value; + bool IsKill = (i == e - 1); + + BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) + .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(ScratchRsrcReg, getKillRegState(IsKill)) + .addImm(Offset) + .addReg(SOffset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); + } +} + void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -125,7 +204,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, Ctx.emitError("Ran out of VGPRs for spilling SGPR"); } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill.VGPR) .addReg(SubReg) .addImm(Spill.Lane); @@ -154,13 +235,15 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, Ctx.emitError("Ran out of VGPRs for spilling SGPR"); } - if (isM0) { + if (isM0) SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); - } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + SubReg) .addReg(Spill.VGPR) - .addImm(Spill.Lane); + .addImm(Spill.Lane) + .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); if (isM0) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) .addReg(SubReg); @@ -177,71 +260,25 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V128_SAVE: case AMDGPU::SI_SPILL_V96_SAVE: case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V32_SAVE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - unsigned SrcReg = MI->getOperand(0).getReg(); - int64_t Offset = FrameInfo->getObjectOffset(Index); - unsigned Size = NumSubRegs * 4; - unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); - - for (unsigned i = 0, e = NumSubRegs; i != e; ++i) { - unsigned SubReg = NumSubRegs > 1 ? 
- getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) : - SrcReg; - Offset += (i * 4); - MFI->LDSWaveSpillSize = std::max((unsigned)Offset + 4, (unsigned)MFI->LDSWaveSpillSize); - - unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg, - Offset, Size); - - if (AddrReg == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling VGPRS"); - AddrReg = AMDGPU::VGPR0; - } - - // Store the value in LDS - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_WRITE_B32)) - .addImm(0) // gds - .addReg(AddrReg, RegState::Kill) // addr - .addReg(SubReg) // data0 - .addImm(0); // offset - } - + case AMDGPU::SI_SPILL_V32_SAVE: + buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); MI->eraseFromParent(); break; - } case AMDGPU::SI_SPILL_V32_RESTORE: case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_V96_RESTORE: case AMDGPU::SI_SPILL_V128_RESTORE: case AMDGPU::SI_SPILL_V256_RESTORE: case AMDGPU::SI_SPILL_V512_RESTORE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - unsigned DstReg = MI->getOperand(0).getReg(); - int64_t Offset = FrameInfo->getObjectOffset(Index); - unsigned Size = NumSubRegs * 4; - unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); - - // FIXME: We could use DS_READ_B64 here to optimize for larger registers. - for (unsigned i = 0, e = NumSubRegs; i != e; ++i) { - unsigned SubReg = NumSubRegs > 1 ? - getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) : - DstReg; - - Offset += (i * 4); - unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg, - Offset, Size); - if (AddrReg == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling VGPRs"); - AddrReg = AMDGPU::VGPR0; - } - - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_READ_B32), SubReg) - .addImm(0) // gds - .addReg(AddrReg, RegState::Kill) // addr - .addImm(0); //offset - } + buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); MI->eraseFromParent(); break; } @@ -250,11 +287,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int64_t Offset = FrameInfo->getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { - unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj); + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj); BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) .addImm(Offset); - FIOp.ChangeToRegister(TmpReg, false); + FIOp.ChangeToRegister(TmpReg, false, false, true); } } } @@ -264,7 +301,7 @@ const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( MVT VT) const { switch(VT.SimpleTy) { default: - case MVT::i32: return &AMDGPU::VReg_32RegClass; + case MVT::i32: return &AMDGPU::VGPR_32RegClass; } } @@ -276,7 +313,7 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { assert(!TargetRegisterInfo::isVirtualRegister(Reg)); static const 
TargetRegisterClass *BaseClasses[] = { - &AMDGPU::VReg_32RegClass, + &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, @@ -297,7 +334,7 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { } bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - return getCommonSubClass(&AMDGPU::VReg_32RegClass, RC) || + return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) || getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) || getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) || @@ -312,7 +349,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( } else if (SRC == &AMDGPU::SCCRegRegClass) { return &AMDGPU::VCCRegRegClass; } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) { - return &AMDGPU::VReg_32RegClass; + return &AMDGPU::VGPR_32RegClass; } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) { return &AMDGPU::VReg_64RegClass; } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) { @@ -388,40 +425,17 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, return SubRC->getRegister(Index + Channel); } -bool SIRegisterInfo::regClassCanUseLiteralConstant(int RCID) const { - switch (RCID) { - default: return false; - case AMDGPU::SSrc_32RegClassID: - case AMDGPU::SSrc_64RegClassID: - case AMDGPU::VSrc_32RegClassID: - case AMDGPU::VSrc_64RegClassID: - return true; - } -} - -bool SIRegisterInfo::regClassCanUseLiteralConstant( - const TargetRegisterClass *RC) const { - return regClassCanUseLiteralConstant(RC->getID()); +bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { + return OpType == AMDGPU::OPERAND_REG_IMM32; } -bool SIRegisterInfo::regClassCanUseInlineConstant(int RCID) const { - if (regClassCanUseLiteralConstant(RCID)) +bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { + if (opCanUseLiteralConstant(OpType)) return true; - switch (RCID) { - default: return false; - case AMDGPU::VCSrc_32RegClassID: - case AMDGPU::VCSrc_64RegClassID: - return true; - } -} - -bool SIRegisterInfo::regClassCanUseInlineConstant( - const TargetRegisterClass *RC) const { - return regClassCanUseInlineConstant(RC->getID()); + return OpType == AMDGPU::OPERAND_REG_INLINE_C; } - unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { @@ -434,6 +448,8 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, case SIRegisterInfo::TGID_Z: return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); case SIRegisterInfo::SCRATCH_WAVE_OFFSET: + if (MFI->getShaderType() != ShaderType::COMPUTE) + return MFI->ScratchOffsetReg; return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); case SIRegisterInfo::SCRATCH_PTR: return AMDGPU::SGPR2_SGPR3; @@ -452,9 +468,8 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, /// \brief Returns a register that is not used at any point in the function. /// If all registers are used, then this function will return // AMDGPU::NoRegister. 
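The switch from the regClassCanUse* queries to opCanUse* above keys immediate legality off the operand's encoded type rather than its register class. A hedged sketch of a caller, assuming only the two operand types named in the patch (the helper itself is editorial):

// Can this operand take an immediate directly? OPERAND_REG_IMM32
// operands accept any 32-bit literal; OPERAND_REG_INLINE_C operands
// accept only inline constants.
static bool canAcceptImmediate(const SIRegisterInfo &TRI,
                               unsigned OpType, bool IsInlineConst) {
  if (TRI.opCanUseLiteralConstant(OpType))
    return true;
  return IsInlineConst && TRI.opCanUseInlineConstant(OpType);
}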
-unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const { - - const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass; +unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const { for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I != E; ++I) { @@ -464,3 +479,29 @@ unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const { return AMDGPU::NoRegister; } +unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { + switch(WaveCount) { + case 10: return 24; + case 9: return 28; + case 8: return 32; + case 7: return 36; + case 6: return 40; + case 5: return 48; + case 4: return 64; + case 3: return 84; + case 2: return 128; + default: return 256; + } +} + +unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const { + switch(WaveCount) { + case 10: return 48; + case 9: return 56; + case 8: return 64; + case 7: return 72; + case 6: return 80; + case 5: return 96; + default: return 103; + } +} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index c7e54db..d908ffd 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -17,6 +17,7 @@ #define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" +#include "llvm/Support/Debug.h" namespace llvm { @@ -26,8 +27,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const override; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const override; + unsigned getRegPressureSetLimit(unsigned Idx) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; @@ -42,7 +42,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { unsigned getHWRegIndex(unsigned Reg) const override; /// \brief Return the 'base' register class for this register. - /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc. + /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; /// \returns true if this class contains only SGPR registers @@ -80,22 +80,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, unsigned Channel) const; - /// \returns True if operands defined with this register class can accept + /// \returns True if operands defined with this operand type can accept /// a literal constant (i.e. any 32-bit immediate). - bool regClassCanUseLiteralConstant(int RCID) const; + bool opCanUseLiteralConstant(unsigned OpType) const; - /// \returns True if operands defined with this register class can accept - /// a literal constant (i.e. any 32-bit immediate). - bool regClassCanUseLiteralConstant(const TargetRegisterClass *RC) const; - - /// \returns True if operands defined with this register class can accept + /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. - bool regClassCanUseInlineConstant(int RCID) const; - - /// \returns True if operands defined with this register class can accept - /// a literal constant. i.e. A value in the range (-16, 64). 
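The getNumVGPRsAllowed/getNumSGPRsAllowed tables above can also be read in reverse to estimate occupancy from register usage. The inversion below is an editorial sketch derived only from the VGPR table values shown; for example, 40 VGPRs still allows 6 waves, while 65 VGPRs drops to 3:

// Maximum concurrent waves implied by a kernel's VGPR usage;
// the inverse of the getNumVGPRsAllowed() table above.
static unsigned maxWavesForVGPRUsage(unsigned UsedVGPRs) {
  if (UsedVGPRs <= 24) return 10;
  if (UsedVGPRs <= 28) return 9;
  if (UsedVGPRs <= 32) return 8;
  if (UsedVGPRs <= 36) return 7;
  if (UsedVGPRs <= 40) return 6;
  if (UsedVGPRs <= 48) return 5;
  if (UsedVGPRs <= 64) return 4;
  if (UsedVGPRs <= 84) return 3;
  if (UsedVGPRs <= 128) return 2;
  return 1; // anything up to the full 256-register file
}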
- bool regClassCanUseInlineConstant(const TargetRegisterClass *RC) const; + bool opCanUseInlineConstant(unsigned OpType) const; enum PreloadedValue { TGID_X, @@ -113,7 +105,22 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { unsigned getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const; - unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) const; + /// \brief Gives the maximum number of VGPRs that can be used by \p WaveCount + /// concurrent waves. + unsigned getNumVGPRsAllowed(unsigned WaveCount) const; + + /// \brief Gives the maximum number of SGPRs that can be used by \p WaveCount + /// concurrent waves. + unsigned getNumSGPRsAllowed(unsigned WaveCount) const; + + unsigned findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const; + +private: + void buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, unsigned Value, + unsigned ScratchRsrcReg, unsigned ScratchOffset, + int64_t Offset, RegScavenger *RS) const; }; } // End namespace llvm diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index 45c2b41..8b25e95 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -21,7 +21,7 @@ def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; // VCC for 64-bit instructions -def VCC : RegisterWithSubRegs<"VCC", [VCC_LO, VCC_HI]> { +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 106; @@ -36,14 +36,14 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { let HWEncoding = 126; } -def SCC : SIReg<"SCC", 253>; -def M0 : SIReg <"M0", 124>; +def SCC : SIReg<"scc", 253>; +def M0 : SIReg <"m0", 124>; def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256 bytes. def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. // Pair to indicate location of scratch space for flat accesses. 
-def FLAT_SCR : RegisterWithSubRegs <"FLAT_SCR", [FLAT_SCR_LO, FLAT_SCR_HI]> { +def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 104; @@ -184,9 +184,9 @@ def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, (add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) >; -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>; +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>; -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64, +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64, (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) >; @@ -197,8 +197,6 @@ def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256 def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>; // Register class for all vector registers (VGPRs + Interpolation Registers) -def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>; - def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>; def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { @@ -211,31 +209,53 @@ def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256 def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; -def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>; +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { + let Size = 32; +} + +class RegImmOperand <RegisterClass rc> : RegisterOperand<rc> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; +} + +class RegInlineOperand <RegisterClass rc> : RegisterOperand<rc> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; +} //===----------------------------------------------------------------------===// // SSrc_* Operands with an SGPR or a 32-bit immediate //===----------------------------------------------------------------------===// -def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>; +def SSrc_32 : RegImmOperand<SReg_32>; -def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>; +def SSrc_64 : RegImmOperand<SReg_64>; + +//===----------------------------------------------------------------------===// +// SCSrc_* Operands with an SGPR or an inline constant +//===----------------------------------------------------------------------===// + +def SCSrc_32 : RegInlineOperand<SReg_32>; //===----------------------------------------------------------------------===// // VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate //===----------------------------------------------------------------------===// -def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>; +def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; + +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; + +def VSrc_32 : RegImmOperand<VS_32>; -def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; +def VSrc_64 : RegImmOperand<VS_64>; //===----------------------------------------------------------------------===// // VCSrc_* Operands with an SGPR, VGPR or an inline constant //===----------------------------------------------------------------------===// -def VCSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>; +def VCSrc_32 : RegInlineOperand<VS_32>; -def VCSrc_64 : RegisterClass<"AMDGPU", 
[i64, f64], 64, (add VReg_64, SReg_64)>; +def VCSrc_64 : RegInlineOperand<VS_64>; //===----------------------------------------------------------------------===// // SGPR and VGPR register classes diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td index 28b65b8..9b1f676 100644 --- a/lib/Target/R600/SISchedule.td +++ b/lib/Target/R600/SISchedule.td @@ -7,9 +7,85 @@ // //===----------------------------------------------------------------------===// // -// TODO: This is just a place holder for now. +// MachineModel definitions for Southern Islands (SI) // //===----------------------------------------------------------------------===// +def WriteBranch : SchedWrite; +def WriteExport : SchedWrite; +def WriteLDS : SchedWrite; +def WriteSALU : SchedWrite; +def WriteSMEM : SchedWrite; +def WriteVMEM : SchedWrite; -def SI_Itin : ProcessorItineraries <[], [], []>; +// Vector ALU instructions +def Write32Bit : SchedWrite; +def WriteQuarterRate32 : SchedWrite; + +def WriteFloatFMA : SchedWrite; + +def WriteDouble : SchedWrite; +def WriteDoubleAdd : SchedWrite; + +def SIFullSpeedModel : SchedMachineModel; +def SIQuarterSpeedModel : SchedMachineModel; + +// BufferSize = 0 means the processors are in-order. +let BufferSize = 0 in { + +// XXX: Are the resource counts correct? +def HWBranch : ProcResource<1>; +def HWExport : ProcResource<7>; // Taken from S_WAITCNT +def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT +def HWSALU : ProcResource<1>; +def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT +def HWVALU : ProcResource<1>; + +} + +class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources, + int latency> : WriteRes<write, resources> { + let Latency = latency; +} + +class HWVALUWriteRes<SchedWrite write, int latency> : + HWWriteRes<write, [HWVALU], latency>; + + +// The latency numbers are taken from the AMD Accelerated Parallel Processing +// guide. They may not be accurate. + +// The latency values are 1 / (operations / cycle) / 4. +multiclass SICommonWriteRes { + + def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ??? + def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ??? + def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64 + def : HWWriteRes<WriteSALU, [HWSALU], 1>; + def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ??? 
+ def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600 + + def : HWVALUWriteRes<Write32Bit, 1>; + def : HWVALUWriteRes<WriteQuarterRate32, 4>; +} + + +let SchedModel = SIFullSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes<WriteFloatFMA, 1>; +def : HWVALUWriteRes<WriteDouble, 4>; +def : HWVALUWriteRes<WriteDoubleAdd, 2>; + +} // End SchedModel = SIFullSpeedModel + +let SchedModel = SIQuarterSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes<WriteFloatFMA, 16>; +def : HWVALUWriteRes<WriteDouble, 16>; +def : HWVALUWriteRes<WriteDoubleAdd, 8>; + +} // End SchedModel = SIQuarterSpeedModel diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp index 45e83f5..97bbd78 100644 --- a/lib/Target/R600/SIShrinkInstructions.cpp +++ b/lib/Target/R600/SIShrinkInstructions.cpp @@ -10,6 +10,7 @@ // #include "AMDGPU.h" +#include "AMDGPUMCInstLower.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "llvm/ADT/Statistic.h" @@ -126,37 +127,32 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, TII->isVOPC(MI.getOpcode())); const SIRegisterInfo &TRI = TII->getRegisterInfo(); - MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); // Only one literal constant is allowed per instruction, so if src0 is a // literal constant then we can't do any folding. - if ((Src0->isImm() || Src0->isFPImm()) && TII->isLiteralConstant(*Src0)) + if (Src0.isImm() && + TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx))) return; - // Literal constants and SGPRs can only be used in Src0, so if Src0 is an // SGPR, we cannot commute the instruction, so we can't fold any literal // constants. - if (Src0->isReg() && !isVGPR(Src0, TRI, MRI)) + if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI)) return; // Try to fold Src0 - if (Src0->isReg()) { - unsigned Reg = Src0->getReg(); + if (Src0.isReg()) { + unsigned Reg = Src0.getReg(); MachineInstr *Def = MRI.getUniqueVRegDef(Reg); if (Def && Def->isMoveImmediate()) { MachineOperand &MovSrc = Def->getOperand(1); bool ConstantFolded = false; if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) { - Src0->ChangeToImmediate(MovSrc.getImm()); + Src0.ChangeToImmediate(MovSrc.getImm()); ConstantFolded = true; - } else if (MovSrc.isFPImm()) { - const ConstantFP *CFP = MovSrc.getFPImm(); - if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle) { - Src0->ChangeToFPImmediate(CFP); - ConstantFolded = true; - } } if (ConstantFolded) { if (MRI.use_empty(Reg)) @@ -193,13 +189,12 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (MI.getOpcode() == AMDGPU::S_MOV_B32) { const MachineOperand &Src = MI.getOperand(1); - // TODO: Handle FPImm? if (Src.isImm()) { - if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src)) { + if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4)) MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); - continue; - } } + + continue; } if (!TII->hasVALU32BitEncoding(MI.getOpcode())) @@ -213,13 +208,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } - int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); - - // Op32 could be -1 here if we started with an instruction that had a + // getVOPe32 could be -1 here if we started with an instruction that had // a 32-bit encoding and then commuted it to an instruction that did not. 
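For the S_MOVK_I32 shrink above, the condition reduces to: the immediate must fit in the signed 16-bit movk field and must not already be a free inline constant. A hedged restatement with worked values (0x1234 shrinks; 64 stays, since it is an inline constant; 0x12345 stays, since it needs more than 16 bits); the helper name is illustrative:

#include <cstdint>

// Editorial sketch of the shrink test: equivalent to
// isInt<16>(Imm) && !isInlineConstant(Imm).
static bool shouldShrinkToMovk(int64_t Imm, bool IsInlineConst) {
  return Imm >= INT16_MIN && Imm <= INT16_MAX && !IsInlineConst;
}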
- if (Op32 == -1) + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) continue; + int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); + if (TII->isVOPC(Op32)) { unsigned DstReg = MI.getOperand(0).getReg(); if (TargetRegisterInfo::isVirtualRegister(DstReg)) { diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp index 9318dc1..27bbf4f 100644 --- a/lib/Target/R600/SITypeRewriter.cpp +++ b/lib/Target/R600/SITypeRewriter.cpp @@ -61,8 +61,7 @@ bool SITypeRewriter::doInitialization(Module &M) { } bool SITypeRewriter::runOnFunction(Function &F) { - AttributeSet Set = F.getAttributes(); - Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, "ShaderType"); + Attribute A = F.getFnAttribute("ShaderType"); unsigned ShaderType = ShaderType::COMPUTE; if (A.isStringAttribute()) { diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp index f437564..d723d6e 100644 --- a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp +++ b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp @@ -16,11 +16,15 @@ using namespace llvm; -/// \brief The target for the AMDGPU backend +/// \brief The target which supports all AMD GPUs. This will eventually +/// be deprecated and there will be an R600 target and a GCN target. Target llvm::TheAMDGPUTarget; +/// \brief The target for GCN GPUs +Target llvm::TheGCNTarget; /// \brief Extern function to initialize the targets for the AMDGPU backend extern "C" void LLVMInitializeR600TargetInfo() { RegisterTarget<Triple::r600, false> R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); + RegisterTarget<Triple::amdgcn, false> GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs"); } diff --git a/lib/Target/R600/VIInstrFormats.td b/lib/Target/R600/VIInstrFormats.td new file mode 100644 index 0000000..d8738f9 --- /dev/null +++ b/lib/Target/R600/VIInstrFormats.td @@ -0,0 +1,166 @@ +//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// VI Instruction format definitions. 
+// +//===----------------------------------------------------------------------===// + +class DSe_vi <bits<8> op> : Enc64 { + bits<8> vdst; + bits<1> gds; + bits<8> addr; + bits<8> data0; + bits<8> data1; + bits<8> offset0; + bits<8> offset1; + + let Inst{7-0} = offset0; + let Inst{15-8} = offset1; + let Inst{16} = gds; + let Inst{24-17} = op; + let Inst{31-26} = 0x36; //encoding + let Inst{39-32} = addr; + let Inst{47-40} = data0; + let Inst{55-48} = data1; + let Inst{63-56} = vdst; +} + +class MUBUFe_vi <bits<7> op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{16} = lds; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MTBUFe_vi <bits<4> op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{18-15} = op; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class SMEMe_vi <bits<8> op, bit imm> : Enc64 { + bits<7> sbase; + bits<7> sdata; + bits<1> glc; + bits<20> offset; + + let Inst{5-0} = sbase{6-1}; + let Inst{12-6} = sdata; + let Inst{16} = glc; + let Inst{17} = imm; + let Inst{25-18} = op; + let Inst{31-26} = 0x30; //encoding + let Inst{51-32} = offset; +} + +class VOP3e_vi <bits<10> op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOP3be_vi <bits<10> op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<7> sdst; + bits<2> omod; + bits<1> clamp; + + let Inst{7-0} = vdst; + let Inst{14-8} = sdst; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class EXPe_vi : EXPe { + let Inst{31-26} = 0x31; //encoding +} + +class VINTRPe_vi <bits<2> op> : VINTRPe <op> { + let Inst{31-26} = 0x35; // encoding +} diff --git a/lib/Target/R600/VIInstructions.td b/lib/Target/R600/VIInstructions.td new file mode 100644 index 0000000..4a6e933 --- /dev/null +++ 
b/lib/Target/R600/VIInstructions.td @@ -0,0 +1,25 @@ +//===-- VIInstructions.td - VI Instruction Definitions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Instruction definitions for VI and newer. +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// SMEM Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isVI] in { + +// 1. Offset as 20-bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM20bit:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) +>; + +} // End Predicates = [isVI]
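As a closing illustration, the VI encodings defined in VIInstrFormats.td above can be unpacked mechanically. The decoder sketch below follows the MUBUFe_vi bit ranges exactly as written in the TableGen class; the struct and function names are editorial, not part of the patch:

#include <cstdint>

struct MUBUFFieldsVI {
  unsigned Offset, Op, VAddr, VData, SRsrc, SOffset;
  bool Offen, IdxEn, GLC, LDS, SLC, TFE;
};

// Field extraction mirroring the MUBUFe_vi layout.
static MUBUFFieldsVI decodeMUBUF_vi(uint64_t Inst) {
  MUBUFFieldsVI F;
  F.Offset  = Inst & 0xFFF;                 // Inst{11-0}
  F.Offen   = (Inst >> 12) & 1;             // Inst{12}
  F.IdxEn   = (Inst >> 13) & 1;             // Inst{13}
  F.GLC     = (Inst >> 14) & 1;             // Inst{14}
  F.LDS     = (Inst >> 16) & 1;             // Inst{16}
  F.SLC     = (Inst >> 17) & 1;             // Inst{17}
  F.Op      = (Inst >> 18) & 0x7F;          // Inst{24-18}
  // Inst{31-26} must be 0x38, the VI MUBUF encoding.
  F.VAddr   = (Inst >> 32) & 0xFF;          // Inst{39-32}
  F.VData   = (Inst >> 40) & 0xFF;          // Inst{47-40}
  F.SRsrc   = ((Inst >> 48) & 0x1F) << 2;   // Inst{52-48} holds srsrc{6-2}
  F.TFE     = (Inst >> 55) & 1;             // Inst{55}
  F.SOffset = (Inst >> 56) & 0xFF;          // Inst{63-56}
  return F;
}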