Diffstat (limited to 'lib/Target/R600')
-rw-r--r--  lib/Target/R600/AMDGPU.h | 8
-rw-r--r--  lib/Target/R600/AMDGPU.td | 46
-rw-r--r--  lib/Target/R600/AMDGPUAsmPrinter.cpp | 70
-rw-r--r--  lib/Target/R600/AMDGPUAsmPrinter.h | 23
-rw-r--r--  lib/Target/R600/AMDGPUConvertToISA.cpp | 62
-rw-r--r--  lib/Target/R600/AMDGPUFrameLowering.cpp | 2
-rw-r--r--  lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 209
-rw-r--r--  lib/Target/R600/AMDGPUISelLowering.cpp | 987
-rw-r--r--  lib/Target/R600/AMDGPUISelLowering.h | 95
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.cpp | 28
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.h | 19
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.td | 69
-rw-r--r--  lib/Target/R600/AMDGPUInstructions.td | 97
-rw-r--r--  lib/Target/R600/AMDGPUIntrinsicInfo.cpp (renamed from lib/Target/R600/AMDILIntrinsicInfo.cpp) | 40
-rw-r--r--  lib/Target/R600/AMDGPUIntrinsicInfo.h (renamed from lib/Target/R600/AMDILIntrinsicInfo.h) | 21
-rw-r--r--  lib/Target/R600/AMDGPUIntrinsics.td | 29
-rw-r--r--  lib/Target/R600/AMDGPUMCInstLower.cpp | 1
-rw-r--r--  lib/Target/R600/AMDGPUMCInstLower.h | 4
-rw-r--r--  lib/Target/R600/AMDGPUPromoteAlloca.cpp | 387
-rw-r--r--  lib/Target/R600/AMDGPURegisterInfo.cpp | 4
-rw-r--r--  lib/Target/R600/AMDGPURegisterInfo.h | 14
-rw-r--r--  lib/Target/R600/AMDGPUSubtarget.cpp | 100
-rw-r--r--  lib/Target/R600/AMDGPUSubtarget.h | 100
-rw-r--r--  lib/Target/R600/AMDGPUTargetMachine.cpp | 13
-rw-r--r--  lib/Target/R600/AMDGPUTargetMachine.h | 7
-rw-r--r--  lib/Target/R600/AMDILBase.td | 25
-rw-r--r--  lib/Target/R600/AMDILISelLowering.cpp | 560
-rw-r--r--  lib/Target/R600/AMDILInstrInfo.td | 150
-rw-r--r--  lib/Target/R600/AMDILIntrinsics.td | 224
-rw-r--r--  lib/Target/R600/AMDILRegisterInfo.td | 107
-rw-r--r--  lib/Target/R600/CMakeLists.txt | 6
-rw-r--r--  lib/Target/R600/EvergreenInstructions.td | 11
-rw-r--r--  lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp | 15
-rw-r--r--  lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 14
-rw-r--r--  lib/Target/R600/R600ControlFlowFinalizer.cpp | 1
-rw-r--r--  lib/Target/R600/R600ISelLowering.cpp | 314
-rw-r--r--  lib/Target/R600/R600ISelLowering.h | 9
-rw-r--r--  lib/Target/R600/R600InstrInfo.cpp | 90
-rw-r--r--  lib/Target/R600/R600InstrInfo.h | 18
-rw-r--r--  lib/Target/R600/R600Instructions.td | 158
-rw-r--r--  lib/Target/R600/R600MachineScheduler.cpp | 1
-rw-r--r--  lib/Target/R600/R600Packetizer.cpp | 1
-rw-r--r--  lib/Target/R600/R600RegisterInfo.cpp | 17
-rw-r--r--  lib/Target/R600/R600RegisterInfo.h | 12
-rw-r--r--  lib/Target/R600/R600RegisterInfo.td | 48
-rw-r--r--  lib/Target/R600/SIAnnotateControlFlow.cpp | 46
-rw-r--r--  lib/Target/R600/SIDefines.h | 50
-rw-r--r--  lib/Target/R600/SIFixSGPRLiveRanges.cpp | 110
-rw-r--r--  lib/Target/R600/SIISelLowering.cpp | 303
-rw-r--r--  lib/Target/R600/SIISelLowering.h | 10
-rw-r--r--  lib/Target/R600/SIInsertWaits.cpp | 2
-rw-r--r--  lib/Target/R600/SIInstrFormats.td | 26
-rw-r--r--  lib/Target/R600/SIInstrInfo.cpp | 223
-rw-r--r--  lib/Target/R600/SIInstrInfo.h | 12
-rw-r--r--  lib/Target/R600/SIInstrInfo.td | 152
-rw-r--r--  lib/Target/R600/SIInstructions.td | 868
-rw-r--r--  lib/Target/R600/SIIntrinsics.td | 50
-rw-r--r--  lib/Target/R600/SILowerControlFlow.cpp | 87
-rw-r--r--  lib/Target/R600/SIMachineFunctionInfo.cpp | 6
-rw-r--r--  lib/Target/R600/SIRegisterInfo.cpp | 34
-rw-r--r--  lib/Target/R600/SIRegisterInfo.h | 19
-rw-r--r--  lib/Target/R600/SIRegisterInfo.td | 2
-rw-r--r--  lib/Target/R600/SITypeRewriter.cpp | 3
63 files changed, 3935 insertions, 2284 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 949fdfb..713fc4b 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -17,6 +17,7 @@
namespace llvm {
class AMDGPUInstrPrinter;
+class AMDGPUSubtarget;
class AMDGPUTargetMachine;
class FunctionPass;
class MCAsmInfo;
@@ -40,6 +41,7 @@ FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
+FunctionPass *createSIFixSGPRLiveRangesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIInsertWaits(TargetMachine &tm);
@@ -47,14 +49,18 @@ void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
// Passes common to R600 and SI
+FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
Pass *createAMDGPUStructurizeCFGPass();
-FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
/// \brief Creates an AMDGPU-specific Target Transformation Info pass.
ImmutablePass *
createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM);
+void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
+extern char &SIFixSGPRLiveRangesID;
+
+
extern Target TheAMDGPUTarget;
} // End namespace llvm
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 2edc115..6ff9ab7 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -7,8 +7,7 @@
//
//==-----------------------------------------------------------------------===//
-// Include AMDIL TD files
-include "AMDILBase.td"
+include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// Subtarget Features
@@ -33,30 +32,25 @@ def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
"false",
"Disable the if conversion pass">;
-def FeatureFP64 : SubtargetFeature<"fp64",
+def FeatureFP64 : SubtargetFeature<"fp64",
"FP64",
"true",
- "Enable 64bit double precision operations">;
+ "Enable double precision operations">;
def Feature64BitPtr : SubtargetFeature<"64BitPtr",
"Is64bit",
"true",
- "Specify if 64bit addressing should be used.">;
-
-def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
- "Is32on64bit",
- "false",
- "Specify if 64bit sized pointers with 32bit addressing should be used.">;
+ "Specify if 64-bit addressing should be used">;
def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
"R600ALUInst",
"false",
- "Older version of ALU instructions encoding.">;
+ "Older version of ALU instructions encoding">;
def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
"HasVertexCache",
"true",
- "Specify use of dedicated vertex cache.">;
+ "Specify use of dedicated vertex cache">;
def FeatureCaymanISA : SubtargetFeature<"caymanISA",
"CaymanISA",
@@ -87,28 +81,40 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
+ "localmemorysize"#Value,
+ "LocalMemorySize",
+ !cast<string>(Value),
+ "The size of local memory in bytes">;
+
class SubtargetFeatureGeneration <string Value,
list<SubtargetFeature> Implies> :
SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
Value#" GPU generation", Implies>;
+def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
+def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
+def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
+
def FeatureR600 : SubtargetFeatureGeneration<"R600",
- [FeatureR600ALUInst, FeatureFetchLimit8]>;
+ [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>;
def FeatureR700 : SubtargetFeatureGeneration<"R700",
- [FeatureFetchLimit16]>;
+ [FeatureFetchLimit16, FeatureLocalMemorySize0]>;
def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
- [FeatureFetchLimit16]>;
+ [FeatureFetchLimit16, FeatureLocalMemorySize32768]>;
def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
- [FeatureFetchLimit16, FeatureWavefrontSize64]>;
+ [FeatureFetchLimit16, FeatureWavefrontSize64,
+ FeatureLocalMemorySize32768]
+>;
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
- [Feature64BitPtr, FeatureFP64]>;
+ [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>;
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
- [Feature64BitPtr, FeatureFP64]>;
+ [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>;
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
@@ -120,6 +126,10 @@ def AMDGPU : Target {
let InstructionSet = AMDGPUInstrInfo;
}
+// Dummy Instruction itineraries for pseudo instructions
+def ALU_NULL : FuncUnit;
+def NullALU : InstrItinClass;
+
//===----------------------------------------------------------------------===//
// Predicate helper class
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 170f479..a6e217b 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -19,6 +19,7 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
@@ -35,6 +36,24 @@
using namespace llvm;
+// TODO: This should get the default rounding mode from the kernel. We just set
+// the default here, but this could change if the OpenCL rounding mode pragmas
+// are used.
+//
+// The denormal mode here should match what is reported by the OpenCL runtime
+// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
+// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
+//
+// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
+// precision, and leaves single precision to flush all and does not report
+// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
+// CL_FP_DENORM for both.
+static uint32_t getFPMode(MachineFunction &) {
+ return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
+ FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
+ FP_DENORM_MODE_SP(FP_DENORM_FLUSH_NONE) |
+ FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE);
+}
static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
MCStreamer &Streamer) {
@@ -92,6 +111,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
false);
+ OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
+ false);
+ OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
+ false);
} else {
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
OutStreamer.emitRawComment(
@@ -279,16 +302,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (VCCUsed)
MaxSGPR += 2;
- ProgInfo.CodeLen = CodeSize;
- ProgInfo.NumSGPR = MaxSGPR;
ProgInfo.NumVGPR = MaxVGPR;
+ ProgInfo.NumSGPR = MaxSGPR;
+
+ // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
+ // register.
+ ProgInfo.FloatMode = getFPMode(MF);
+
+ // XXX: Not quite sure what this does, but sc seems to unset this.
+ ProgInfo.IEEEMode = 0;
+
+ // Do not clamp NAN to 0.
+ ProgInfo.DX10Clamp = 0;
+
+ ProgInfo.CodeLen = CodeSize;
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
const SIProgramInfo &KernelInfo) {
const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
-
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
unsigned RsrcReg;
switch (MFI->ShaderType) {
default: // Fall through
@@ -298,25 +332,41 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
}
- OutStreamer.EmitIntValue(RsrcReg, 4);
- OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
- S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
-
unsigned LDSAlignShift;
if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
- // LDS is allocated in 64 dword blocks
+ // LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;
} else {
- // LDS is allocated in 128 dword blocks
+ // LDS is allocated in 128 dword blocks.
LDSAlignShift = 9;
}
+
unsigned LDSBlocks =
- RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+ RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
if (MFI->ShaderType == ShaderType::COMPUTE) {
+ OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
+
+ const uint32_t ComputePGMRSrc1 =
+ S_00B848_VGPRS(KernelInfo.NumVGPR / 4) |
+ S_00B848_SGPRS(KernelInfo.NumSGPR / 8) |
+ S_00B848_PRIORITY(KernelInfo.Priority) |
+ S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
+ S_00B848_PRIV(KernelInfo.Priv) |
+ S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) |
+ S_00B848_DEBUG_MODE(KernelInfo.DebugMode) |
+ S_00B848_IEEE_MODE(KernelInfo.IEEEMode);
+
+ OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
+
OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
+ } else {
+ OutStreamer.EmitIntValue(RsrcReg, 4);
+ OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
+ S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
}
+
if (MFI->ShaderType == ShaderType::PIXEL) {
OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
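The FloatMode value computed by getFPMode above is four two-bit fields packed into the low byte of the hardware MODE register. A minimal standalone sketch of that packing, assuming the field layout implied by the FP_ROUND_MODE_* / FP_DENORM_MODE_* macros (round modes in bits [3:0], denorm modes in bits [7:4]):

    #include <cstdint>

    // Pack the four two-bit MODE-register fields. Bit positions are an
    // assumption taken from the SIDefines.h-style macros: SP round [1:0],
    // DP round [3:2], SP denorm [5:4], DP denorm [7:6].
    static uint32_t packFPMode(uint32_t RoundSP, uint32_t RoundDP,
                               uint32_t DenormSP, uint32_t DenormDP) {
      return (RoundSP & 0x3) |
             ((RoundDP & 0x3) << 2) |
             ((DenormSP & 0x3) << 4) |
             ((DenormDP & 0x3) << 6);
    }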
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index 71adc9a..c1acb6e 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -25,13 +25,28 @@ class AMDGPUAsmPrinter : public AsmPrinter {
private:
struct SIProgramInfo {
SIProgramInfo() :
- CodeLen(0),
+ NumVGPR(0),
NumSGPR(0),
- NumVGPR(0) {}
+ Priority(0),
+ FloatMode(0),
+ Priv(0),
+ DX10Clamp(0),
+ DebugMode(0),
+ IEEEMode(0),
+ CodeLen(0) {}
+ // Fields set in PGM_RSRC1 pm4 packet.
+ uint32_t NumVGPR;
+ uint32_t NumSGPR;
+ uint32_t Priority;
+ uint32_t FloatMode;
+ uint32_t Priv;
+ uint32_t DX10Clamp;
+ uint32_t DebugMode;
+ uint32_t IEEEMode;
+
+ // Bonus information for debugging.
uint64_t CodeLen;
- unsigned NumSGPR;
- unsigned NumVGPR;
};
void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const;
diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp
deleted file mode 100644
index 91aeee2..0000000
--- a/lib/Target/R600/AMDGPUConvertToISA.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief This pass lowers AMDIL machine instructions to the appropriate
-/// hardware instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUInstrInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-
-using namespace llvm;
-
-namespace {
-
-class AMDGPUConvertToISAPass : public MachineFunctionPass {
-
-private:
- static char ID;
- TargetMachine &TM;
-
-public:
- AMDGPUConvertToISAPass(TargetMachine &tm) :
- MachineFunctionPass(ID), TM(tm) { }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- const char *getPassName() const override {return "AMDGPU Convert to ISA";}
-
-};
-
-} // End anonymous namespace
-
-char AMDGPUConvertToISAPass::ID = 0;
-
-FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) {
- return new AMDGPUConvertToISAPass(tm);
-}
-
-bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) {
- const AMDGPUInstrInfo * TII =
- static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
-
- for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
- BB != BB_E; ++BB) {
- MachineBasicBlock &MBB = *BB;
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- MachineInstr &MI = *I;
- TII->convertToISA(MI, MF, MBB.findDebugLoc(I));
- }
- }
- return false;
-}
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
index e7e90d3..9e8302e 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -83,7 +83,7 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i));
OffsetBytes += MFI->getObjectSize(i);
- // Each regiter holds 4 bytes, so we must always align the offset to at
+ // Each register holds 4 bytes, so we must always align the offset to at
// least 4 bytes, so that 2 frame objects won't share the same register.
OffsetBytes = RoundUpToAlignment(OffsetBytes, 4);
}
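RoundUpToAlignment in the loop above is the usual round-up-to-a-multiple helper. A minimal sketch of the whole offset computation, assuming nothing beyond that definition:

    #include <cstdint>

    // Equivalent of llvm::RoundUpToAlignment: smallest multiple of Align
    // that is >= Value.
    static uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;
    }

    // Advance the running frame offset past one object: pad to the object's
    // own alignment, add its size, then pad to a 4-byte register boundary so
    // two objects never share one 32-bit register.
    static uint64_t nextObjectOffset(uint64_t Offset, uint64_t Size,
                                     uint64_t Align) {
      return roundUpToAlignment(roundUpToAlignment(Offset, Align) + Size, 4);
    }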
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index f1f0bfa..b4d79e5 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -14,6 +14,7 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "SIISelLowering.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -83,6 +84,11 @@ private:
SDValue& Offset);
bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
+ bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset,
+ SDValue &ImmOffset) const;
+
+ SDNode *SelectADD_SUB_I64(SDNode *N);
+ SDNode *SelectDIV_SCALE(SDNode *N);
// Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
@@ -211,51 +217,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
// We are selecting i64 ADD here instead of custom lower it during
// DAG legalization, so we can fold some i64 ADDs used for address
// calculation into the LOAD and STORE instructions.
- case ISD::ADD: {
+ case ISD::ADD:
+ case ISD::SUB: {
if (N->getValueType(0) != MVT::i64 ||
ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
- SDLoc DL(N);
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
-
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
- SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
-
- SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, LHS, Sub0);
- SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, LHS, Sub1);
-
- SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, RHS, Sub0);
- SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, RHS, Sub1);
-
- SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
-
- SmallVector<SDValue, 8> AddLoArgs;
- AddLoArgs.push_back(SDValue(Lo0, 0));
- AddLoArgs.push_back(SDValue(Lo1, 0));
-
- SDNode *AddLo = CurDAG->getMachineNode(
- isCFDepth0() ? AMDGPU::S_ADD_I32 : AMDGPU::V_ADD_I32_e32,
- DL, VTList, AddLoArgs);
- SDValue Carry = SDValue(AddLo, 1);
- SDNode *AddHi = CurDAG->getMachineNode(
- isCFDepth0() ? AMDGPU::S_ADDC_U32 : AMDGPU::V_ADDC_U32_e32,
- DL, MVT::i32, SDValue(Hi0, 0), SDValue(Hi1, 0), Carry);
-
- SDValue Args[5] = {
- CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
- SDValue(AddLo,0),
- Sub0,
- SDValue(AddHi,0),
- Sub1,
- };
- return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
+ return SelectADD_SUB_I64(N);
}
+ case ISD::SCALAR_TO_VECTOR:
+ case AMDGPUISD::BUILD_VERTICAL_VECTOR:
case ISD::BUILD_VECTOR: {
unsigned RegClassID;
const AMDGPURegisterInfo *TRI =
@@ -264,7 +235,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
- assert(VT.getVectorElementType().bitsEq(MVT::i32));
+ EVT EltVT = VT.getVectorElementType();
+ assert(EltVT.bitsEq(MVT::i32));
if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
bool UseVReg = true;
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
@@ -305,7 +277,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
// can't be bundled by our scheduler.
switch(NumVectorElts) {
case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
- case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
+ case 4:
+ if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+ RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
+ else
+ RegClassID = AMDGPU::R600_Reg128RegClassID;
+ break;
default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
}
}
@@ -313,8 +290,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);
if (NumVectorElts == 1) {
- return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
- VT.getVectorElementType(),
+ return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT,
N->getOperand(0), RegClass);
}
@@ -323,11 +299,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
// 16 = Max Num Vector Elements
// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
// 1 = Vector Register Class
- SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(N->getNumOperands() * 2 + 1);
+ SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
bool IsRegSeq = true;
- for (unsigned i = 0; i < N->getNumOperands(); i++) {
+ unsigned NOps = N->getNumOperands();
+ for (unsigned i = 0; i < NOps; i++) {
// XXX: Why is this here?
if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
IsRegSeq = false;
@@ -337,6 +314,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
RegSeqArgs[1 + (2 * i) + 1] =
CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
}
+
+ if (NOps != NumVectorElts) {
+ // Fill in the missing undef elements if this was a scalar_to_vector.
+ assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
+
+ MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ SDLoc(N), EltVT);
+ for (unsigned i = NOps; i < NumVectorElts; ++i) {
+ RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
+ RegSeqArgs[1 + (2 * i) + 1] =
+ CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
+ }
+ }
+
if (!IsRegSeq)
break;
return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
@@ -466,6 +457,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
PackedOffsetWidth);
}
+ case AMDGPUISD::DIV_SCALE: {
+ return SelectDIV_SCALE(N);
+ }
}
return SelectCode(N);
}
@@ -659,6 +653,129 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
return true;
}
+SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ bool IsAdd = (N->getOpcode() == ISD::ADD);
+
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
+ SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
+
+ SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, LHS, Sub0);
+ SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, LHS, Sub1);
+
+ SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, RHS, Sub0);
+ SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, RHS, Sub1);
+
+ SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
+ SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
+
+
+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32;
+ unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+
+ if (!isCFDepth0()) {
+ Opc = IsAdd ? AMDGPU::V_ADD_I32_e32 : AMDGPU::V_SUB_I32_e32;
+ CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e32 : AMDGPU::V_SUBB_U32_e32;
+ }
+
+ SDNode *AddLo = CurDAG->getMachineNode(Opc, DL, VTList, AddLoArgs);
+ SDValue Carry(AddLo, 1);
+ SDNode *AddHi
+ = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32,
+ SDValue(Hi0, 0), SDValue(Hi1, 0), Carry);
+
+ SDValue Args[5] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+ SDValue(AddLo,0),
+ Sub0,
+ SDValue(AddHi,0),
+ Sub1,
+ };
+ return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
+}
+
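What the selected pair of instructions computes, as a standalone sketch: the low 32-bit halves are combined first, and the carry (or borrow) out of that operation feeds the high-half instruction, exactly like S_ADD_I32 followed by S_ADDC_U32. Plain C++, with unsigned wraparound standing in for the hardware carry bit:

    #include <cstdint>

    // 64-bit add/sub split into 32-bit halves with an explicit carry chain.
    static uint64_t addSub64Split(uint64_t A, uint64_t B, bool IsAdd) {
      uint32_t ALo = (uint32_t)A, AHi = (uint32_t)(A >> 32);
      uint32_t BLo = (uint32_t)B, BHi = (uint32_t)(B >> 32);
      uint32_t Lo = IsAdd ? ALo + BLo : ALo - BLo;
      uint32_t Carry = IsAdd ? (Lo < ALo) : (ALo < BLo); // carry-out / borrow
      uint32_t Hi = IsAdd ? AHi + BHi + Carry : AHi - BHi - Carry;
      return ((uint64_t)Hi << 32) | Lo;
    }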
+SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+
+ assert(VT == MVT::f32 || VT == MVT::f64);
+
+ unsigned Opc
+ = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
+
+ const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
+
+ SDValue Ops[] = {
+ N->getOperand(0),
+ N->getOperand(1),
+ N->getOperand(2),
+ Zero,
+ Zero,
+ Zero,
+ Zero
+ };
+
+ return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
+}
+
+static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
+ return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32,
+ Ptr), 0);
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
+ SDValue &Offset,
+ SDValue &ImmOffset) const {
+ SDLoc DL(Addr);
+
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+ if (isUInt<12>(C1->getZExtValue())) {
+
+ if (N0.getOpcode() == ISD::ADD) {
+ // (add (add N2, N3), C1)
+ SDValue N2 = N0.getOperand(0);
+ SDValue N3 = N0.getOperand(1);
+ Ptr = wrapAddr64Rsrc(CurDAG, DL, N2);
+ Offset = N3;
+ ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+ return true;
+ }
+
+ // (add N0, C1)
+ Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64));
+ Offset = N0;
+ ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+ return true;
+ }
+ }
+ if (Addr.getOpcode() == ISD::ADD) {
+ // (add N0, N1)
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ Ptr = wrapAddr64Rsrc(CurDAG, DL, N0);
+ Offset = N1;
+ ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
+ return true;
+ }
+
+ // default case
+ Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64));
+ Offset = Addr;
+ ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
+ return true;
+}
+
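The cases above decompose an address into the three MUBUF addr64 operands: a pointer that gets wrapped into a resource descriptor, a 64-bit register offset, and an unsigned 12-bit immediate. The same decision, sketched over plain integers (the resource wrapping is elided; the 12-bit bound mirrors the isUInt<12> check):

    #include <cstdint>

    struct MUBUFAddr64 {
      uint64_t Ptr;       // base; the real code wraps this into a v4i32 rsrc
      uint64_t Offset;    // 64-bit register offset
      uint16_t ImmOffset; // unsigned 12-bit immediate
    };

    // Fold the constant C into ImmOffset only when it fits in 12 bits;
    // otherwise it has to stay in the register offset.
    static MUBUFAddr64 matchAddr64(uint64_t Base, uint64_t Reg, uint64_t C) {
      if (C < (1u << 12))
        return { Base, Reg, (uint16_t)C };
      return { Base, Reg + C, 0 };
    }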
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 6c443ea..0ada7a3 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -16,9 +16,9 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
-#include "AMDILIntrinsicInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -84,13 +84,37 @@ static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
#include "AMDGPUGenCallingConv.inc"
+// Find a larger type to do a load / store of a vector with.
+EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
+ unsigned StoreSize = VT.getStoreSizeInBits();
+ if (StoreSize <= 32)
+ return EVT::getIntegerVT(Ctx, StoreSize);
+
+ assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
+ return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+}
+
+// Type for a vector that will be loaded to.
+EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
+ unsigned StoreSize = VT.getStoreSizeInBits();
+ if (StoreSize <= 32)
+ return EVT::getIntegerVT(Ctx, 32);
+
+ return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+}
+
AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
TargetLowering(TM, new TargetLoweringObjectFileELF()) {
Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
- // Initialize target lowering borrowed from AMDIL
- InitAMDILLowering();
+ setOperationAction(ISD::Constant, MVT::i32, Legal);
+ setOperationAction(ISD::Constant, MVT::i64, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
// We need to custom lower some of the intrinsics
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -107,9 +131,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FROUND, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- // The hardware supports ROTR, but not ROTL
- setOperationAction(ISD::ROTL, MVT::i32, Expand);
-
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
@@ -118,6 +139,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::v2f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
@@ -161,6 +185,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
+
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
@@ -202,29 +229,63 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+ setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+ setOperationAction(ISD::FRINT, MVT::f64, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
+ }
- setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
- setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
+ if (!Subtarget->hasBFI()) {
+ // fcopysign can be done in a single instruction with BFI.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ }
- setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+ for (MVT VT : ScalarIntVTs) {
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
- setOperationAction(ISD::MUL, MVT::i64, Expand);
- setOperationAction(ISD::SUB, MVT::i64, Expand);
+ // GPU does not have divrem function for signed or unsigned.
+ setOperationAction(ISD::SDIVREM, VT, Custom);
+ setOperationAction(ISD::UDIVREM, VT, Custom);
+
+ // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ }
+
+ if (!Subtarget->hasBCNT(32))
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+ if (!Subtarget->hasBCNT(64))
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+ // The hardware supports 32-bit ROTR, but not ROTL.
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ setOperationAction(ISD::ROTR, MVT::i64, Expand);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
+ setOperationAction(ISD::MUL, MVT::i64, Expand);
+ setOperationAction(ISD::MULHU, MVT::i64, Expand);
+ setOperationAction(ISD::MULHS, MVT::i64, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
- setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i32, Expand);
- setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
- setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- static const MVT::SimpleValueType IntTypes[] = {
+ static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v4i32
};
- for (MVT VT : IntTypes) {
- //Expand the following operations for the current type by default
+ for (MVT VT : VectorIntTypes) {
+ // Expand the following operations for the current type by default.
setOperationAction(ISD::ADD, VT, Expand);
setOperationAction(ISD::AND, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
@@ -232,40 +293,93 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::MUL, VT, Expand);
setOperationAction(ISD::OR, VT, Expand);
setOperationAction(ISD::SHL, VT, Expand);
- setOperationAction(ISD::SINT_TO_FP, VT, Expand);
- setOperationAction(ISD::SRL, VT, Expand);
setOperationAction(ISD::SRA, VT, Expand);
+ setOperationAction(ISD::SRL, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::SUB, VT, Expand);
- setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ // TODO: Implement custom UREM / SREM routines.
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Custom);
+ setOperationAction(ISD::UDIVREM, VT, Custom);
+ setOperationAction(ISD::ADDC, VT, Expand);
+ setOperationAction(ISD::SUBC, VT, Expand);
+ setOperationAction(ISD::ADDE, VT, Expand);
+ setOperationAction(ISD::SUBE, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::XOR, VT, Expand);
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
}
- static const MVT::SimpleValueType FloatTypes[] = {
+ static const MVT::SimpleValueType FloatVectorTypes[] = {
MVT::v2f32, MVT::v4f32
};
- for (MVT VT : FloatTypes) {
+ for (MVT VT : FloatVectorTypes) {
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FADD, VT, Expand);
+ setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FMUL, VT, Expand);
+ setOperationAction(ISD::FMA, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::FSQRT, VT, Expand);
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSUB, VT, Expand);
+ setOperationAction(ISD::FNEG, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
}
+ setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
+ setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
+
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT_CC);
+
+ setSchedulingPreference(Sched::RegPressure);
+ setJumpIsExpensive(true);
+
+ setSelectIsExpensive(false);
+ PredictableSelectIsExpensive = false;
+
+ // There are no integer divide instructions, and these expand to a pretty
+ // large sequence of instructions.
+ setIntDivIsCheap(false);
+ setPow2DivIsCheap(false);
+
+ // TODO: Investigate this when 64-bit divides are implemented.
+ addBypassSlowDiv(64, 32);
+
+ // FIXME: Need to really handle these.
+ MaxStoresPerMemcpy = 4096;
+ MaxStoresPerMemmove = 4096;
+ MaxStoresPerMemset = 4096;
}
//===----------------------------------------------------------------------===//
@@ -276,6 +390,23 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const {
return MVT::i32;
}
+bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
+ return true;
+}
+
+// The backend supports 32 and 64 bit floating point immediates.
+// FIXME: Why are we reporting vectors of FP immediates as legal?
+bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+ EVT ScalarVT = VT.getScalarType();
+ return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
+}
+
+// We don't want to shrink f64 / f32 constants.
+bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
+ EVT ScalarVT = VT.getScalarType();
+ return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
+}
+
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
EVT CastTy) const {
if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
@@ -330,6 +461,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
return Src == MVT::i32 && Dest == MVT::i64;
}
+bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ return isZExtFree(Val.getValueType(), VT2);
+}
+
bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
@@ -383,25 +518,28 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
return SDValue();
}
-SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
- const {
+SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
Op.getNode()->dump();
llvm_unreachable("Custom lowering code for this"
"instruction is not implemented yet!");
break;
- // AMDIL DAG lowering
- case ISD::SDIV: return LowerSDIV(Op, DAG);
- case ISD::SREM: return LowerSREM(Op, DAG);
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
- case ISD::BRCOND: return LowerBRCOND(Op, DAG);
- // AMDGPU DAG lowering
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::SDIV: return LowerSDIV(Op, DAG);
+ case ISD::SREM: return LowerSREM(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
+ case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
+ case ISD::FCEIL: return LowerFCEIL(Op, DAG);
+ case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
+ case ISD::FRINT: return LowerFRINT(Op, DAG);
+ case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
+ case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
}
return Op;
@@ -419,95 +557,23 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
// ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
// nothing here and let the illegal result integer be handled normally.
return;
- case ISD::UDIV: {
- SDValue Op = SDValue(N, 0);
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
- N->getOperand(0), N->getOperand(1));
- Results.push_back(UDIVREM);
- break;
- }
- case ISD::UREM: {
- SDValue Op = SDValue(N, 0);
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
- N->getOperand(0), N->getOperand(1));
- Results.push_back(UDIVREM.getValue(1));
- break;
- }
- case ISD::UDIVREM: {
- SDValue Op = SDValue(N, 0);
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
-
- SDValue one = DAG.getConstant(1, HalfVT);
- SDValue zero = DAG.getConstant(0, HalfVT);
-
- //HiLo split
- SDValue LHS = N->getOperand(0);
- SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
- SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
-
- SDValue RHS = N->getOperand(1);
- SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
- SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
-
- // Get Speculative values
- SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
- SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
-
- SDValue REM_Hi = zero;
- SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
-
- SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
- SDValue DIV_Lo = zero;
-
- const unsigned halfBitWidth = HalfVT.getSizeInBits();
-
- for (unsigned i = 0; i < halfBitWidth; ++i) {
- SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
- // Get Value of high bit
- SDValue HBit;
- if (halfBitWidth == 32 && Subtarget->hasBFE()) {
- HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
- } else {
- HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
- HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
- }
-
- SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
- DAG.getConstant(halfBitWidth - 1, HalfVT));
- REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
- REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
-
- REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
- REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
-
-
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
-
- SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
- SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
-
- DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
-
- // Update REM
-
- SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
-
- REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
- REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
- REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
- }
+ case ISD::LOAD: {
+ SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
+ if (!Node)
+ return;
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
- SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
- Results.push_back(DIV);
- Results.push_back(REM);
- break;
+ Results.push_back(SDValue(Node, 0));
+ Results.push_back(SDValue(Node, 1));
+ // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
+ // function
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
+ return;
+ }
+ case ISD::STORE: {
+ SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
+ if (Lowered.getNode())
+ Results.push_back(Lowered);
+ return;
}
default:
return;
@@ -531,12 +597,14 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
SelectionDAG &DAG) const {
const DataLayout *TD = getTargetMachine().getDataLayout();
SDLoc DL(InitPtr);
+ Type *InitTy = Init->getType();
+
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
- EVT VT = EVT::getEVT(CI->getType());
- PointerType *PtrTy = PointerType::get(CI->getType(), 0);
- return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
- TD->getPrefTypeAlignment(CI->getType()));
+ EVT VT = EVT::getEVT(InitTy);
+ PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
+ return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
+ MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+ TD->getPrefTypeAlignment(InitTy));
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
@@ -547,7 +615,6 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
TD->getPrefTypeAlignment(CFP->getType()));
}
- Type *InitTy = Init->getType();
if (StructType *ST = dyn_cast<StructType>(InitTy)) {
const StructLayout *SL = TD->getStructLayout(ST);
@@ -589,6 +656,14 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
+ if (isa<UndefValue>(Init)) {
+ EVT VT = EVT::getEVT(InitTy);
+ PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
+ return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
+ MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+ TD->getPrefTypeAlignment(InitTy));
+ }
+
Init->dump();
llvm_unreachable("Unhandled constant initializer");
}
@@ -628,11 +703,19 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
unsigned Size = TD->getTypeAllocSize(EltType);
unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+ MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
+ MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+
+ int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
+ SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
+
const GlobalVariable *Var = cast<GlobalVariable>(GV);
+ if (!Var->hasInitializer()) {
+ // This has no use, but bugpoint will hit it.
+ return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
+ }
+
const Constant *Init = Var->getInitializer();
- int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
- SDValue InitPtr = DAG.getFrameIndex(FI,
- getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
SmallVector<SDNode*, 8> WorkList;
for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
@@ -651,8 +734,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
}
DAG.UpdateNodeOperands(*I, Ops);
}
- return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op),
- getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+ return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
}
}
}
@@ -688,8 +770,7 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
const AMDGPUFrameLowering *TFL =
static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
- FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
- assert(FIN);
+ FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FIN->getIndex();
unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
@@ -705,26 +786,66 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
default: return Op;
- case AMDGPUIntrinsic::AMDIL_abs:
+ case AMDGPUIntrinsic::AMDGPU_abs:
+ case AMDGPUIntrinsic::AMDIL_abs: // Legacy name.
return LowerIntrinsicIABS(Op, DAG);
- case AMDGPUIntrinsic::AMDIL_exp:
- return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
case AMDGPUIntrinsic::AMDGPU_lrp:
return LowerIntrinsicLRP(Op, DAG);
- case AMDGPUIntrinsic::AMDIL_fraction:
+ case AMDGPUIntrinsic::AMDGPU_fract:
+ case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
- case AMDGPUIntrinsic::AMDIL_max:
- return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
+
+ case AMDGPUIntrinsic::AMDGPU_clamp:
+ case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
+ return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::AMDGPU_div_scale: {
+ // 3rd parameter required to be a constant.
+ const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+ if (!Param)
+ return DAG.getUNDEF(VT);
+
+ // Translate to the operands expected by the machine instruction. The
+ // first source operand must match the operand that the constant third
+ // parameter selects for scaling (the numerator or the denominator).
+ SDValue Numerator = Op.getOperand(1);
+ SDValue Denominator = Op.getOperand(2);
+ SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+
+ return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
+ Src0, Denominator, Numerator);
+ }
+
+ case Intrinsic::AMDGPU_div_fmas:
+ return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::AMDGPU_div_fixup:
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::AMDGPU_trig_preop:
+ return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::AMDGPU_rcp:
+ return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::AMDGPU_rsq:
+ return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
+ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::AMDGPU_rsq_clamped:
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+
case AMDGPUIntrinsic::AMDGPU_imax:
return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_umax:
return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
Op.getOperand(2));
- case AMDGPUIntrinsic::AMDIL_min:
- return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_imin:
return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
Op.getOperand(2));
@@ -748,6 +869,18 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));
+
case AMDGPUIntrinsic::AMDGPU_bfe_i32:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
Op.getOperand(1),
@@ -771,8 +904,16 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1),
Op.getOperand(2));
- case AMDGPUIntrinsic::AMDIL_round_nearest:
+ case AMDGPUIntrinsic::AMDGPU_brev:
+ return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
+ return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
+ case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
+ return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
}
}
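The div_scale operand shuffle, spelled out: the constant third argument of the intrinsic selects which input is being scaled, and that input is duplicated into src0 of the machine node while the denominator and numerator always occupy src1 and src2. A sketch with plain doubles (names are illustrative):

    #include <array>

    // Returns (src0, src1, src2) for DIV_SCALE.
    static std::array<double, 3> divScaleOperands(double Num, double Den,
                                                  bool ScaleNumerator) {
      double Src0 = ScaleNumerator ? Num : Den;
      return { Src0, Den, Num };
    }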
@@ -863,27 +1004,41 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
SelectionDAG &DAG) const {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
+ EVT LoadVT = Op.getValueType();
EVT EltVT = Op.getValueType().getVectorElementType();
EVT PtrVT = Load->getBasePtr().getValueType();
+
unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
SmallVector<SDValue, 8> Loads;
+ SmallVector<SDValue, 8> Chains;
+
SDLoc SL(Op);
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
- Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
- Load->getChain(), Ptr,
- MachinePointerInfo(Load->getMemOperand()->getValue()),
- MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->getAlignment()));
+
+ SDValue NewLoad
+ = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
+ Load->getChain(), Ptr,
+ MachinePointerInfo(Load->getMemOperand()->getValue()),
+ MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
+ Load->getAlignment());
+ Loads.push_back(NewLoad.getValue(0));
+ Chains.push_back(NewLoad.getValue(1));
}
- return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
+ DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
+ };
+
+ return DAG.getMergeValues(Ops, SL);
}
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
SelectionDAG &DAG) const {
- StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
+ StoreSDNode *Store = cast<StoreSDNode>(Op);
EVT MemVT = Store->getMemoryVT();
unsigned MemBits = MemVT.getSizeInBits();
@@ -981,7 +1136,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Load->getBasePtr(),
MemVT,
Load->getMemOperand());
- return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
+ ExtLoad32.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
@@ -995,7 +1156,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
BasePtr, MVT::i8, MMO);
- return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
+ NewLD.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
// Lower loads constant address space global variable loads
@@ -1003,11 +1170,12 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
isa<GlobalVariable>(
GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
+
SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
DAG.getConstant(2, MVT::i32));
- return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+ return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
Load->getChain(), Ptr,
DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
}
@@ -1034,10 +1202,21 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT MemEltVT = MemVT.getScalarType();
if (ExtType == ISD::SEXTLOAD) {
SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
+ Load->getChain()
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
- return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
+ SDValue Ops[] = {
+ DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
+ Load->getChain()
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -1097,6 +1276,251 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT OVT = Op.getValueType();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ MVT INTTY;
+ MVT FLTTY;
+ if (!OVT.isVector()) {
+ INTTY = MVT::i32;
+ FLTTY = MVT::f32;
+ } else if (OVT.getVectorNumElements() == 2) {
+ INTTY = MVT::v2i32;
+ FLTTY = MVT::v2f32;
+ } else if (OVT.getVectorNumElements() == 4) {
+ INTTY = MVT::v4i32;
+ FLTTY = MVT::v4f32;
+ }
+ unsigned bitsize = OVT.getScalarType().getSizeInBits();
+ // char|short jq = ia ^ ib;
+ SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
+
+ // jq = jq >> (bitsize - 2)
+ jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
+
+ // jq = jq | 0x1
+ jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
+
+ // jq = (int)jq
+ jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
+
+ // int ia = (int)LHS;
+ SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
+
+ // int ib = (int)RHS;
+ SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
+
+ // float fa = (float)ia;
+ SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
+
+ // float fb = (float)ib;
+ SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
+
+ // float fq = native_divide(fa, fb);
+ SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY,
+ fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb));
+
+ // fq = trunc(fq);
+ fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
+
+ // float fqneg = -fq;
+ SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
+
+ // float fr = mad(fqneg, fb, fa);
+ SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
+ DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa);
+
+ // int iq = (int)fq;
+ SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
+
+ // fr = fabs(fr);
+ fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
+
+ // fb = fabs(fb);
+ fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
+
+ // int cv = fr >= fb;
+ SDValue cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
+ // jq = (cv ? jq : 0);
+ jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
+ DAG.getConstant(0, OVT));
+ // dst = iq + jq;
+ iq = DAG.getSExtOrTrunc(iq, DL, OVT);
+ iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
+ return iq;
+}
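+
+// For reference only: a scalar model of the sequence built above, shown for
+// 16-bit operands (the i8/i16 case handled by LowerSDIV). Illustrative, not
+// part of the lowering; assumes <cmath>/<cstdint> and an arithmetic >> on
+// negative values, matching the DAG's SRA.
+static int16_t sdiv24Reference(int16_t ia, int16_t ib) {
+  int32_t jq = ((ia ^ ib) >> 14) | 1;          // quotient sign: -1 or +1
+  float fa = (float)ia;                        // SINT_TO_FP
+  float fb = (float)ib;
+  float fq = std::trunc(fa * (1.0f / fb));     // FMUL by RCP, then FTRUNC
+  float fr = -fq * fb + fa;                    // mad(fqneg, fb, fa)
+  int32_t iq = (int32_t)fq;                    // FP_TO_SINT
+  // If |fr| >= |fb|, fq was rounded one step too far toward zero.
+  if (std::fabs(fr) >= std::fabs(fb))
+    iq += jq;
+  return (int16_t)iq;
+}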
+
+SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT OVT = Op.getValueType();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+  // The LowerSDIV32 function generates code equivalent to the following IL:
+ // mov r0, LHS
+ // mov r1, RHS
+ // ilt r10, r0, 0
+ // ilt r11, r1, 0
+ // iadd r0, r0, r10
+ // iadd r1, r1, r11
+ // ixor r0, r0, r10
+ // ixor r1, r1, r11
+ // udiv r0, r0, r1
+ // ixor r10, r10, r11
+ // iadd r0, r0, r10
+ // ixor DST, r0, r10
+
+ // mov r0, LHS
+ SDValue r0 = LHS;
+
+ // mov r1, RHS
+ SDValue r1 = RHS;
+
+ // ilt r10, r0, 0
+ SDValue r10 = DAG.getSelectCC(DL,
+ r0, DAG.getConstant(0, OVT),
+ DAG.getConstant(-1, OVT),
+ DAG.getConstant(0, OVT),
+ ISD::SETLT);
+
+ // ilt r11, r1, 0
+ SDValue r11 = DAG.getSelectCC(DL,
+ r1, DAG.getConstant(0, OVT),
+ DAG.getConstant(-1, OVT),
+ DAG.getConstant(0, OVT),
+ ISD::SETLT);
+
+ // iadd r0, r0, r10
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+ // iadd r1, r1, r11
+ r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+ // ixor r0, r0, r10
+ r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+ // ixor r1, r1, r11
+ r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+ // udiv r0, r0, r1
+ r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+ // ixor r10, r10, r11
+ r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
+
+ // iadd r0, r0, r10
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+ // ixor DST, r0, r10
+ SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+ return DST;
+}
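+
+// For reference only: the sequence above computes
+//   sdiv(a, b) = sgn(a ^ b) * udiv(|a|, |b|)
+// using the two's-complement identity |x| = (x + s) ^ s, where s is -1 for
+// negative x and 0 otherwise. A scalar model (illustrative, assumes
+// <cstdint>):
+static int32_t sdiv32Reference(int32_t a, int32_t b) {
+  int32_t sa = a < 0 ? -1 : 0;                          // r10
+  int32_t sb = b < 0 ? -1 : 0;                          // r11
+  uint32_t ua = (uint32_t)((a + sa) ^ sa);              // |a|
+  uint32_t ub = (uint32_t)((b + sb) ^ sb);              // |b|
+  uint32_t s = (uint32_t)(sa ^ sb);                     // quotient sign mask
+  return (int32_t)((ua / ub + s) ^ s);                  // conditional negate
+}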
+
+SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
+ return SDValue(Op.getNode(), 0);
+}
+
+SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+ EVT OVT = Op.getValueType().getScalarType();
+
+ if (OVT == MVT::i64)
+ return LowerSDIV64(Op, DAG);
+
+  if (OVT == MVT::i32)
+ return LowerSDIV32(Op, DAG);
+
+ if (OVT == MVT::i16 || OVT == MVT::i8) {
+ // FIXME: We should be checking for the masked bits. This isn't reached
+ // because i8 and i16 are not legal types.
+ return LowerSDIV24(Op, DAG);
+ }
+
+ return SDValue(Op.getNode(), 0);
+}
+
+SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT OVT = Op.getValueType();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+  // The LowerSREM32 function generates code equivalent to the following IL:
+ // mov r0, LHS
+ // mov r1, RHS
+ // ilt r10, r0, 0
+ // ilt r11, r1, 0
+ // iadd r0, r0, r10
+ // iadd r1, r1, r11
+ // ixor r0, r0, r10
+ // ixor r1, r1, r11
+ // udiv r20, r0, r1
+ // umul r20, r20, r1
+ // sub r0, r0, r20
+ // iadd r0, r0, r10
+ // ixor DST, r0, r10
+
+ // mov r0, LHS
+ SDValue r0 = LHS;
+
+ // mov r1, RHS
+ SDValue r1 = RHS;
+
+ // ilt r10, r0, 0
+ SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
+
+ // ilt r11, r1, 0
+ SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
+
+ // iadd r0, r0, r10
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+ // iadd r1, r1, r11
+ r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+ // ixor r0, r0, r10
+ r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+ // ixor r1, r1, r11
+ r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+ // udiv r20, r0, r1
+  SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+ // umul r20, r20, r1
+ r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
+
+ // sub r0, r0, r20
+ r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
+
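+  // Only r10 (the sign of LHS) feeds the final fixup: the remainder takes
+  // the sign of the dividend, while the quotient sign would depend on both.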
+ // iadd r0, r0, r10
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+ // ixor DST, r0, r10
+ SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+ return DST;
+}
+
+SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
+ return SDValue(Op.getNode(), 0);
+}
+
+SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
+ EVT OVT = Op.getValueType();
+
+ if (OVT.getScalarType() == MVT::i64)
+ return LowerSREM64(Op, DAG);
+
+ if (OVT.getScalarType() == MVT::i32)
+ return LowerSREM32(Op, DAG);
+
+ return SDValue(Op.getNode(), 0);
+}
+
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -1201,6 +1625,177 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
return DAG.getMergeValues(Ops, DL);
}
+SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue NegOne = DAG.getConstant(-1, VT);
+
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
+ SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
+ SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
+ SDValue RSign = LHSign; // Remainder sign is the same as LHS
+
+ LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
+ RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
+
+ LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
+ RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
+
+ SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
+ SDValue Rem = Div.getValue(1);
+
+ Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
+ Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
+
+ Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
+ Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
+
+ SDValue Res[2] = {
+ Div,
+ Rem
+ };
+ return DAG.getMergeValues(Res, DL);
+}
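+
+// For reference only: a scalar model of the combined lowering above. The
+// quotient is negated when the operand signs differ; the remainder always
+// takes the sign of the dividend. Illustrative, assumes <cstdint>.
+static void sdivremReference(int32_t a, int32_t b, int32_t &q, int32_t &r) {
+  int32_t sa = a < 0 ? -1 : 0;                          // LHSign
+  int32_t sb = b < 0 ? -1 : 0;                          // RHSign
+  int32_t sq = sa ^ sb;                                 // DSign
+  uint32_t ua = (uint32_t)((a + sa) ^ sa);
+  uint32_t ub = (uint32_t)((b + sb) ^ sb);
+  q = (int32_t)((ua / ub) ^ (uint32_t)sq) - sq;         // xor, then sub
+  r = (int32_t)((ua % ub) ^ (uint32_t)sa) - sa;
+}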
+
+SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ // result = trunc(src)
+ // if (src > 0.0 && src != result)
+ // result += 1.0
+
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+ const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
+ const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
+
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+
+  SDValue Gt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
+  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Gt0, NeTrunc);
+
+ SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
+ return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
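+
+// For reference only: the scalar computation this lowering performs. FTRUNC
+// rounds toward zero, so only positive non-integral inputs need the +1.0
+// correction. Illustrative, assumes <cmath>.
+static double fceilReference(double x) {
+  double t = std::trunc(x);
+  return (x > 0.0 && x != t) ? t + 1.0 : t;
+}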
+
+SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ assert(Op.getValueType() == MVT::f64);
+
+ const SDValue Zero = DAG.getConstant(0, MVT::i32);
+ const SDValue One = DAG.getConstant(1, MVT::i32);
+
+ SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+ // Extract the upper half, since this is where we will find the sign and
+ // exponent.
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
+
+ const unsigned FractBits = 52;
+ const unsigned ExpBits = 11;
+
+  // Extract the exponent. The field is unsigned, so use BFE_U32; BFE_I32
+  // would sign-extend biased exponents >= 1024 and break the checks below.
+  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+ Hi,
+ DAG.getConstant(FractBits - 32, MVT::i32),
+ DAG.getConstant(ExpBits, MVT::i32));
+ SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+ DAG.getConstant(1023, MVT::i32));
+
+ // Extract the sign bit.
+ const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
+ SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
+
+  // Extend back to 64 bits.
+ SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ Zero, SignBit);
+ SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
+
+ SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
+ const SDValue FractMask
+ = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64);
+
+ SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
+ SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
+ SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
+
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+
+ const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32);
+
+ SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+ SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+
+ SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
+ SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
+
+ return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
+}
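+
+// For reference only: a scalar model of the bit manipulation above.
+// Illustrative, assumes <cstdint>/<cstring>.
+static double ftruncReference(double x) {
+  uint64_t Bits;
+  std::memcpy(&Bits, &x, sizeof(Bits));
+  int32_t Exp = (int32_t)((Bits >> 52) & 0x7ff) - 1023;  // unbiased exponent
+  if (Exp < 0)
+    Bits &= UINT64_C(1) << 63;       // |x| < 1.0 truncates to a signed zero
+  else if (Exp <= 51)
+    Bits &= ~(((UINT64_C(1) << 52) - 1) >> Exp);  // clear sub-integer bits
+  // Exp > 51: already integral (or Inf/NaN); pass through unchanged.
+  std::memcpy(&x, &Bits, sizeof(x));
+  return x;
+}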
+
+SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ assert(Op.getValueType() == MVT::f64);
+
+ APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
+ SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64);
+ SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
+
+ SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
+ SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
+
+ SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
+
+ APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
+ SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64);
+
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+ SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
+
+ return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
+}
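+
+// For reference only: adding and subtracting copysign(2^52, x) forces the
+// hardware to round x to an integer in the default round-to-nearest-even
+// mode; values already too large for the trick pass through. Illustrative,
+// assumes <cmath>.
+static double frintReference(double x) {
+  double C = std::copysign(0x1.0p+52, x);
+  double Rounded = (x + C) - C;
+  return (std::fabs(x) > 0x1.fffffffffffffp+51) ? x : Rounded;
+}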
+
+SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
+ // FNEARBYINT and FRINT are the same, except in their handling of FP
+ // exceptions. Those aren't really meaningful for us, and OpenCL only has
+ // rint, so just treat them as equivalent.
+ return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
+}
+
+SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ // result = trunc(src);
+ // if (src < 0.0 && src != result)
+ // result += -1.0.
+
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+ const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
+ const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64);
+
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+
+ SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
+ SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+ SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
+
+ SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
+ return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
+
SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue S0 = Op.getOperand(0);
@@ -1218,7 +1813,6 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
-
}
SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
@@ -1303,6 +1897,37 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
return DAG.getConstant(Src0 >> Offset, MVT::i32);
}
+SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+
+ if (VT.isVector() || VT.getSizeInBits() > 32)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Mul;
+
+ if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
+ N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+ Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
+ } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
+ N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+ Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
+ } else {
+ return SDValue();
+ }
+
+ // We need to use sext even for MUL_U24, because MUL_U24 is used
+ // for signed multiply of 8 and 16-bit types.
+ return DAG.getSExtOrTrunc(Mul, DL, VT);
+}
+
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -1310,34 +1935,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
switch(N->getOpcode()) {
default: break;
- case ISD::MUL: {
- EVT VT = N->getValueType(0);
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDValue Mul;
-
- // FIXME: Add support for 24-bit multiply with 64-bit output on SI.
- if (VT.isVector() || VT.getSizeInBits() > 32)
- break;
-
- if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
- N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
- N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
- Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
- } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
- N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
- N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
- Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
- } else {
- break;
- }
-
- // We need to use sext even for MUL_U24, because MUL_U24 is used
- // for signed multiply of 8 and 16-bit types.
- SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT);
-
- return Reg;
- }
+ case ISD::MUL:
+ return performMulCombine(N, DCI);
case AMDGPUISD::MUL_I24:
case AMDGPUISD::MUL_U24: {
SDValue N0 = N->getOperand(0);
@@ -1511,29 +2110,38 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
// AMDIL DAG nodes
NODE_NAME_CASE(CALL);
NODE_NAME_CASE(UMUL);
- NODE_NAME_CASE(DIV_INF);
NODE_NAME_CASE(RET_FLAG);
NODE_NAME_CASE(BRANCH_COND);
// AMDGPU DAG nodes
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
+ NODE_NAME_CASE(CLAMP)
NODE_NAME_CASE(FMAX)
NODE_NAME_CASE(SMAX)
NODE_NAME_CASE(UMAX)
NODE_NAME_CASE(FMIN)
NODE_NAME_CASE(SMIN)
NODE_NAME_CASE(UMIN)
+ NODE_NAME_CASE(URECIP)
+ NODE_NAME_CASE(DIV_SCALE)
+ NODE_NAME_CASE(DIV_FMAS)
+ NODE_NAME_CASE(DIV_FIXUP)
+ NODE_NAME_CASE(TRIG_PREOP)
+ NODE_NAME_CASE(RCP)
+ NODE_NAME_CASE(RSQ)
+ NODE_NAME_CASE(RSQ_LEGACY)
+ NODE_NAME_CASE(RSQ_CLAMPED)
+ NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(BFE_U32)
NODE_NAME_CASE(BFE_I32)
NODE_NAME_CASE(BFI)
NODE_NAME_CASE(BFM)
+ NODE_NAME_CASE(BREV)
NODE_NAME_CASE(MUL_U24)
NODE_NAME_CASE(MUL_I24)
NODE_NAME_CASE(MAD_U24)
NODE_NAME_CASE(MAD_I24)
- NODE_NAME_CASE(URECIP)
- NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
@@ -1544,6 +2152,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SAMPLEB)
NODE_NAME_CASE(SAMPLED)
NODE_NAME_CASE(SAMPLEL)
+ NODE_NAME_CASE(CVT_F32_UBYTE0)
+ NODE_NAME_CASE(CVT_F32_UBYTE1)
+ NODE_NAME_CASE(CVT_F32_UBYTE2)
+ NODE_NAME_CASE(CVT_F32_UBYTE3)
+ NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
}
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index d5d821d..98a92ad 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -42,10 +42,33 @@ private:
SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
/// \brief Split a vector store into multiple scalar stores.
/// \returns The resulting chain.
+
+ SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue ExpandSIGN_EXTEND_INREG(SDValue Op,
+ unsigned BitsDiff,
+ SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
protected:
+ static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
+ static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
@@ -61,6 +84,7 @@ protected:
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
@@ -87,10 +111,16 @@ public:
bool isZExtFree(Type *Src, Type *Dest) const override;
bool isZExtFree(EVT Src, EVT Dest) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
MVT getVectorIdxTy() const override;
+ bool isSelectSupported(SelectSupportKind) const override;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool ShouldShrinkFPConstant(EVT VT) const override;
+
bool isLoadBitCastBeneficial(EVT, EVT) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -101,6 +131,7 @@ public:
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
void ReplaceNodeResults(SDNode * N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
@@ -128,38 +159,6 @@ public:
SDValue Op,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
-
-// Functions defined in AMDILISelLowering.cpp
-public:
- bool getTgtMemIntrinsic(IntrinsicInfo &Info,
- const CallInst &I, unsigned Intrinsic) const override;
-
- /// We want to mark f32/f64 floating point values as legal.
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
-
- /// We don't want to shrink f64/f32 constants.
- bool ShouldShrinkFPConstant(EVT VT) const override;
-
- SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
-
-private:
- void InitAMDILLowering();
- SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
-
- SDValue ExpandSIGN_EXTEND_INREG(SDValue Op,
- unsigned BitsDiff,
- SelectionDAG &DAG) const;
- SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
- EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
- SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
};
namespace AMDGPUISD {
@@ -169,12 +168,15 @@ enum {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
CALL, // Function call based on a single integer
UMUL, // 32bit unsigned multiplication
- DIV_INF, // Divide with infinity returned on zero divisor
RET_FLAG,
BRANCH_COND,
// End AMDIL ISD Opcodes
DWORDADDR,
FRACT,
+ CLAMP,
+
+ // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
+ // Denormals handled on some parts.
COS_HW,
SIN_HW,
FMAX,
@@ -184,11 +186,23 @@ enum {
SMIN,
UMIN,
URECIP,
+ DIV_SCALE,
+ DIV_FMAS,
+ DIV_FIXUP,
+ TRIG_PREOP, // 1 ULP max error for f64
+
+ // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
+ // For f64, max error 2^29 ULP, handles denormals.
+ RCP,
+ RSQ,
+ RSQ_LEGACY,
+ RSQ_CLAMPED,
DOT4,
BFE_U32, // Extract range of bits with zero extension to 32-bits.
BFE_I32, // Extract range of bits with sign extension to 32-bits.
BFI, // (src0 & src1) | (~src0 & src2)
BFM, // Insert a range of bits into a 32-bit word.
+ BREV, // Reverse bits.
MUL_U24,
MUL_I24,
MAD_U24,
@@ -203,6 +217,21 @@ enum {
SAMPLEB,
SAMPLED,
SAMPLEL,
+
+ // These cvt_f32_ubyte* nodes need to remain consecutive and in order.
+ CVT_F32_UBYTE0,
+ CVT_F32_UBYTE1,
+ CVT_F32_UBYTE2,
+ CVT_F32_UBYTE3,
+ /// This node is for VLIW targets and it is used to represent a vector
+ /// that is stored in consecutive registers with the same channel.
+ /// For example:
+ /// |X |Y|Z|W|
+ /// T0|v.x| | | |
+ /// T1|v.y| | | |
+ /// T2|v.z| | | |
+ /// T3|v.w| | | |
+ BUILD_VERTICAL_VECTOR,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
STORE_MSKOR,
LOAD_CONSTANT,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index 1c3361a..fef5b8c 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -30,8 +30,8 @@ using namespace llvm;
// Pin the vtable to this file.
void AMDGPUInstrInfo::anchor() {}
-AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm)
- : AMDGPUGenInstrInfo(-1,-1), RI(tm), TM(tm) { }
+AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st)
+ : AMDGPUGenInstrInfo(-1,-1), RI(st), ST(st) { }
const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
return RI;
@@ -320,33 +320,11 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
return -1;
}
- Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
+ Offset = MF.getTarget().getFrameLowering()->getFrameIndexOffset(MF, -1);
return getIndirectIndexBegin(MF) + Offset;
}
-
-void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
- DebugLoc DL) const {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const AMDGPURegisterInfo & RI = getRegisterInfo();
-
- for (unsigned i = 0; i < MI.getNumOperands(); i++) {
- MachineOperand &MO = MI.getOperand(i);
- // Convert dst regclass to one that is supported by the ISA
- if (MO.isReg() && MO.isDef()) {
- if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
- const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg());
- const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass);
-
- assert(newRegClass);
-
- MRI.setRegClass(MO.getReg(), newRegClass);
- }
- }
- }
-}
-
int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
switch (Channels) {
default: return Opcode;
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index 74baf6b..95dc8c1 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -33,7 +33,7 @@
namespace llvm {
-class AMDGPUTargetMachine;
+class AMDGPUSubtarget;
class MachineFunction;
class MachineInstr;
class MachineInstrBuilder;
@@ -45,9 +45,9 @@ private:
MachineBasicBlock &MBB) const;
virtual void anchor();
protected:
- TargetMachine &TM;
+ const AMDGPUSubtarget &ST;
public:
- explicit AMDGPUInstrInfo(TargetMachine &tm);
+ explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
@@ -137,14 +137,6 @@ public:
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
// Helper functions that check the opcode for status information
- bool isLoadInst(llvm::MachineInstr *MI) const;
- bool isExtLoadInst(llvm::MachineInstr *MI) const;
- bool isSWSExtLoadInst(llvm::MachineInstr *MI) const;
- bool isSExtLoadInst(llvm::MachineInstr *MI) const;
- bool isZExtLoadInst(llvm::MachineInstr *MI) const;
- bool isAExtLoadInst(llvm::MachineInstr *MI) const;
- bool isStoreInst(llvm::MachineInstr *MI) const;
- bool isTruncStoreInst(llvm::MachineInstr *MI) const;
bool isRegisterStore(const MachineInstr &MI) const;
bool isRegisterLoad(const MachineInstr &MI) const;
@@ -185,11 +177,6 @@ public:
unsigned ValueReg, unsigned Address,
unsigned OffsetReg) const = 0;
-
- /// \brief Convert the AMDIL MachineInstr to a supported ISA
- /// MachineInstr
- void convertToISA(MachineInstr & MI, MachineFunction &MF, DebugLoc DL) const;
-
/// \brief Build a MOV instruction.
virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index f96dbb4..934d59d 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -19,6 +19,14 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
]>;
+def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
+>;
+
+def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
+ [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
+>;
+
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -29,11 +37,25 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
// out = a - floor(a)
def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
+// out = 1.0 / a
+def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a) result clamped to +/- max_float.
+def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
+
// out = max(a, b) a and b are floats
def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]
>;
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
+
// out = max(a, b) a and b are signed ints
def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]
@@ -59,12 +81,38 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]
>;
+
+def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
+ SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
+ SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
+ SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
+ SDTIntToFPOp, []>;
+
// urecip - This operation is a helper for integer division; it returns the
// result of 1 / a as a fractional unsigned integer.
// out = (2^32 / a) + e, where e is the rounding error.
def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
+// Special case divide preop and flags.
+def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
+
+// Special case divide FMA with scale and flags (src0 = Quotient,
+// src1 = Denominator, src2 = Numerator).
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
+
+// Single or double precision division fixup.
+// Special case divide fixup and flags (src0 = Quotient, src1 =
+// Denominator, src2 = Numerator).
+def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
+
+// Look up 2.0 / pi (src0) with segment select src1[4:0].
+def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
+
def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
[SDNPHasChain, SDNPMayLoad]>;
@@ -92,6 +140,8 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
+def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>;
+
// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored when
// performing the multiply. The result is a 32-bit value.
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
@@ -107,3 +157,22 @@ def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp,
def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
[]
>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control Profile Types
+//===----------------------------------------------------------------------===//
+// Branch profile: operand 0 is the target basic block, operand 1 the condition.
+def SDTIL_BRCond : SDTypeProfile<0, 2, [
+ SDTCisVT<0, OtherVT>
+ ]>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// Call/Return DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 80bdf5b..b86b781 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -49,6 +49,11 @@ def u8imm : Operand<i8> {
let PrintMethod = "printU8ImmOperand";
}
+//===--------------------------------------------------------------------===//
+// Custom Operands
+//===--------------------------------------------------------------------===//
+def brtarget : Operand<OtherVT>;
+
//===----------------------------------------------------------------------===//
// PatLeafs for floating-point comparisons
//===----------------------------------------------------------------------===//
@@ -127,6 +132,21 @@ def COND_NULL : PatLeaf <
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
+def global_store : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+// Global address space loads
+def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+// Constant address space loads
+def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
LoadSDNode *L = cast<LoadSDNode>(N);
return L->getExtensionType() == ISD::ZEXTLOAD ||
@@ -232,26 +252,55 @@ def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return isLocalLoad(dyn_cast<LoadSDNode>(N));
}]>;
-def atomic_load_add_local : PatFrag<(ops node:$ptr, node:$value),
- (atomic_load_add node:$ptr, node:$value), [{
- return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
-def atomic_load_sub_local : PatFrag<(ops node:$ptr, node:$value),
- (atomic_load_sub node:$ptr, node:$value), [{
- return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+class local_binary_atomic_op<SDNode atomic_op> :
+ PatFrag<(ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
+
+def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
+def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
+def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>;
+def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>;
+def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>;
+def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>;
+def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>;
+def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>;
+def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>;
+def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>;
+def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
+
def mskor_global : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUstore_mskor node:$val, node:$ptr), [{
  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
+def atomic_cmp_swap_32_local :
+ PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ return AN->getMemoryVT() == MVT::i32 &&
+ AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def atomic_cmp_swap_64_local :
+ PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ return AN->getMemoryVT() == MVT::i64 &&
+ AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
+int FP32_NEG_ONE = 0xbf800000;
+int FP32_ONE = 0x3f800000;
}
def CONST : Constants;
@@ -273,7 +322,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
(outs rc:$dst),
(ins rc:$src0),
"CLAMP $dst, $src0",
- [(set f32:$dst, (int_AMDIL_clamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+ [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
>;
class FABS <RegisterClass rc> : AMDGPUShaderInst <
@@ -363,7 +412,7 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
// BFI_INT patterns
-multiclass BFIPatterns <Instruction BFI_INT> {
+multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> {
// Definition from ISA doc:
// (y & x) | (z & ~x)
@@ -379,6 +428,19 @@ multiclass BFIPatterns <Instruction BFI_INT> {
(BFI_INT $x, $y, $z)
>;
+ def : Pat <
+ (fcopysign f32:$src0, f32:$src1),
+ (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1)
+ >;
+
+ def : Pat <
+ (f64 (fcopysign f64:$src0, f64:$src1)),
+ (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0),
+ (BFI_INT (LoadImm32 0x7fffffff),
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1))), sub1)
+ >;
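+
+  // With BFI_INT computing (src0 & src1) | (~src0 & src2), the two patterns
+  // above implement
+  //   copysign(x, y) = (0x7fffffff & x) | (0x80000000 & y)
+  // For f64 only the high word (sub1) carries the sign, so the low word of
+  // $src0 is reused unchanged.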
}
// SHA-256 Ma patterns
@@ -457,6 +519,23 @@ multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> {
>;
}
+class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
+ (fdiv FP_ONE, vt:$src),
+ (RcpInst $src)
+>;
+
+multiclass RsqPat<Instruction RsqInst, ValueType vt> {
+ def : Pat <
+ (fdiv FP_ONE, (fsqrt vt:$src)),
+ (RsqInst $src)
+ >;
+
+ def : Pat <
+ (AMDGPUrcp (fsqrt vt:$src)),
+ (RsqInst $src)
+ >;
+}
+
include "R600Instructions.td"
include "R700Instructions.td"
include "EvergreenInstructions.td"
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp
index fab4a3b..58916a9 100644
--- a/lib/Target/R600/AMDILIntrinsicInfo.cpp
+++ b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp
@@ -1,4 +1,4 @@
-//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===//
+//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,7 +12,7 @@
//
//===-----------------------------------------------------------------------===//
-#include "AMDILIntrinsicInfo.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
@@ -24,14 +24,12 @@ using namespace llvm;
#include "AMDGPUGenIntrinsics.inc"
#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
-AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm)
- : TargetIntrinsicInfo() {
-}
+AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm)
+ : TargetIntrinsicInfo() {}
-std::string
-AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
- unsigned int numTys) const {
- static const char* const names[] = {
+std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+ unsigned numTys) const {
+ static const char *const names[] = {
#define GET_INTRINSIC_NAME_TABLE
#include "AMDGPUGenIntrinsics.inc"
#undef GET_INTRINSIC_NAME_TABLE
@@ -40,23 +38,23 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
if (IntrID < Intrinsic::num_intrinsics) {
return nullptr;
}
- assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics
- && "Invalid intrinsic ID");
+ assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
+ "Invalid intrinsic ID");
std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
return Result;
}
-unsigned int
-AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const {
+unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name,
+ unsigned Len) const {
if (!StringRef(Name, Len).startswith("llvm."))
return 0; // All intrinsics start with 'llvm.'
#define GET_FUNCTION_RECOGNIZER
#include "AMDGPUGenIntrinsics.inc"
#undef GET_FUNCTION_RECOGNIZER
- AMDGPUIntrinsic::ID IntrinsicID
- = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
+ AMDGPUIntrinsic::ID IntrinsicID =
+ (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name);
if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) {
@@ -65,17 +63,15 @@ AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const {
return 0;
}
-bool
-AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
- // Overload Table
+bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
+// Overload Table
#define GET_INTRINSIC_OVERLOAD_TABLE
#include "AMDGPUGenIntrinsics.inc"
#undef GET_INTRINSIC_OVERLOAD_TABLE
}
-Function*
-AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
- Type **Tys,
- unsigned numTys) const {
+Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+ Type **Tys,
+ unsigned numTys) const {
llvm_unreachable("Not implemented");
}
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDGPUIntrinsicInfo.h
index 924275a..5be68a2 100644
--- a/lib/Target/R600/AMDILIntrinsicInfo.h
+++ b/lib/Target/R600/AMDGPUIntrinsicInfo.h
@@ -1,4 +1,4 @@
-//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
+//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,8 +11,8 @@
/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
//
//===-----------------------------------------------------------------------===//
-#ifndef AMDIL_INTRINSICS_H
-#define AMDIL_INTRINSICS_H
+#ifndef AMDGPU_INTRINSICINFO_H
+#define AMDGPU_INTRINSICINFO_H
#include "llvm/IR/Intrinsics.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -34,16 +34,15 @@ enum ID {
class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
public:
AMDGPUIntrinsicInfo(TargetMachine *tm);
- std::string getName(unsigned int IntrId, Type **Tys = nullptr,
- unsigned int numTys = 0) const override;
- unsigned int lookupName(const char *Name, unsigned int Len) const override;
- bool isOverloaded(unsigned int IID) const override;
- Function *getDeclaration(Module *M, unsigned int ID,
+ std::string getName(unsigned IntrId, Type **Tys = nullptr,
+ unsigned numTys = 0) const override;
+ unsigned lookupName(const char *Name, unsigned Len) const override;
+ bool isOverloaded(unsigned IID) const override;
+ Function *getDeclaration(Module *M, unsigned ID,
Type **Tys = nullptr,
- unsigned int numTys = 0) const override;
+ unsigned numTys = 0) const override;
};
} // end namespace llvm
-#endif // AMDIL_INTRINSICS_H
-
+#endif // AMDGPU_INTRINSICINFO_H
diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
index 9ad5e72..d934676 100644
--- a/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/lib/Target/R600/AMDGPUIntrinsics.td
@@ -18,18 +18,26 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
-
+ def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+ def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+
+ // This is named backwards (instead of rsq_legacy) so we don't have
+ // to define it with the public builtins intrinsics. This is a
+ // workaround for how intrinsic names are parsed. If the name is
+ // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant
+  // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangles the name.
+ def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
def int_AMDGPU_kilp : Intrinsic<[], [], []>;
def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
@@ -53,12 +61,27 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_barrier_local : Intrinsic<[], [], []>;
+ def int_AMDGPU_barrier_global : Intrinsic<[], [], []>;
+}
+
+// Legacy names for compatibility.
+let TargetPrefix = "AMDIL", isTarget = 1 in {
+ def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
}
let TargetPrefix = "TGSI", isTarget = 1 in {
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index b759495..ac82e88 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUMCInstLower.h"
#include "AMDGPUAsmPrinter.h"
+#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "R600InstrInfo.h"
#include "SIInstrInfo.h"
diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h
index 2b7f1e3..58fe34d 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.h
+++ b/lib/Target/R600/AMDGPUMCInstLower.h
@@ -14,9 +14,9 @@
namespace llvm {
class AMDGPUSubtarget;
-class MCInst;
-class MCContext;
class MachineInstr;
+class MCContext;
+class MCInst;
class AMDGPUMCInstLower {
diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp
new file mode 100644
index 0000000..218750d
--- /dev/null
+++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp
@@ -0,0 +1,387 @@
+//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates allocas by either converting them into vectors or
+// by migrating them to local address space.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "amdgpu-promote-alloca"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUPromoteAlloca : public FunctionPass,
+ public InstVisitor<AMDGPUPromoteAlloca> {
+
+ static char ID;
+ Module *Mod;
+ const AMDGPUSubtarget &ST;
+ int LocalMemAvailable;
+
+public:
+ AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
+ LocalMemAvailable(0) { }
+ virtual bool doInitialization(Module &M);
+ virtual bool runOnFunction(Function &F);
+ virtual const char *getPassName() const {
+ return "AMDGPU Promote Alloca";
+ }
+ void visitAlloca(AllocaInst &I);
+};
+
+} // End anonymous namespace
+
+char AMDGPUPromoteAlloca::ID = 0;
+
+bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
+ Mod = &M;
+ return false;
+}
+
+bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
+  const FunctionType *FTy = F.getFunctionType();
+
+  LocalMemAvailable = ST.getLocalMemorySize();
+
+ // If the function has any arguments in the local address space, then it's
+ // possible these arguments require the entire local memory space, so
+ // we cannot use local memory in the pass.
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
+ const Type *ParamTy = FTy->getParamType(i);
+ if (ParamTy->isPointerTy() &&
+ ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ LocalMemAvailable = 0;
+ DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+ "local memory disabled.\n");
+ break;
+ }
+ }
+
+ if (LocalMemAvailable > 0) {
+ // Check how much local memory is being used by global objects
+ for (Module::global_iterator I = Mod->global_begin(),
+ E = Mod->global_end(); I != E; ++I) {
+ GlobalVariable *GV = I;
+ PointerType *GVTy = GV->getType();
+ if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+ for (Value::use_iterator U = GV->use_begin(),
+ UE = GV->use_end(); U != UE; ++U) {
+ Instruction *Use = dyn_cast<Instruction>(*U);
+ if (!Use)
+ continue;
+ if (Use->getParent()->getParent() == &F)
+ LocalMemAvailable -=
+ Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
+ }
+ }
+ }
+
+ LocalMemAvailable = std::max(0, LocalMemAvailable);
+  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
+
+ visit(F);
+
+ return false;
+}
+
+static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
+ return VectorType::get(ArrayTy->getArrayElementType(),
+ ArrayTy->getArrayNumElements());
+}
+
+static Value *calculateVectorIndex(Value *Ptr,
+                    const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
+  if (isa<AllocaInst>(Ptr))
+    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
+
+  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+
+  std::map<GetElementPtrInst *, Value *>::const_iterator I = GEPIdx.find(GEP);
+  return I == GEPIdx.end() ? nullptr : I->second;
+}
+
+static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
+  // FIXME: We only support simple cases.
+  if (GEP->getNumOperands() != 3)
+    return nullptr;
+
+  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
+  if (!I0 || !I0->isZero())
+    return nullptr;
+
+ return GEP->getOperand(2);
+}
+
+// Returns true if this instruction is one the vector-promotion code below
+// knows how to rewrite.
+//
+// TODO: Check isTriviallyVectorizable for calls and handle other
+// instructions.
+static bool canVectorizeInst(Instruction *Inst) {
+ switch (Inst->getOpcode()) {
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+ Type *AllocaTy = Alloca->getAllocatedType();
+
+ DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
+
+ // FIXME: There is no reason why we can't support larger arrays, we
+ // are just being conservative for now.
+ if (!AllocaTy->isArrayTy() ||
+ AllocaTy->getArrayElementType()->isVectorTy() ||
+ AllocaTy->getArrayNumElements() > 4) {
+
+ DEBUG(dbgs() << " Cannot convert type to vector");
+ return false;
+ }
+
+ std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
+ std::vector<Value*> WorkList;
+ for (User *AllocaUser : Alloca->users()) {
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
+ if (!GEP) {
+ if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
+ return false;
+
+ WorkList.push_back(AllocaUser);
+ continue;
+ }
+
+ Value *Index = GEPToVectorIndex(GEP);
+
+ // If we can't compute a vector index from this GEP, then we can't
+ // promote this alloca to vector.
+ if (!Index) {
+ DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
+ return false;
+ }
+
+ GEPVectorIdx[GEP] = Index;
+ for (User *GEPUser : AllocaUser->users()) {
+ if (!canVectorizeInst(cast<Instruction>(GEPUser)))
+ return false;
+
+ WorkList.push_back(GEPUser);
+ }
+ }
+
+ VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+
+ DEBUG(dbgs() << " Converting alloca to vector "
+ << *AllocaTy << " -> " << *VectorTy << '\n');
+
+ for (std::vector<Value*>::iterator I = WorkList.begin(),
+ E = WorkList.end(); I != E; ++I) {
+ Instruction *Inst = cast<Instruction>(*I);
+ IRBuilder<> Builder(Inst);
+ switch (Inst->getOpcode()) {
+ case Instruction::Load: {
+ Value *Ptr = Inst->getOperand(0);
+ Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+ Value *VecValue = Builder.CreateLoad(BitCast);
+ Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+ Inst->replaceAllUsesWith(ExtractElement);
+ Inst->eraseFromParent();
+ break;
+ }
+ case Instruction::Store: {
+ Value *Ptr = Inst->getOperand(1);
+ Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+ Value *VecValue = Builder.CreateLoad(BitCast);
+ Value *NewVecValue = Builder.CreateInsertElement(VecValue,
+ Inst->getOperand(0),
+ Index);
+ Builder.CreateStore(NewVecValue, BitCast);
+ Inst->eraseFromParent();
+ break;
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ break;
+
+ default:
+ Inst->dump();
+ llvm_unreachable("Inconsistency in instructions promotable to vector");
+ }
+ }
+ return true;
+}
+
+static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
+  for (User *U : Val->users()) {
+    if (std::find(WorkList.begin(), WorkList.end(), U) != WorkList.end())
+      continue;
+    if (isa<CallInst>(U)) {
+      WorkList.push_back(U);
+      continue;
+    }
+    if (!U->getType()->isPointerTy())
+      continue;
+    WorkList.push_back(U);
+    collectUsesWithPtrTypes(U, WorkList);
+  }
+}
+
+void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
+ IRBuilder<> Builder(&I);
+
+ // First try to replace the alloca with a vector
+ Type *AllocaTy = I.getAllocatedType();
+
+ DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+ if (tryPromoteAllocaToVector(&I))
+ return;
+
+ DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
+
+  // FIXME: This is the maximum work group size. We should try to get this
+  // value from the reqd_work_group_size function attribute if it is
+  // available.
+ unsigned WorkGroupSize = 256;
+ int AllocaSize = WorkGroupSize *
+ Mod->getDataLayout()->getTypeAllocSize(AllocaTy);
+
+ if (AllocaSize > LocalMemAvailable) {
+ DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
+ return;
+ }
+
+ DEBUG(dbgs() << "Promoting alloca to local memory\n");
+ LocalMemAvailable -= AllocaSize;
+
+  GlobalVariable *GV = new GlobalVariable(
+      *Mod, ArrayType::get(I.getAllocatedType(), WorkGroupSize), false,
+      GlobalValue::ExternalLinkage, 0, I.getName(), 0,
+      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+
+ FunctionType *FTy = FunctionType::get(
+ Type::getInt32Ty(Mod->getContext()), false);
+  AttributeSet AttrSet;
+  // AttributeSet::addAttribute returns the new set; it does not mutate.
+  AttrSet = AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
+
+ Value *ReadLocalSizeY = Mod->getOrInsertFunction(
+ "llvm.r600.read.local.size.y", FTy, AttrSet);
+ Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
+ "llvm.r600.read.local.size.z", FTy, AttrSet);
+ Value *ReadTIDIGX = Mod->getOrInsertFunction(
+ "llvm.r600.read.tidig.x", FTy, AttrSet);
+ Value *ReadTIDIGY = Mod->getOrInsertFunction(
+ "llvm.r600.read.tidig.y", FTy, AttrSet);
+ Value *ReadTIDIGZ = Mod->getOrInsertFunction(
+ "llvm.r600.read.tidig.z", FTy, AttrSet);
+
+ Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
+ Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
+ Value *TIdX = Builder.CreateCall(ReadTIDIGX);
+ Value *TIdY = Builder.CreateCall(ReadTIDIGY);
+ Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);
+
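+  // Flatten the 3D thread id so every work item addresses a private slot of
+  // the LDS array:
+  //   TID = tidig.x * (local_size.y * local_size.z)
+  //       + tidig.y * local_size.z + tidig.z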
+ Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
+ Tmp0 = Builder.CreateMul(Tmp0, TIdX);
+ Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
+ Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
+ TID = Builder.CreateAdd(TID, TIdZ);
+
+ std::vector<Value*> Indices;
+ Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
+ Indices.push_back(TID);
+
+ Value *Offset = Builder.CreateGEP(GV, Indices);
+ I.mutateType(Offset->getType());
+ I.replaceAllUsesWith(Offset);
+ I.eraseFromParent();
+
+ std::vector<Value*> WorkList;
+
+ collectUsesWithPtrTypes(Offset, WorkList);
+
+ for (std::vector<Value*>::iterator i = WorkList.begin(),
+ e = WorkList.end(); i != e; ++i) {
+ Value *V = *i;
+ CallInst *Call = dyn_cast<CallInst>(V);
+ if (!Call) {
+ Type *EltTy = V->getType()->getPointerElementType();
+ PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ V->mutateType(NewTy);
+ continue;
+ }
+
+ IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
+ if (!Intr) {
+ std::vector<Type*> ArgTypes;
+ for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
+ ArgIdx != ArgEnd; ++ArgIdx) {
+ ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
+ }
+ Function *F = Call->getCalledFunction();
+ FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
+ F->isVarArg());
+      Constant *C = Mod->getOrInsertFunction(
+          StringRef(F->getName().str() + ".local"), NewType,
+          F->getAttributes());
+ Function *NewF = cast<Function>(C);
+ Call->setCalledFunction(NewF);
+ continue;
+ }
+
+ Builder.SetInsertPoint(Intr);
+ switch (Intr->getIntrinsicID()) {
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ // These intrinsics are for address space 0 only
+ Intr->eraseFromParent();
+ continue;
+ case Intrinsic::memcpy: {
+ MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
+ Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
+ MemCpy->getLength(), MemCpy->getAlignment(),
+ MemCpy->isVolatile());
+ Intr->eraseFromParent();
+ continue;
+ }
+ case Intrinsic::memset: {
+ MemSetInst *MemSet = cast<MemSetInst>(Intr);
+ Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
+ MemSet->getLength(), MemSet->getAlignment(),
+ MemSet->isVolatile());
+ Intr->eraseFromParent();
+ continue;
+ }
+ default:
+ Intr->dump();
+ llvm_unreachable("Don't know how to promote alloca intrinsic use.");
+ }
+ }
+}
+
+FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
+ return new AMDGPUPromoteAlloca(ST);
+}
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 19927fa..3433280 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -17,9 +17,9 @@
using namespace llvm;
-AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm)
+AMDGPURegisterInfo::AMDGPURegisterInfo(const AMDGPUSubtarget &st)
: AMDGPUGenRegisterInfo(0),
- TM(tm)
+ ST(st)
{ }
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index a7cba0d..4731595 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -25,27 +25,19 @@
namespace llvm {
-class AMDGPUTargetMachine;
+class AMDGPUSubtarget;
class TargetInstrInfo;
struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
- TargetMachine &TM;
static const MCPhysReg CalleeSavedReg;
+ const AMDGPUSubtarget &ST;
- AMDGPURegisterInfo(TargetMachine &tm);
+ AMDGPURegisterInfo(const AMDGPUSubtarget &st);
BitVector getReservedRegs(const MachineFunction &MF) const override {
assert(!"Unimplemented"); return BitVector();
}
- /// \param RC is an AMDIL reg class.
- ///
- /// \returns The ISA reg class that is equivalent to \p RC.
- virtual const TargetRegisterClass * getISARegClass(
- const TargetRegisterClass * RC) const {
- assert(!"Unimplemented"); return nullptr;
- }
-
virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const {
assert(!"Unimplemented"); return nullptr;
}
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index f3b9932..b83c290 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -13,6 +13,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "SIInstrInfo.h"
using namespace llvm;
@@ -23,90 +25,42 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
-AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
- AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) {
- InstrItins = getInstrItineraryForCPU(CPU);
-
- // Default card
- StringRef GPU = CPU;
- Is64bit = false;
- HasVertexCache = false;
- TexVTXClauseSize = 0;
- Gen = AMDGPUSubtarget::R600;
- FP64 = false;
- CaymanISA = false;
- EnableIRStructurizer = true;
- EnableIfCvt = true;
- WavefrontSize = 0;
- CFALUBug = false;
+AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) :
+ AMDGPUGenSubtargetInfo(TT, GPU, FS),
+ DevName(GPU),
+ Is64bit(false),
+ DumpCode(false),
+ R600ALUInst(false),
+ HasVertexCache(false),
+ TexVTXClauseSize(0),
+ Gen(AMDGPUSubtarget::R600),
+ FP64(false),
+ CaymanISA(false),
+ EnableIRStructurizer(true),
+ EnableIfCvt(true),
+ WavefrontSize(0),
+ CFALUBug(false),
+ LocalMemorySize(0),
+ InstrItins(getInstrItineraryForCPU(GPU)) {
ParseSubtargetFeatures(GPU, FS);
- DevName = GPU;
-}
-bool
-AMDGPUSubtarget::is64bit() const {
- return Is64bit;
-}
-bool
-AMDGPUSubtarget::hasVertexCache() const {
- return HasVertexCache;
-}
-short
-AMDGPUSubtarget::getTexVTXClauseSize() const {
- return TexVTXClauseSize;
-}
-enum AMDGPUSubtarget::Generation
-AMDGPUSubtarget::getGeneration() const {
- return Gen;
-}
-bool
-AMDGPUSubtarget::hasHWFP64() const {
- return FP64;
-}
-bool
-AMDGPUSubtarget::hasCaymanISA() const {
- return CaymanISA;
-}
-bool
-AMDGPUSubtarget::IsIRStructurizerEnabled() const {
- return EnableIRStructurizer;
-}
-bool
-AMDGPUSubtarget::isIfCvtEnabled() const {
- return EnableIfCvt;
-}
-unsigned
-AMDGPUSubtarget::getWavefrontSize() const {
- return WavefrontSize;
+ if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ InstrInfo.reset(new R600InstrInfo(*this));
+ } else {
+ InstrInfo.reset(new SIInstrInfo(*this));
+ }
}
-unsigned
-AMDGPUSubtarget::getStackEntrySize() const {
+
+unsigned AMDGPUSubtarget::getStackEntrySize() const {
assert(getGeneration() <= NORTHERN_ISLANDS);
switch(getWavefrontSize()) {
case 16:
return 8;
case 32:
- if (hasCaymanISA())
- return 4;
- else
- return 8;
+ return hasCaymanISA() ? 4 : 8;
case 64:
return 4;
default:
llvm_unreachable("Illegal wavefront size.");
}
}
-bool
-AMDGPUSubtarget::hasCFAluBug() const {
- assert(getGeneration() <= NORTHERN_ISLANDS);
- return CFALUBug;
-}
-bool
-AMDGPUSubtarget::isTargetELF() const {
- return false;
-}
-
-std::string
-AMDGPUSubtarget::getDeviceName() const {
- return DevName;
-}
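
Two things change in the subtarget here: the trivial accessors collapse into the header (see the AMDGPUSubtarget.h hunk below), and the subtarget now constructs and owns the generation-appropriate TargetInstrInfo instead of the target machine doing it. A minimal sketch of that ownership pattern, with illustrative stand-in types for R600InstrInfo/SIInstrInfo:

    #include <memory>

    struct InstrInfoBase { virtual ~InstrInfoBase() = default; };
    struct R600Style : InstrInfoBase {};   // VLIW ISA, R600..Northern Islands
    struct SIStyle   : InstrInfoBase {};   // GCN ISA, Southern Islands onward

    enum Generation { R600, NORTHERN_ISLANDS, SOUTHERN_ISLANDS };

    class Subtarget {
      std::unique_ptr<InstrInfoBase> InstrInfo;
    public:
      explicit Subtarget(Generation Gen) {
        // Pick the concrete instruction info from the GPU generation,
        // as the constructor in the hunk above does.
        if (Gen <= NORTHERN_ISLANDS)
          InstrInfo.reset(new R600Style());
        else
          InstrInfo.reset(new SIStyle());
      }
      const InstrInfoBase *getInstrInfo() const { return InstrInfo.get(); }
    };

The target machine then forwards its own getInstrInfo() to the subtarget, which is exactly what the AMDGPUTargetMachine.h hunk further down does.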
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 1b041d6..0c388b3 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -15,6 +15,7 @@
#ifndef AMDGPUSUBTARGET_H
#define AMDGPUSUBTARGET_H
#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -27,6 +28,9 @@
namespace llvm {
class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
+
+ std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
+
public:
enum Generation {
R600 = 0,
@@ -40,42 +44,78 @@ public:
private:
std::string DevName;
bool Is64bit;
- bool Is32on64bit;
bool DumpCode;
bool R600ALUInst;
bool HasVertexCache;
short TexVTXClauseSize;
- enum Generation Gen;
+ Generation Gen;
bool FP64;
bool CaymanISA;
bool EnableIRStructurizer;
bool EnableIfCvt;
unsigned WavefrontSize;
bool CFALUBug;
+ int LocalMemorySize;
InstrItineraryData InstrItins;
public:
AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
- const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+ const AMDGPUInstrInfo *getInstrInfo() const {
+ return InstrInfo.get();
+ }
+
+ const InstrItineraryData &getInstrItineraryData() const {
+ return InstrItins;
+ }
+
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- bool is64bit() const;
- bool hasVertexCache() const;
- short getTexVTXClauseSize() const;
- enum Generation getGeneration() const;
- bool hasHWFP64() const;
- bool hasCaymanISA() const;
+ bool is64bit() const {
+ return Is64bit;
+ }
+
+ bool hasVertexCache() const {
+ return HasVertexCache;
+ }
+
+ short getTexVTXClauseSize() const {
+ return TexVTXClauseSize;
+ }
+
+ Generation getGeneration() const {
+ return Gen;
+ }
+
+ bool hasHWFP64() const {
+ return FP64;
+ }
+
+ bool hasCaymanISA() const {
+ return CaymanISA;
+ }
bool hasBFE() const {
return (getGeneration() >= EVERGREEN);
}
+ bool hasBFI() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
bool hasBFM() const {
return hasBFE();
}
+ bool hasBCNT(unsigned Size) const {
+ if (Size == 32)
+ return (getGeneration() >= EVERGREEN);
+
+ assert(Size == 64);
+ return (getGeneration() >= SOUTHERN_ISLANDS);
+ }
+
bool hasMulU24() const {
return (getGeneration() >= EVERGREEN);
}
@@ -85,22 +125,48 @@ public:
hasCaymanISA());
}
- bool IsIRStructurizerEnabled() const;
- bool isIfCvtEnabled() const;
- unsigned getWavefrontSize() const;
+ bool IsIRStructurizerEnabled() const {
+ return EnableIRStructurizer;
+ }
+
+ bool isIfCvtEnabled() const {
+ return EnableIfCvt;
+ }
+
+ unsigned getWavefrontSize() const {
+ return WavefrontSize;
+ }
+
unsigned getStackEntrySize() const;
- bool hasCFAluBug() const;
+
+ bool hasCFAluBug() const {
+ assert(getGeneration() <= NORTHERN_ISLANDS);
+ return CFALUBug;
+ }
+
+ int getLocalMemorySize() const {
+ return LocalMemorySize;
+ }
bool enableMachineScheduler() const override {
return getGeneration() <= NORTHERN_ISLANDS;
}
// Helper functions to simplify if statements
- bool isTargetELF() const;
- std::string getDeviceName() const;
- bool dumpCode() const { return DumpCode; }
- bool r600ALUEncoding() const { return R600ALUInst; }
+ bool isTargetELF() const {
+ return false;
+ }
+ StringRef getDeviceName() const {
+ return DevName;
+ }
+
+ bool dumpCode() const {
+ return DumpCode;
+ }
+ bool r600ALUEncoding() const {
+ return R600ALUInst;
+ }
};
} // End namespace llvm
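
hasBCNT(32) holds from Evergreen onward, where ctpop maps onto a native BCNT_INT (the EvergreenInstructions.td hunk below adds exactly that pattern); the 64-bit form needs Southern Islands. On anything older, population count has to be expanded into plain ALU ops. A self-contained sketch of the classic expansion such a target falls back on (the textbook algorithm, shown for illustration; not code from the backend):

    #include <cstdint>

    // Parallel popcount: sum bits in 2-, 4-, then 8-bit groups,
    // then fold the four byte sums together with one multiply.
    uint32_t ctpop32(uint32_t X) {
      X = X - ((X >> 1) & 0x55555555u);
      X = (X & 0x33333333u) + ((X >> 2) & 0x33333333u);
      X = (X + (X >> 4)) & 0x0F0F0F0Fu;
      return (X * 0x01010101u) >> 24;   // byte sums accumulate in the top byte
    }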
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 174fdca..8aab944 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -80,10 +80,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
InstrItins(&Subtarget.getInstrItineraryData()) {
// TLInfo uses InstrInfo so it must be initialized after.
if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- InstrInfo.reset(new R600InstrInfo(*this));
TLInfo.reset(new R600TargetLowering(*this));
} else {
- InstrInfo.reset(new SIInstrInfo(*this));
TLInfo.reset(new SITargetLowering(*this));
}
setRequiresStructuredCFG(true);
@@ -111,6 +109,7 @@ public:
return nullptr;
}
+ virtual void addCodeGenPrepare();
bool addPreISel() override;
bool addInstSelector() override;
bool addPreRegAlloc() override;
@@ -136,6 +135,13 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
PM.add(createAMDGPUTargetTransformInfoPass(this));
}
+void AMDGPUPassConfig::addCodeGenPrepare() {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+ addPass(createAMDGPUPromoteAlloca(ST));
+ addPass(createSROAPass());
+ TargetPassConfig::addCodeGenPrepare();
+}
+
bool
AMDGPUPassConfig::addPreISel() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
@@ -159,7 +165,6 @@ bool AMDGPUPassConfig::addInstSelector() {
}
bool AMDGPUPassConfig::addPreRegAlloc() {
- addPass(createAMDGPUConvertToISAPass(*TM));
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
@@ -169,6 +174,8 @@ bool AMDGPUPassConfig::addPreRegAlloc() {
// SIFixSGPRCopies can generate a lot of duplicate instructions,
// so we need to run MachineCSE afterwards.
addPass(&MachineCSEID);
+ initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
+ insertPass(&RegisterCoalescerID, &SIFixSGPRLiveRangesID);
}
return false;
}
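
The new addCodeGenPrepare() override is what wires the promote-alloca pass into the IR pipeline, and the ordering matters: promotion runs first so eligible allocas move into LDS, SROA then scalarizes whatever stayed behind, and only afterwards does the generic codegen preparation run. The same code as the hunk above, with comments added (the rationale for the ordering is my reading, not stated in the patch):

    void AMDGPUPassConfig::addCodeGenPrepare() {
      const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
      addPass(createAMDGPUPromoteAlloca(ST)); // private allocas -> LDS where possible
      addPass(createSROAPass());              // scalarize the allocas left over
      TargetPassConfig::addCodeGenPrepare();  // then target-independent preparation
    }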
diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h
index 1287e13..3bb15be 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.h
+++ b/lib/Target/R600/AMDGPUTargetMachine.h
@@ -17,8 +17,8 @@
#include "AMDGPUFrameLowering.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
-#include "AMDILIntrinsicInfo.h"
#include "R600ISelLowering.h"
#include "llvm/IR/DataLayout.h"
@@ -30,7 +30,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
const DataLayout Layout;
AMDGPUFrameLowering FrameLowering;
AMDGPUIntrinsicInfo IntrinsicInfo;
- std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
std::unique_ptr<AMDGPUTargetLowering> TLInfo;
const InstrItineraryData *InstrItins;
@@ -46,13 +45,13 @@ public:
return &IntrinsicInfo;
}
const AMDGPUInstrInfo *getInstrInfo() const override {
- return InstrInfo.get();
+ return getSubtargetImpl()->getInstrInfo();
}
const AMDGPUSubtarget *getSubtargetImpl() const override {
return &Subtarget;
}
const AMDGPURegisterInfo *getRegisterInfo() const override {
- return &InstrInfo->getRegisterInfo();
+ return &getInstrInfo()->getRegisterInfo();
}
AMDGPUTargetLowering *getTargetLowering() const override {
return TLInfo.get();
diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td
deleted file mode 100644
index 5dcd478..0000000
--- a/lib/Target/R600/AMDILBase.td
+++ /dev/null
@@ -1,25 +0,0 @@
-//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// Target-independent interfaces which we are implementing
-//===----------------------------------------------------------------------===//
-
-include "llvm/Target/Target.td"
-
-// Dummy Instruction itineraries for pseudo instructions
-def ALU_NULL : FuncUnit;
-def NullALU : InstrItinClass;
-
-//===----------------------------------------------------------------------===//
-// Register File, Calling Conv, Instruction Descriptions
-//===----------------------------------------------------------------------===//
-
-
-include "AMDILRegisterInfo.td"
-include "AMDILInstrInfo.td"
-
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
deleted file mode 100644
index 7cea803..0000000
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ /dev/null
@@ -1,560 +0,0 @@
-//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief TargetLowering functions borrowed from AMDIL.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUISelLowering.h"
-#include "AMDGPURegisterInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDILIntrinsicInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetOptions.h"
-
-using namespace llvm;
-//===----------------------------------------------------------------------===//
-// TargetLowering Implementation Help Functions End
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// TargetLowering Class Implementation Begins
-//===----------------------------------------------------------------------===//
-void AMDGPUTargetLowering::InitAMDILLowering() {
- static const MVT::SimpleValueType types[] = {
- MVT::i8,
- MVT::i16,
- MVT::i32,
- MVT::f32,
- MVT::f64,
- MVT::i64,
- MVT::v2i8,
- MVT::v4i8,
- MVT::v2i16,
- MVT::v4i16,
- MVT::v4f32,
- MVT::v4i32,
- MVT::v2f32,
- MVT::v2i32,
- MVT::v2f64,
- MVT::v2i64
- };
-
- static const MVT::SimpleValueType IntTypes[] = {
- MVT::i8,
- MVT::i16,
- MVT::i32,
- MVT::i64
- };
-
- static const MVT::SimpleValueType FloatTypes[] = {
- MVT::f32,
- MVT::f64
- };
-
- static const MVT::SimpleValueType VectorTypes[] = {
- MVT::v2i8,
- MVT::v4i8,
- MVT::v2i16,
- MVT::v4i16,
- MVT::v4f32,
- MVT::v4i32,
- MVT::v2f32,
- MVT::v2i32,
- MVT::v2f64,
- MVT::v2i64
- };
-
- const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
- // These are the current register classes that are
- // supported
-
- for (MVT VT : types) {
- setOperationAction(ISD::SUBE, VT, Expand);
- setOperationAction(ISD::SUBC, VT, Expand);
- setOperationAction(ISD::ADDE, VT, Expand);
- setOperationAction(ISD::ADDC, VT, Expand);
- setOperationAction(ISD::BRCOND, VT, Custom);
- setOperationAction(ISD::BR_JT, VT, Expand);
- setOperationAction(ISD::BRIND, VT, Expand);
- // TODO: Implement custom UREM/SREM routines
- setOperationAction(ISD::SREM, VT, Expand);
- setOperationAction(ISD::SMUL_LOHI, VT, Expand);
- setOperationAction(ISD::UMUL_LOHI, VT, Expand);
- if (VT != MVT::i64 && VT != MVT::v2i64) {
- setOperationAction(ISD::SDIV, VT, Custom);
- }
- }
- for (MVT VT : FloatTypes) {
- // IL does not have these operations for floating point types
- setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
- setOperationAction(ISD::SETOLT, VT, Expand);
- setOperationAction(ISD::SETOGE, VT, Expand);
- setOperationAction(ISD::SETOGT, VT, Expand);
- setOperationAction(ISD::SETOLE, VT, Expand);
- setOperationAction(ISD::SETULT, VT, Expand);
- setOperationAction(ISD::SETUGE, VT, Expand);
- setOperationAction(ISD::SETUGT, VT, Expand);
- setOperationAction(ISD::SETULE, VT, Expand);
- }
-
- for (MVT VT : IntTypes) {
- // GPU also does not have divrem function for signed or unsigned
- setOperationAction(ISD::SDIVREM, VT, Expand);
-
- // GPU does not have [S|U]MUL_LOHI functions as a single instruction
- setOperationAction(ISD::SMUL_LOHI, VT, Expand);
- setOperationAction(ISD::UMUL_LOHI, VT, Expand);
-
- setOperationAction(ISD::BSWAP, VT, Expand);
-
- // GPU doesn't have any counting operators
- setOperationAction(ISD::CTPOP, VT, Expand);
- setOperationAction(ISD::CTTZ, VT, Expand);
- setOperationAction(ISD::CTLZ, VT, Expand);
- }
-
- for (MVT VT : VectorTypes) {
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
- setOperationAction(ISD::SDIVREM, VT, Expand);
- setOperationAction(ISD::SMUL_LOHI, VT, Expand);
- // setOperationAction(ISD::VSETCC, VT, Expand);
- setOperationAction(ISD::SELECT_CC, VT, Expand);
-
- }
- setOperationAction(ISD::MULHU, MVT::i64, Expand);
- setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
- setOperationAction(ISD::MULHS, MVT::i64, Expand);
- setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
- setOperationAction(ISD::ADD, MVT::v2i64, Expand);
- setOperationAction(ISD::SREM, MVT::v2i64, Expand);
- setOperationAction(ISD::Constant , MVT::i64 , Legal);
- setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
- if (STM.hasHWFP64()) {
- // we support loading/storing v2f64 but not operations on the type
- setOperationAction(ISD::FADD, MVT::v2f64, Expand);
- setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
- setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
- setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
- setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
- setOperationAction(ISD::ConstantFP , MVT::f64 , Legal);
- // We want to expand vector conversions into their scalar
- // counterparts.
- setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
- setOperationAction(ISD::FABS, MVT::f64, Expand);
- setOperationAction(ISD::FABS, MVT::v2f64, Expand);
- }
- // TODO: Fix the UDIV24 algorithm so it works for these
- // types correctly. This needs vector comparisons
- // for this to work correctly.
- setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
- setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
- setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
- setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
- setOperationAction(ISD::SUBC, MVT::Other, Expand);
- setOperationAction(ISD::ADDE, MVT::Other, Expand);
- setOperationAction(ISD::ADDC, MVT::Other, Expand);
- setOperationAction(ISD::BRCOND, MVT::Other, Custom);
- setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::BRIND, MVT::Other, Expand);
-
-
- // Use the default implementation.
- setOperationAction(ISD::ConstantFP , MVT::f32 , Legal);
- setOperationAction(ISD::Constant , MVT::i32 , Legal);
-
- setSchedulingPreference(Sched::RegPressure);
- setPow2DivIsCheap(false);
- setSelectIsExpensive(true);
- setJumpIsExpensive(true);
-
- MaxStoresPerMemcpy = 4096;
- MaxStoresPerMemmove = 4096;
- MaxStoresPerMemset = 4096;
-
-}
-
-bool
-AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
- const CallInst &I, unsigned Intrinsic) const {
- return false;
-}
-
-// The backend supports 32 and 64 bit floating point immediates
-bool
-AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
- if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
- || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
- return true;
- } else {
- return false;
- }
-}
-
-bool
-AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
- if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
- || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
- return false;
- } else {
- return true;
- }
-}
-
-
-// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
-// be zero. Op is expected to be a target specific node. Used by DAG
-// combiner.
-
-//===----------------------------------------------------------------------===//
-// Other Lowering Hooks
-//===----------------------------------------------------------------------===//
-
-SDValue
-AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
- EVT OVT = Op.getValueType();
- SDValue DST;
- if (OVT.getScalarType() == MVT::i64) {
- DST = LowerSDIV64(Op, DAG);
- } else if (OVT.getScalarType() == MVT::i32) {
- DST = LowerSDIV32(Op, DAG);
- } else if (OVT.getScalarType() == MVT::i16
- || OVT.getScalarType() == MVT::i8) {
- DST = LowerSDIV24(Op, DAG);
- } else {
- DST = SDValue(Op.getNode(), 0);
- }
- return DST;
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
- EVT OVT = Op.getValueType();
- SDValue DST;
- if (OVT.getScalarType() == MVT::i64) {
- DST = LowerSREM64(Op, DAG);
- } else if (OVT.getScalarType() == MVT::i32) {
- DST = LowerSREM32(Op, DAG);
- } else if (OVT.getScalarType() == MVT::i16) {
- DST = LowerSREM16(Op, DAG);
- } else if (OVT.getScalarType() == MVT::i8) {
- DST = LowerSREM8(Op, DAG);
- } else {
- DST = SDValue(Op.getNode(), 0);
- }
- return DST;
-}
-
-EVT
-AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
- int iSize = (size * numEle);
- int vEle = (iSize >> ((size == 64) ? 6 : 5));
- if (!vEle) {
- vEle = 1;
- }
- if (size == 64) {
- if (vEle == 1) {
- return EVT(MVT::i64);
- } else {
- return EVT(MVT::getVectorVT(MVT::i64, vEle));
- }
- } else {
- if (vEle == 1) {
- return EVT(MVT::i32);
- } else {
- return EVT(MVT::getVectorVT(MVT::i32, vEle));
- }
- }
-}
-
-SDValue
-AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
- SDValue Chain = Op.getOperand(0);
- SDValue Cond = Op.getOperand(1);
- SDValue Jump = Op.getOperand(2);
- SDValue Result;
- Result = DAG.getNode(
- AMDGPUISD::BRANCH_COND,
- SDLoc(Op),
- Op.getValueType(),
- Chain, Jump, Cond);
- return Result;
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- MVT INTTY;
- MVT FLTTY;
- if (!OVT.isVector()) {
- INTTY = MVT::i32;
- FLTTY = MVT::f32;
- } else if (OVT.getVectorNumElements() == 2) {
- INTTY = MVT::v2i32;
- FLTTY = MVT::v2f32;
- } else if (OVT.getVectorNumElements() == 4) {
- INTTY = MVT::v4i32;
- FLTTY = MVT::v4f32;
- }
- unsigned bitsize = OVT.getScalarType().getSizeInBits();
- // char|short jq = ia ^ ib;
- SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
-
- // jq = jq >> (bitsize - 2)
- jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
-
- // jq = jq | 0x1
- jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
-
- // jq = (int)jq
- jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
-
- // int ia = (int)LHS;
- SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
-
- // int ib, (int)RHS;
- SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
-
- // float fa = (float)ia;
- SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
-
- // float fb = (float)ib;
- SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
-
- // float fq = native_divide(fa, fb);
- SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb);
-
- // fq = trunc(fq);
- fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
-
- // float fqneg = -fq;
- SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
-
- // float fr = mad(fqneg, fb, fa);
- SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
- DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa);
-
- // int iq = (int)fq;
- SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
-
- // fr = fabs(fr);
- fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
-
- // fb = fabs(fb);
- fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
-
- // int cv = fr >= fb;
- SDValue cv;
- if (INTTY == MVT::i32) {
- cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
- } else {
- cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
- }
- // jq = (cv ? jq : 0);
- jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
- DAG.getConstant(0, OVT));
- // dst = iq + jq;
- iq = DAG.getSExtOrTrunc(iq, DL, OVT);
- iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
- return iq;
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- // The LowerSDIV32 function generates equivalent to the following IL.
- // mov r0, LHS
- // mov r1, RHS
- // ilt r10, r0, 0
- // ilt r11, r1, 0
- // iadd r0, r0, r10
- // iadd r1, r1, r11
- // ixor r0, r0, r10
- // ixor r1, r1, r11
- // udiv r0, r0, r1
- // ixor r10, r10, r11
- // iadd r0, r0, r10
- // ixor DST, r0, r10
-
- // mov r0, LHS
- SDValue r0 = LHS;
-
- // mov r1, RHS
- SDValue r1 = RHS;
-
- // ilt r10, r0, 0
- SDValue r10 = DAG.getSelectCC(DL,
- r0, DAG.getConstant(0, OVT),
- DAG.getConstant(-1, MVT::i32),
- DAG.getConstant(0, MVT::i32),
- ISD::SETLT);
-
- // ilt r11, r1, 0
- SDValue r11 = DAG.getSelectCC(DL,
- r1, DAG.getConstant(0, OVT),
- DAG.getConstant(-1, MVT::i32),
- DAG.getConstant(0, MVT::i32),
- ISD::SETLT);
-
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
-
- // iadd r1, r1, r11
- r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
-
- // ixor r0, r0, r10
- r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
-
- // ixor r1, r1, r11
- r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
-
- // udiv r0, r0, r1
- r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
-
- // ixor r10, r10, r11
- r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
-
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
-
- // ixor DST, r0, r10
- SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
- return DST;
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
- return SDValue(Op.getNode(), 0);
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- MVT INTTY = MVT::i32;
- if (OVT == MVT::v2i8) {
- INTTY = MVT::v2i32;
- } else if (OVT == MVT::v4i8) {
- INTTY = MVT::v4i32;
- }
- SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
- SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
- LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
- LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
- return LHS;
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- MVT INTTY = MVT::i32;
- if (OVT == MVT::v2i16) {
- INTTY = MVT::v2i32;
- } else if (OVT == MVT::v4i16) {
- INTTY = MVT::v4i32;
- }
- SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
- SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
- LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
- LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
- return LHS;
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- // The LowerSREM32 function generates equivalent to the following IL.
- // mov r0, LHS
- // mov r1, RHS
- // ilt r10, r0, 0
- // ilt r11, r1, 0
- // iadd r0, r0, r10
- // iadd r1, r1, r11
- // ixor r0, r0, r10
- // ixor r1, r1, r11
- // udiv r20, r0, r1
- // umul r20, r20, r1
- // sub r0, r0, r20
- // iadd r0, r0, r10
- // ixor DST, r0, r10
-
- // mov r0, LHS
- SDValue r0 = LHS;
-
- // mov r1, RHS
- SDValue r1 = RHS;
-
- // ilt r10, r0, 0
- SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
-
- // ilt r11, r1, 0
- SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
-
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
-
- // iadd r1, r1, r11
- r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
-
- // ixor r0, r0, r10
- r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
-
- // ixor r1, r1, r11
- r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
-
- // udiv r20, r0, r1
- SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1);
-
- // umul r20, r20, r1
- r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
-
- // sub r0, r0, r20
- r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
-
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
-
- // ixor DST, r0, r10
- SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
- return DST;
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
- return SDValue(Op.getNode(), 0);
-}
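
The deleted AMDIL lowering is mostly superseded elsewhere, but the division trick it carried is worth spelling out. LowerSDIV32 implements signed division on top of unsigned division by folding the signs out and back in: with m = x >> 31 (all-ones when x is negative), (x + m) ^ m computes |x|, and the same add-then-xor form negates the quotient when the operand signs differ. In scalar C++, transcribing the IL sequence from the comment block above (new names, same steps):

    #include <cstdint>

    int32_t sdiv32(int32_t A, int32_t B) {
      uint32_t MA = (uint32_t)(A >> 31);       // all-ones iff A < 0  (ilt r10)
      uint32_t MB = (uint32_t)(B >> 31);       // all-ones iff B < 0  (ilt r11)
      uint32_t UA = ((uint32_t)A + MA) ^ MA;   // |A|  (iadd + ixor)
      uint32_t UB = ((uint32_t)B + MB) ^ MB;   // |B|
      uint32_t UQ = UA / UB;                   // udiv
      uint32_t S  = MA ^ MB;                   // quotient sign mask
      return (int32_t)((UQ + S) ^ S);          // negate iff the signs differed
    }

LowerSDIV24 took a different route for i8/i16: divide in f32, where sub-24-bit magnitudes are exact, truncate, and correct the quotient by ±1 when the remainder comparison (|fr| >= |fb|) shows the truncation landed one off.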
diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td
deleted file mode 100644
index 0f0c88d..0000000
--- a/lib/Target/R600/AMDILInstrInfo.td
+++ /dev/null
@@ -1,150 +0,0 @@
-//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-// This file describes the AMDIL instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-//===--------------------------------------------------------------------===//
-// Custom Operands
-//===--------------------------------------------------------------------===//
-def brtarget : Operand<OtherVT>;
-
-//===--------------------------------------------------------------------===//
-// Custom Selection DAG Type Profiles
-//===--------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Generic Profile Types
-//===----------------------------------------------------------------------===//
-
-def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [
- SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
- ]>;
-def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [
- SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3>
- ]>;
-def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [
- SDTCisEltOfVec<1, 0>
- ]>;
-
-//===----------------------------------------------------------------------===//
-// Flow Control Profile Types
-//===----------------------------------------------------------------------===//
-// Branch instruction where second and third are basic blocks
-def SDTIL_BRCond : SDTypeProfile<0, 2, [
- SDTCisVT<0, OtherVT>
- ]>;
-
-//===--------------------------------------------------------------------===//
-// Custom Selection DAG Nodes
-//===--------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Flow Control DAG Nodes
-//===----------------------------------------------------------------------===//
-def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
-
-//===----------------------------------------------------------------------===//
-// Call/Return DAG Nodes
-//===----------------------------------------------------------------------===//
-def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue]>;
-
-//===--------------------------------------------------------------------===//
-// Instructions
-//===--------------------------------------------------------------------===//
-// Floating point math functions
-def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>;
-
-//===----------------------------------------------------------------------===//
-// Integer functions
-//===----------------------------------------------------------------------===//
-def IL_umul : SDNode<"AMDGPUISD::UMUL" , SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]>;
-
-//===--------------------------------------------------------------------===//
-// Custom Pattern DAG Nodes
-//===--------------------------------------------------------------------===//
-def global_store : PatFrag<(ops node:$val, node:$ptr),
- (store node:$val, node:$ptr), [{
- return isGlobalStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
-//===----------------------------------------------------------------------===//
-// Load pattern fragments
-//===----------------------------------------------------------------------===//
-// Global address space loads
-def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-// Constant address space loads
-def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
-
-//===----------------------------------------------------------------------===//
-// Complex addressing mode patterns
-//===----------------------------------------------------------------------===//
-def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>;
-def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>;
-def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>;
-def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>;
-
-//===----------------------------------------------------------------------===//
-// Instruction format classes
-//===----------------------------------------------------------------------===//
-class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
-: Instruction {
-
- let Namespace = "AMDGPU";
- dag OutOperandList = outs;
- dag InOperandList = ins;
- let Pattern = pattern;
- let AsmString = !strconcat(asmstr, "\n");
- let isPseudo = 1;
- let Itinerary = NullALU;
- bit hasIEEEFlag = 0;
- bit hasZeroOpFlag = 0;
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
-}
-
-//===--------------------------------------------------------------------===//
-// Multiclass Instruction formats
-//===--------------------------------------------------------------------===//
-// Multiclass that handles branch instructions
-multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
- def _i32 : ILFormat<(outs),
- (ins brtarget:$target, rci:$src0),
- "; i32 Pseudo branch instruction",
- [(Op bb:$target, (i32 rci:$src0))]>;
- def _f32 : ILFormat<(outs),
- (ins brtarget:$target, rcf:$src0),
- "; f32 Pseudo branch instruction",
- [(Op bb:$target, (f32 rcf:$src0))]>;
-}
-
-// Only scalar types should generate flow control
-multiclass BranchInstr<string name> {
- def _i32 : ILFormat<(outs), (ins GPRI32:$src),
- !strconcat(name, " $src"), []>;
- def _f32 : ILFormat<(outs), (ins GPRF32:$src),
- !strconcat(name, " $src"), []>;
-}
-// Only scalar types should generate flow control
-multiclass BranchInstr2<string name> {
- def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1),
- !strconcat(name, " $src0, $src1"), []>;
- def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1),
- !strconcat(name, " $src0, $src1"), []>;
-}
-
-//===--------------------------------------------------------------------===//
-// Intrinsics support
-//===--------------------------------------------------------------------===//
-include "AMDILIntrinsics.td"
diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td
deleted file mode 100644
index 4a3e02e..0000000
--- a/lib/Target/R600/AMDILIntrinsics.td
+++ /dev/null
@@ -1,224 +0,0 @@
-//===- AMDILIntrinsics.td - Defines AMDIL Intrinscs -*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-// This file defines all of the amdil-specific intrinsics
-//
-//===---------------------------------------------------------------===//
-//===--------------------------------------------------------------------===//
-// Intrinsic classes
-// Generic versions of the above classes but for Target specific intrinsics
-// instead of SDNode patterns.
-//===--------------------------------------------------------------------===//
-let TargetPrefix = "AMDIL", isTarget = 1 in {
- class VoidIntLong :
- Intrinsic<[llvm_i64_ty], [], []>;
- class VoidIntInt :
- Intrinsic<[llvm_i32_ty], [], []>;
- class VoidIntBool :
- Intrinsic<[llvm_i32_ty], [], []>;
- class UnaryIntInt :
- Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- class UnaryIntFloat :
- Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- class ConvertIntFTOI :
- Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
- class ConvertIntITOF :
- Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>;
- class UnaryIntNoRetInt :
- Intrinsic<[], [llvm_anyint_ty], []>;
- class UnaryIntNoRetFloat :
- Intrinsic<[], [llvm_anyfloat_ty], []>;
- class BinaryIntInt :
- Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class BinaryIntFloat :
- Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class BinaryIntNoRetInt :
- Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>;
- class BinaryIntNoRetFloat :
- Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>;
- class TernaryIntInt :
- Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class TernaryIntFloat :
- Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class QuaternaryIntInt :
- Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class UnaryAtomicInt :
- Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
- class BinaryAtomicInt :
- Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
- class TernaryAtomicInt :
- Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
- class UnaryAtomicIntNoRet :
- Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
- class BinaryAtomicIntNoRet :
- Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
- class TernaryAtomicIntNoRet :
- Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
-}
-
-let TargetPrefix = "AMDIL", isTarget = 1 in {
- def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt;
-
- def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">,
- UnaryIntInt;
- def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">,
- UnaryIntInt;
- def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">,
- UnaryIntInt;
- def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">,
- UnaryIntInt;
- def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">,
- UnaryIntInt;
- def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">,
- TernaryIntInt;
- def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">,
- TernaryIntInt;
- def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">,
- QuaternaryIntInt;
- def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">,
- TernaryIntInt;
- def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">,
- BinaryIntInt;
- def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">,
- BinaryIntInt;
- def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">,
- BinaryIntInt;
- def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">,
- BinaryIntInt;
- def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">,
- BinaryIntInt;
- def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">,
- BinaryIntInt;
- def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">,
- BinaryIntInt;
- def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">,
- BinaryIntInt;
- def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">,
- BinaryIntInt;
- def int_AMDIL_min : GCCBuiltin<"__amdil_min">,
- BinaryIntFloat;
- def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">,
- BinaryIntInt;
- def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">,
- BinaryIntInt;
- def int_AMDIL_max : GCCBuiltin<"__amdil_max">,
- BinaryIntFloat;
- def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">,
- TernaryIntInt;
- def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">,
- TernaryIntInt;
- def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">,
- TernaryIntInt;
- def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">,
- UnaryIntFloat;
- def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">,
- TernaryIntFloat;
- def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">,
- UnaryIntFloat;
- def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">,
- UnaryIntFloat;
- def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">,
- UnaryIntFloat;
- def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">,
- UnaryIntFloat;
- def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">,
- UnaryIntFloat;
- def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">,
- UnaryIntFloat;
- def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">,
- UnaryIntFloat;
- def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">,
- UnaryIntFloat;
- def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">,
- UnaryIntFloat;
- def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">,
- UnaryIntFloat;
- def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">,
- UnaryIntFloat;
- def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">,
- UnaryIntFloat;
- def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat;
- def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat;
- def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt;
- def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">,
- UnaryIntFloat;
- def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">,
- UnaryIntFloat;
- def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">,
- UnaryIntFloat;
- def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">,
- UnaryIntFloat;
- def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">,
- UnaryIntFloat;
- def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">,
- UnaryIntFloat;
- def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">,
- UnaryIntFloat;
- def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">,
- UnaryIntFloat;
- def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">,
- TernaryIntFloat;
- def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">,
- UnaryIntFloat;
- def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">,
- UnaryIntFloat;
- def int_AMDIL_length : GCCBuiltin<"__amdil_length">,
- UnaryIntFloat;
- def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">,
- TernaryIntFloat;
- def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">,
- Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_i32_ty], []>;
-
- def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">,
- Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>;
- def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">,
- Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>;
- def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], []>;
- def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">,
- ConvertIntITOF;
- def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">,
- ConvertIntFTOI;
- def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">,
- ConvertIntFTOI;
- def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">,
- ConvertIntFTOI;
- def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">,
- ConvertIntFTOI;
- def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">,
- ConvertIntFTOI;
- def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">,
- ConvertIntFTOI;
- def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">,
- Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>;
- def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">,
- ConvertIntITOF;
- def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">,
- ConvertIntITOF;
- def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">,
- ConvertIntITOF;
- def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">,
- ConvertIntITOF;
- def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">,
- Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
- llvm_v2f32_ty, llvm_float_ty], []>;
- def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">,
- Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
- llvm_v2f32_ty], []>;
- def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">,
- Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
- llvm_v4f32_ty], []>;
- def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">,
- Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
- llvm_v4f32_ty], []>;
-}
diff --git a/lib/Target/R600/AMDILRegisterInfo.td b/lib/Target/R600/AMDILRegisterInfo.td
deleted file mode 100644
index b9d0334..0000000
--- a/lib/Target/R600/AMDILRegisterInfo.td
+++ /dev/null
@@ -1,107 +0,0 @@
-//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-// Declarations that describe the AMDIL register file
-//
-//===----------------------------------------------------------------------===//
-
-class AMDILReg<bits<16> num, string n> : Register<n> {
- field bits<16> Value;
- let Value = num;
- let Namespace = "AMDGPU";
-}
-
-// We will start with 8 registers for each class before expanding to more
-// Since the swizzle is added based on the register class, we can leave it
-// off here and just specify different registers for different register classes
-def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>;
-def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>;
-def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>;
-def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>;
-def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>;
-def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>;
-def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>;
-def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>;
-def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>;
-def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>;
-def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>;
-def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>;
-def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>;
-def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>;
-def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>;
-def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>;
-def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>;
-def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>;
-def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>;
-def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>;
-
-// All registers between 1000 and 1024 are reserved and cannot be used
-// unless commented in this section
-// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's
-// r1020 is used to hold the frame index for local arrays
-// r1019 is used to hold the dynamic stack allocation pointer
-// r1018 is used as a temporary register for handwritten code
-// r1017 is used as a temporary register for handwritten code
-// r1016 is used as a temporary register for load/store code
-// r1015 is used as a temporary register for data segment offset
-// r1014 is used as a temporary register for store code
-// r1013 is used as the section data pointer register
-// r1012-r1010 and r1001-r1008 are used for temporary I/O registers
-// r1009 is used as the frame pointer register
-// r999 is used as the mem register.
-// r998 is used as the return address register.
-//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>;
-//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>;
-//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>;
-//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>;
-//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>;
-//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>;
-def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>;
-def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>;
-def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>;
-def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>;
-def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>;
-def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>;
-def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>;
-def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>;
-def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>;
-def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>;
-def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>;
-def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>;
-def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>;
-def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>;
-def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>;
-def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>;
-def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>;
-def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>;
-def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>;
-def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>;
-def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>;
-def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>;
-def GPRI16 : RegisterClass<"AMDGPU", [i16], 16,
- (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
- let AltOrders = [(add (sequence "R%u", 1, 20))];
- let AltOrderSelect = [{
- return 1;
- }];
- }
-def GPRI32 : RegisterClass<"AMDGPU", [i32], 32,
- (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
- let AltOrders = [(add (sequence "R%u", 1, 20))];
- let AltOrderSelect = [{
- return 1;
- }];
- }
-def GPRF32 : RegisterClass<"AMDGPU", [f32], 32,
- (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
- let AltOrders = [(add (sequence "R%u", 1, 20))];
- let AltOrderSelect = [{
- return 1;
- }];
- }
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 3c6fa5a..4d16082 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -13,10 +13,9 @@ add_public_tablegen_target(AMDGPUCommonTableGen)
add_llvm_target(R600CodeGen
AMDILCFGStructurizer.cpp
- AMDILIntrinsicInfo.cpp
- AMDILISelLowering.cpp
AMDGPUAsmPrinter.cpp
AMDGPUFrameLowering.cpp
+ AMDGPUIntrinsicInfo.cpp
AMDGPUISelDAGToDAG.cpp
AMDGPUMCInstLower.cpp
AMDGPUMachineFunction.cpp
@@ -24,8 +23,8 @@ add_llvm_target(R600CodeGen
AMDGPUTargetMachine.cpp
AMDGPUTargetTransformInfo.cpp
AMDGPUISelLowering.cpp
- AMDGPUConvertToISA.cpp
AMDGPUInstrInfo.cpp
+ AMDGPUPromoteAlloca.cpp
AMDGPURegisterInfo.cpp
R600ClauseMergePass.cpp
R600ControlFlowFinalizer.cpp
@@ -41,6 +40,7 @@ add_llvm_target(R600CodeGen
R600TextureIntrinsicsReplacer.cpp
SIAnnotateControlFlow.cpp
SIFixSGPRCopies.cpp
+ SIFixSGPRLiveRanges.cpp
SIInsertWaits.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td
index 2065441..dcb7e98 100644
--- a/lib/Target/R600/EvergreenInstructions.td
+++ b/lib/Target/R600/EvergreenInstructions.td
@@ -295,7 +295,7 @@ def : Pat<(i32 (sext_inreg i32:$src, i8)),
def : Pat<(i32 (sext_inreg i32:$src, i16)),
(BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>;
-defm : BFIPatterns <BFI_INT_eg>;
+defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32>;
def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
[(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))],
@@ -326,6 +326,8 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
def DOT4_eg : DOT4_Common<0xBE>;
defm CUBE_eg : CUBE_Common<0xC0>;
+def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
+
let hasSideEffects = 1 in {
def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
}
@@ -346,7 +348,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
def GROUP_BARRIER : InstR600 <
- (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local)], AnyALU>,
+ (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>,
R600ALU_Word0,
R600ALU_Word1_OP2 <0x54> {
@@ -375,6 +377,11 @@ def GROUP_BARRIER : InstR600 <
let ALUInst = 1;
}
+def : Pat <
+ (int_AMDGPU_barrier_global),
+ (GROUP_BARRIER)
+>;
+
//===----------------------------------------------------------------------===//
// LDS Instructions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 11ae091..0927040 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -99,9 +99,9 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
return;
}
- // The low 8 bits encoding value is the register index, for both VGPRs and
- // SGPRs.
- unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1);
+ // The low 8 bits of the encoding value is the register index, for both VGPRs
+ // and SGPRs.
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1);
if (NumRegs == 1) {
O << Type << RegIdx;
return;
@@ -216,13 +216,8 @@ void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- union Literal {
- float f;
- int32_t i;
- } L;
-
- L.i = MI->getOperand(OpNo).getImm();
- O << L.i << "(" << L.f << ")";
+ int32_t Imm = MI->getOperand(OpNo).getImm();
+ O << Imm << '(' << BitsToFloat(Imm) << ')';
}
void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
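
The printLiteral rewrite above drops the union in favor of LLVM's BitsToFloat, which reinterprets the literal's 32 bits as an IEEE-754 float. Reading the inactive member of a union is not behavior standard C++ guarantees; memcpy is the portable spelling of the same bit cast. A standalone equivalent:

    #include <cstdint>
    #include <cstring>

    // Reinterpret a 32-bit immediate's bit pattern as a float,
    // as BitsToFloat does inside LLVM.
    float bitsToFloat(int32_t Imm) {
      static_assert(sizeof(float) == sizeof(Imm), "requires 32-bit float");
      float F;
      std::memcpy(&F, &Imm, sizeof(F));
      return F;
    }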
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 5e7cefe..dc1344f 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -172,17 +172,13 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
SmallVectorImpl<MCFixup> &Fixup,
const MCSubtargetInfo &STI) const {
if (MO.isReg()) {
- if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) {
+ if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags))
return MRI.getEncodingValue(MO.getReg());
- } else {
- return getHWReg(MO.getReg());
- }
- } else if (MO.isImm()) {
- return MO.getImm();
- } else {
- assert(0);
- return 0;
+ return getHWReg(MO.getReg());
}
+
+ assert(MO.isImm());
+ return MO.getImm();
}
#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index d255e96..d98a6db 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -14,6 +14,7 @@
#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index d6c6830..7f3560a 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -13,6 +13,9 @@
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
@@ -65,6 +68,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::FSUB, MVT::f32, Expand);
@@ -133,19 +137,47 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
setTargetDAGCombine(ISD::FP_ROUND);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setOperationAction(ISD::SUB, MVT::i64, Expand);
+
// These should be replaced by UDVIREM, but it does not happen automatically
// during Type Legalization
setOperationAction(ISD::UDIV, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i64, Custom);
+ setOperationAction(ISD::SDIV, MVT::i64, Custom);
+ setOperationAction(ISD::SREM, MVT::i64, Custom);
+
+ // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
+ // to be Legal/Custom in order to avoid library calls.
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+ for (MVT VT : ScalarIntVTs) {
+ setOperationAction(ISD::ADDC, VT, Expand);
+ setOperationAction(ISD::SUBC, VT, Expand);
+ setOperationAction(ISD::ADDE, VT, Expand);
+ setOperationAction(ISD::SUBE, VT, Expand);
+ }
+
setBooleanContents(ZeroOrNegativeOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
setSchedulingPreference(Sched::Source);
@@ -537,11 +569,24 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
case ISD::FCOS:
case ISD::FSIN: return LowerTrig(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
- case ISD::LOAD: return LowerLOAD(Op, DAG);
+ case ISD::LOAD: {
+ SDValue Result = LowerLOAD(Op, DAG);
+ assert((!Result.getNode() ||
+ Result.getNode()->getNumValues() == 2) &&
+ "Load should return a value and a chain");
+ return Result;
+ }
+
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
case ISD::INTRINSIC_VOID: {
SDValue Chain = Op.getOperand(0);
@@ -776,6 +821,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T0_Z, VT);
+ case Intrinsic::AMDGPU_rsq:
+ // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
+ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
}
// break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
break;
@@ -793,20 +841,172 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
return;
case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
return;
- case ISD::LOAD: {
- SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
- Results.push_back(SDValue(Node, 0));
- Results.push_back(SDValue(Node, 1));
- // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
- // function
- DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
- return;
+ case ISD::UDIV: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
+ N->getOperand(0), N->getOperand(1));
+ Results.push_back(UDIVREM);
+ break;
}
- case ISD::STORE:
- SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
- Results.push_back(SDValue(Node, 0));
- return;
+ case ISD::UREM: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
+ N->getOperand(0), N->getOperand(1));
+ Results.push_back(UDIVREM.getValue(1));
+ break;
+ }
+ case ISD::SDIV: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
+ N->getOperand(0), N->getOperand(1));
+ Results.push_back(SDIVREM);
+ break;
+ }
+ case ISD::SREM: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
+ N->getOperand(0), N->getOperand(1));
+ Results.push_back(SDIVREM.getValue(1));
+ break;
+ }
+ case ISD::SDIVREM: {
+ SDValue Op = SDValue(N, 1);
+ SDValue RES = LowerSDIVREM(Op, DAG);
+ Results.push_back(RES);
+ Results.push_back(RES.getValue(1));
+ break;
+ }
+ case ISD::UDIVREM: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+ SDValue one = DAG.getConstant(1, HalfVT);
+ SDValue zero = DAG.getConstant(0, HalfVT);
+
+    // HiLo split
+ SDValue LHS = N->getOperand(0);
+ SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
+ SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
+
+ SDValue RHS = N->getOperand(1);
+ SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
+ SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
+
+ // Get Speculative values
+ SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
+ SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
+
+ SDValue REM_Hi = zero;
+ SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
+
+ SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
+ SDValue DIV_Lo = zero;
+
+ const unsigned halfBitWidth = HalfVT.getSizeInBits();
+
+ for (unsigned i = 0; i < halfBitWidth; ++i) {
+ SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
+ // Get Value of high bit
+ SDValue HBit;
+ if (halfBitWidth == 32 && Subtarget->hasBFE()) {
+ HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
+ } else {
+ HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
+ HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
+ }
+
+ SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
+ DAG.getConstant(halfBitWidth - 1, HalfVT));
+ REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
+ REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
+
+ REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
+ REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
+
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+
+ SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
+ SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
+
+ DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
+
+ // Update REM
+
+ SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
+
+ REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
+ REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
+ REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
+ }
+
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+ SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+ Results.push_back(DIV);
+ Results.push_back(REM);
+ break;
}
+ }
+}
+
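The UDIVREM expansion above is a restoring shift-subtract division expressed as DAG nodes: the remainder is shifted left one bit at a time, the next dividend bit is brought in through BFE_U32 (or SRL plus AND), and the divisor is conditionally subtracted while the matching quotient bit is set. A minimal scalar sketch of the same algorithm (hypothetical helper; the real code builds SelectionDAG nodes and additionally speculates the high half with a scalar UDIV/UREM first):

    #include <cstdint>

    // Restoring division, mirroring the bit loop in the DAG expansion above.
    static void udivrem64(uint64_t LHS, uint64_t RHS,
                          uint64_t &Div, uint64_t &Rem) {
      Div = 0;
      Rem = 0;
      for (int i = 63; i >= 0; --i) {
        Rem = (Rem << 1) | ((LHS >> i) & 1); // Shift in next dividend bit.
        if (Rem >= RHS) {                    // The SETGE select above.
          Rem -= RHS;                        // Conditional subtract.
          Div |= 1ull << i;                  // Set the quotient bit.
        }
      }
    }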
+SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
+ SDValue Vector) const {
+
+ SDLoc DL(Vector);
+ EVT VecVT = Vector.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ SmallVector<SDValue, 8> Args;
+
+ for (unsigned i = 0, e = VecVT.getVectorNumElements();
+ i != e; ++i) {
+ Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+ Vector, DAG.getConstant(i, getVectorIdxTy())));
+ }
+
+ return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
+}
+
+SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ SDLoc DL(Op);
+ SDValue Vector = Op.getOperand(0);
+ SDValue Index = Op.getOperand(1);
+
+ if (isa<ConstantSDNode>(Index) ||
+ Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+ return Op;
+
+ Vector = vectorToVerticalVector(DAG, Vector);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
+ Vector, Index);
+}
+
+SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Vector = Op.getOperand(0);
+ SDValue Value = Op.getOperand(1);
+ SDValue Index = Op.getOperand(2);
+
+ if (isa<ConstantSDNode>(Index) ||
+ Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+ return Op;
+
+ Vector = vectorToVerticalVector(DAG, Vector);
+ SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
+ Vector, Value, Index);
+ return vectorToVerticalVector(DAG, Insert);
}
SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
@@ -840,6 +1040,80 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstantFP(3.14159265359, MVT::f32));
}
+SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+ SDValue Shift = Op.getOperand(2);
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue One = DAG.getConstant(1, VT);
+
+ SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
+ SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
+ SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
+ SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
+
+  // The dance around Width1 is necessary for the Shift == 0 special case.
+  // Without it, CompShift could be 32, producing incorrect results in
+  // Overflow. So we do the shift in two steps; the alternative would be to
+  // add a conditional to filter out the special case.
+
+ SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
+ Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
+
+ SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
+ HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
+ SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
+
+ SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
+ SDValue LoBig = Zero;
+
+ Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
+ Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+}
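To see why the two-step shift matters: with CompShift = Width1 - Shift, a Shift of 0 gives Lo >> 31 >> 1, which correctly produces 0, whereas the naive Lo >> (32 - Shift) would request a shift by 32 that the hardware masks back to 0. A scalar model of the small-shift path, assuming 0 <= Shift < 32 (the Shift >= 32 case is selected separately via HiBig/LoBig); LowerSRXParts below applies the same trick mirrored with left shifts:

    #include <cstdint>

    // Scalar sketch of the SHL_PARTS small-shift path built above.
    static void shl64Parts(uint32_t Lo, uint32_t Hi, uint32_t Shift,
                           uint32_t &OutLo, uint32_t &OutHi) {
      // Two steps avoid an out-of-range (Lo >> 32) when Shift == 0.
      uint32_t Overflow = (Lo >> (31 - Shift)) >> 1;
      OutHi = (Hi << Shift) | Overflow;
      OutLo = Lo << Shift;
    }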
+
+SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+ SDValue Shift = Op.getOperand(2);
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue One = DAG.getConstant(1, VT);
+
+ const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
+
+ SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
+ SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
+ SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
+ SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
+
+  // The dance around Width1 is necessary for the Shift == 0 special case.
+  // Without it, CompShift could be 32, producing incorrect results in
+  // Overflow. So we do the shift in two steps; the alternative would be to
+  // add a conditional to filter out the special case.
+
+ SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
+ Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
+
+ SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
+ SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
+ LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
+
+ SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
+ SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
+
+ Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
+ Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+}
+
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(
ISD::SETCC,
@@ -1369,6 +1643,15 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
return DAG.getMergeValues(Ops, DL);
}
+SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Jump = Op.getOperand(2);
+
+ return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
+ Chain, Jump, Cond);
+}
+
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
@@ -1902,9 +2185,8 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SDValue FakeOp;
std::vector<SDValue> Ops;
- for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
- I != E; ++I)
- Ops.push_back(*I);
+ for (const SDUse &I : Node->ops())
+ Ops.push_back(I);
if (Opcode == AMDGPU::DOT_4) {
int OperandIdx[] = {
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index a8a464f..d22c8c9 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -51,15 +51,18 @@ private:
void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
MachineRegisterInfo & MRI, unsigned dword_offset) const;
SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const;
+ SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
- /// \brief Lower ROTL opcode to BITALIGN
- SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
-
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const;
SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index b0d9ae3..3972e2f 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -28,10 +28,9 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenDFAPacketizer.inc"
-R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
- : AMDGPUInstrInfo(tm),
- RI(tm),
- ST(tm.getSubtarget<AMDGPUSubtarget>())
+R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st)
+ : AMDGPUInstrInfo(st),
+ RI(st)
{ }
const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const {
@@ -52,11 +51,15 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
unsigned VectorComponents = 0;
- if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
- AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
+ if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
+ AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
+ (AMDGPU::R600_Reg128RegClass.contains(SrcReg) ||
+ AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) {
VectorComponents = 4;
- } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
- AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
+  } else if ((AMDGPU::R600_Reg64RegClass.contains(DestReg) ||
+ AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) &&
+ (AMDGPU::R600_Reg64RegClass.contains(SrcReg) ||
+ AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) {
VectorComponents = 2;
}
@@ -768,16 +771,6 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return true;
}
-int R600InstrInfo::getBranchInstr(const MachineOperand &op) const {
- const MachineInstr *MI = op.getParent();
-
- switch (MI->getDesc().OpInfo->RegClass) {
- default: // FIXME: fallthrough??
- case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32;
- case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32;
- };
-}
-
static
MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend();
@@ -1064,10 +1057,34 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
return 2;
}
+bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  switch (MI->getOpcode()) {
+ default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+ case AMDGPU::R600_EXTRACT_ELT_V2:
+ case AMDGPU::R600_EXTRACT_ELT_V4:
+ buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(),
+ RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address
+ MI->getOperand(2).getReg(),
+ RI.getHWRegChan(MI->getOperand(1).getReg()));
+ break;
+ case AMDGPU::R600_INSERT_ELT_V2:
+ case AMDGPU::R600_INSERT_ELT_V4:
+ buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value
+ RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address
+ MI->getOperand(3).getReg(), // Offset
+ RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel
+ break;
+ }
+ MI->eraseFromParent();
+ return true;
+}
+
void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const {
const AMDGPUFrameLowering *TFL =
- static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering());
+ static_cast<const AMDGPUFrameLowering*>(
+ MF.getTarget().getFrameLowering());
unsigned StackWidth = TFL->getStackWidth(MF);
int End = getIndirectIndexEnd(MF);
@@ -1100,7 +1117,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned ValueReg, unsigned Address,
unsigned OffsetReg) const {
- unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
+ return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0);
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const {
+ unsigned AddrReg;
+ switch (AddrChan) {
+ default: llvm_unreachable("Invalid Channel");
+ case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
+ case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
+ case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
+ case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+ }
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
AMDGPU::AR_X, OffsetReg);
setImmOperand(MOVA, AMDGPU::OpName::write, 0);
@@ -1117,7 +1149,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned ValueReg, unsigned Address,
unsigned OffsetReg) const {
- unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
+ return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0);
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const {
+ unsigned AddrReg;
+ switch (AddrChan) {
+ default: llvm_unreachable("Invalid Channel");
+ case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
+ case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
+ case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
+ case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+ }
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
AMDGPU::AR_X,
OffsetReg);
@@ -1220,7 +1267,6 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
const {
assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
unsigned Opcode;
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::R700)
Opcode = AMDGPU::DOT4_r600;
else
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index b5304a0..45a57d3 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -32,12 +32,22 @@ namespace llvm {
class R600InstrInfo : public AMDGPUInstrInfo {
private:
const R600RegisterInfo RI;
- const AMDGPUSubtarget &ST;
- int getBranchInstr(const MachineOperand &op) const;
std::vector<std::pair<int, unsigned> >
ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const;
+
+ MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const;
+
+ MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const;
public:
enum BankSwizzle {
ALU_VEC_012_SCL_210 = 0,
@@ -48,7 +58,7 @@ namespace llvm {
ALU_VEC_210
};
- explicit R600InstrInfo(AMDGPUTargetMachine &tm);
+ explicit R600InstrInfo(const AMDGPUSubtarget &st);
const R600RegisterInfo &getRegisterInfo() const override;
void copyPhysReg(MachineBasicBlock &MBB,
@@ -197,6 +207,8 @@ namespace llvm {
int getInstrLatency(const InstrItineraryData *ItinData,
SDNode *Node) const override { return 1;}
+    bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+
    /// \brief Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const;
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 590fde2..73fa345 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -125,7 +125,7 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
InstrItinClass itin = AnyALU> :
R600_1OP <inst, opName,
- [(set R600_Reg32:$dst, (node R600_Reg32:$src0))]
+ [(set R600_Reg32:$dst, (node R600_Reg32:$src0))], itin
>;
// If you add or change the operands for R600_2OP instructions, you must
@@ -161,10 +161,10 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
}
class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
- InstrItinClass itim = AnyALU> :
+ InstrItinClass itin = AnyALU> :
R600_2OP <inst, opName,
[(set R600_Reg32:$dst, (node R600_Reg32:$src0,
- R600_Reg32:$src1))]
+ R600_Reg32:$src1))], itin
>;
// If you add or change the operands for R600_3OP instructions, you must
@@ -721,14 +721,11 @@ def SETNE_DX10 : R600_2OP <
>;
def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
-def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>;
+def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>;
def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
-// Add also ftrunc intrinsic pattern
-def : Pat<(ftrunc f32:$src0), (TRUNC $src0)>;
-
def MOV : R600_1OP <0x19, "MOV", []>;
let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
@@ -1082,18 +1079,21 @@ class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
let Itinerary = TransALU;
}
+// Clamped to maximum.
class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
- inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq
+ inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped
> {
let Itinerary = TransALU;
}
-class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
- inst, "RECIPSQRT_IEEE", []
+class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy
> {
let Itinerary = TransALU;
}
+// TODO: There is also RECIPSQRT_FF which clamps to zero.
+
class SIN_Common <bits<11> inst> : R600_1OP <
inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{
let Trig = 1;
@@ -1266,13 +1266,6 @@ defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
//===----------------------------------------------------------------------===//
-// Branch Instructions
-//===----------------------------------------------------------------------===//
-
-def IF_PREDICATE_SET : ILFormat<(outs), (ins GPRI32:$src),
- "IF_PREDICATE_SET $src", []>;
-
-//===----------------------------------------------------------------------===//
// Pseudo instructions
//===----------------------------------------------------------------------===//
@@ -1345,15 +1338,6 @@ def TXD_SHADOW: InstR600 <
} // End isPseudo = 1
} // End usesCustomInserter = 1
-//===---------------------------------------------------------------------===//
-// Return instruction
-//===---------------------------------------------------------------------===//
-let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
- usesCustomInserter = 1 in {
- def RETURN : ILFormat<(outs), (ins variable_ops),
- "RETURN", [(IL_retflag)]>;
-}
-
//===----------------------------------------------------------------------===//
// Constant Buffer Addressing Support
@@ -1480,11 +1464,52 @@ let Inst{63-32} = Word1;
let VTXInst = 1;
}
+//===---------------------------------------------------------------------===//
+// Flow and Program control Instructions
+//===---------------------------------------------------------------------===//
+class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+ let Namespace = "AMDGPU";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let Pattern = pattern;
+ let AsmString = !strconcat(asmstr, "\n");
+ let isPseudo = 1;
+ let Itinerary = NullALU;
+ bit hasIEEEFlag = 0;
+ bit hasZeroOpFlag = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+}
+
+multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
+ def _i32 : ILFormat<(outs),
+ (ins brtarget:$target, rci:$src0),
+ "; i32 Pseudo branch instruction",
+ [(Op bb:$target, (i32 rci:$src0))]>;
+ def _f32 : ILFormat<(outs),
+ (ins brtarget:$target, rcf:$src0),
+ "; f32 Pseudo branch instruction",
+ [(Op bb:$target, (f32 rcf:$src0))]>;
+}
+
+// Only scalar types should generate flow control
+multiclass BranchInstr<string name> {
+ def _i32 : ILFormat<(outs), (ins R600_Reg32:$src),
+ !strconcat(name, " $src"), []>;
+ def _f32 : ILFormat<(outs), (ins R600_Reg32:$src),
+ !strconcat(name, " $src"), []>;
+}
+// Only scalar types should generate flow control
+multiclass BranchInstr2<string name> {
+ def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
+ !strconcat(name, " $src0, $src1"), []>;
+ def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
+ !strconcat(name, " $src0, $src1"), []>;
+}
-
-//===--------------------------------------------------------------------===//
-// Instructions support
-//===--------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
// Custom Inserter for Branches and returns, this eventually will be a
// separate pass
@@ -1497,13 +1522,22 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
}
//===---------------------------------------------------------------------===//
-// Flow and Program control Instructions
+// Return instruction
//===---------------------------------------------------------------------===//
+let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
+ usesCustomInserter = 1 in {
+ def RETURN : ILFormat<(outs), (ins variable_ops),
+ "RETURN", [(IL_retflag)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Instructions
+//===----------------------------------------------------------------------===//
+
+def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src),
+ "IF_PREDICATE_SET $src", []>;
+
let isTerminator=1 in {
- def SWITCH : ILFormat< (outs), (ins GPRI32:$src),
- !strconcat("SWITCH", " $src"), []>;
- def CASE : ILFormat< (outs), (ins GPRI32:$src),
- !strconcat("CASE", " $src"), []>;
def BREAK : ILFormat< (outs), (ins),
"BREAK", []>;
def CONTINUE : ILFormat< (outs), (ins),
@@ -1548,6 +1582,60 @@ let isTerminator=1 in {
}
//===----------------------------------------------------------------------===//
+// Indirect addressing pseudo instructions
+//===----------------------------------------------------------------------===//
+
+let isPseudo = 1 in {
+
+class ExtractVertical <RegisterClass vec_rc> : InstR600 <
+ (outs R600_Reg32:$dst),
+ (ins vec_rc:$vec, R600_Reg32:$index), "",
+ [],
+ AnyALU
+>;
+
+let Constraints = "$dst = $vec" in {
+
+class InsertVertical <RegisterClass vec_rc> : InstR600 <
+ (outs vec_rc:$dst),
+ (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "",
+ [],
+ AnyALU
+>;
+
+} // End Constraints = "$dst = $vec"
+
+} // End isPseudo = 1
+
+def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>;
+def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>;
+
+def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>;
+def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>;
+
+class ExtractVerticalPat <Instruction inst, ValueType vec_ty,
+ ValueType scalar_ty> : Pat <
+ (scalar_ty (extractelt vec_ty:$vec, i32:$index)),
+ (inst $vec, $index)
+>;
+
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>;
+
+class InsertVerticalPat <Instruction inst, ValueType vec_ty,
+ ValueType scalar_ty> : Pat <
+ (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)),
+ (inst $vec, $value, $index)
+>;
+
+def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
+
+//===----------------------------------------------------------------------===//
// ISel Patterns
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index d1655d1..7ea654c 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "R600MachineScheduler.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index c2f6c03..74cf309 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -16,6 +16,7 @@
#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp
index f3bb88b..dc95675 100644
--- a/lib/Target/R600/R600RegisterInfo.cpp
+++ b/lib/Target/R600/R600RegisterInfo.cpp
@@ -20,15 +20,14 @@
using namespace llvm;
-R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm)
-: AMDGPURegisterInfo(tm),
- TM(tm)
+R600RegisterInfo::R600RegisterInfo(const AMDGPUSubtarget &st)
+: AMDGPURegisterInfo(st)
{ RCW.RegWeight = 0; RCW.WeightLimit = 0;}
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
+ const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(ST.getInstrInfo());
Reserved.set(AMDGPU::ZERO);
Reserved.set(AMDGPU::HALF);
@@ -55,16 +54,6 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
-const TargetRegisterClass *
-R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
- switch (rc->getID()) {
- case AMDGPU::GPRF32RegClassID:
- case AMDGPU::GPRI32RegClassID:
- return &AMDGPU::R600_Reg32RegClass;
- default: return rc;
- }
-}
-
unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
}
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index 52e1a4b..247808b 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -16,26 +16,18 @@
#define R600REGISTERINFO_H_
#include "AMDGPURegisterInfo.h"
-#include "AMDGPUTargetMachine.h"
namespace llvm {
-class R600TargetMachine;
+class AMDGPUSubtarget;
struct R600RegisterInfo : public AMDGPURegisterInfo {
- AMDGPUTargetMachine &TM;
RegClassWeight RCW;
- R600RegisterInfo(AMDGPUTargetMachine &tm);
+ R600RegisterInfo(const AMDGPUSubtarget &st);
BitVector getReservedRegs(const MachineFunction &MF) const override;
- /// \param RC is an AMDIL reg class.
- ///
- /// \returns the R600 reg class that is equivalent to \p RC.
- const TargetRegisterClass *getISARegClass(
- const TargetRegisterClass *RC) const override;
-
/// \brief get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 68bcd20..cc667d9 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -18,18 +18,28 @@ class R600RegWithChan <string name, bits<9> sel, string chan> :
class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
RegisterWithSubRegs<n, subregs> {
+ field bits<2> chan_encoding = 0;
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1, sub2, sub3];
- let HWEncoding = encoding;
+ let HWEncoding{8-0} = encoding{8-0};
+ let HWEncoding{10-9} = chan_encoding;
}
class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
RegisterWithSubRegs<n, subregs> {
+ field bits<2> chan_encoding = 0;
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = encoding;
+ let HWEncoding{8-0} = encoding{8-0};
+ let HWEncoding{10-9} = chan_encoding;
}
+class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 <
+ "V"#lo#hi#"_"#chan,
+ [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)],
+ lo
+>;
foreach Index = 0-127 in {
foreach Chan = [ "X", "Y", "Z", "W" ] in {
@@ -54,6 +64,24 @@ foreach Index = 0-127 in {
Index>;
}
+foreach Chan = [ "X", "Y", "Z", "W"] in {
+
+ let chan_encoding = !if(!eq(Chan, "X"), 0,
+ !if(!eq(Chan, "Y"), 1,
+ !if(!eq(Chan, "Z"), 2,
+ !if(!eq(Chan, "W"), 3, 0)))) in {
+ def V0123_#Chan : R600Reg_128 <"V0123_"#Chan,
+ [!cast<Register>("T0_"#Chan),
+ !cast<Register>("T1_"#Chan),
+ !cast<Register>("T2_"#Chan),
+ !cast<Register>("T3_"#Chan)],
+ 0>;
+ def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>;
+ def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>;
+ }
+}
+
+
// KCACHE_BANK0
foreach Index = 159-128 in {
foreach Chan = [ "X", "Y", "Z", "W" ] in {
@@ -130,8 +158,14 @@ def ALU_PARAM : R600Reg<"Param", 0>;
let isAllocatable = 0 in {
-// XXX: Only use the X channel, until we support wider stack widths
-def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>;
+def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>;
+
+// We only use Addr_[YZW] for vertical vectors.
+// FIXME: If we add more vertical vector registers we will need to add more
+// registers to these classes.
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>;
+def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>;
+def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>;
def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
(add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
@@ -206,5 +240,13 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
let CopyCost = -1;
}
+def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
+ (add V0123_W, V0123_Z, V0123_Y, V0123_X)
+>;
+
def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
(add (sequence "T%u_XY", 0, 63))>;
+
+def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+ (add V01_X, V01_Y, V01_Z, V01_W,
+ V23_X, V23_Y, V23_Z, V23_W)>;
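With the widened HWEncoding, a register encoding now carries both the index (bits 8-0) and the channel (bits 10-9), which is what getHWRegChan and getHWRegIndex in R600RegisterInfo.cpp pull back apart. A standalone sketch of the decode, assuming HW_CHAN_SHIFT is 9 and the index mask is 0x1FF as implied by the bit slices above (the real constants live in R600Defines.h):

    #include <cstdint>

    constexpr unsigned HW_CHAN_SHIFT = 9;  // Assumed; see R600Defines.h.

    static unsigned hwRegIndex(uint16_t Encoding) {
      return Encoding & 0x1FF;             // Bits 8-0: register index.
    }

    static unsigned hwRegChan(uint16_t Encoding) {
      return Encoding >> HW_CHAN_SHIFT;    // Bits 10-9: X=0, Y=1, Z=2, W=3.
    }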
diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp
index d6e4451..91eb60b 100644
--- a/lib/Target/R600/SIAnnotateControlFlow.cpp
+++ b/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -65,7 +65,6 @@ class SIAnnotateControlFlow : public FunctionPass {
DominatorTree *DT;
StackVector Stack;
- SSAUpdater PhiInserter;
bool isTopOfStack(BasicBlock *BB);
@@ -81,7 +80,7 @@ class SIAnnotateControlFlow : public FunctionPass {
void insertElse(BranchInst *Term);
- void handleLoopCondition(Value *Cond);
+ Value *handleLoopCondition(Value *Cond, PHINode *Broken);
void handleLoop(BranchInst *Term);
@@ -177,7 +176,7 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
} else {
if (Phi->getIncomingValue(i) != BoolFalse)
return false;
-
+
}
}
return true;
@@ -204,20 +203,26 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
}
/// \brief Recursively handle the condition leading to a loop
-void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) {
+Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken) {
if (PHINode *Phi = dyn_cast<PHINode>(Cond)) {
+ BasicBlock *Parent = Phi->getParent();
+ PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front());
+ Value *Ret = NewPhi;
// Handle all non-constant incoming values first
for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
Value *Incoming = Phi->getIncomingValue(i);
- if (isa<ConstantInt>(Incoming))
+ BasicBlock *From = Phi->getIncomingBlock(i);
+ if (isa<ConstantInt>(Incoming)) {
+ NewPhi->addIncoming(Broken, From);
continue;
+ }
Phi->setIncomingValue(i, BoolFalse);
- handleLoopCondition(Incoming);
+ Value *PhiArg = handleLoopCondition(Incoming, Broken);
+ NewPhi->addIncoming(PhiArg, From);
}
- BasicBlock *Parent = Phi->getParent();
BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
@@ -230,33 +235,28 @@ void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) {
if (From == IDom) {
CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
- Value *Args[] = {
- OldEnd->getArgOperand(0),
- PhiInserter.GetValueAtEndOfBlock(Parent)
- };
- Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
- PhiInserter.AddAvailableValue(Parent, Ret);
+ Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
+ Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
continue;
}
}
-
TerminatorInst *Insert = From->getTerminator();
- Value *Arg = PhiInserter.GetValueAtEndOfBlock(From);
- Value *Ret = CallInst::Create(Break, Arg, "", Insert);
- PhiInserter.AddAvailableValue(From, Ret);
+ Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
+ NewPhi->setIncomingValue(i, PhiArg);
}
eraseIfUnused(Phi);
+ return Ret;
} else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
BasicBlock *Parent = Inst->getParent();
TerminatorInst *Insert = Parent->getTerminator();
- Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) };
- Value *Ret = CallInst::Create(IfBreak, Args, "", Insert);
- PhiInserter.AddAvailableValue(Parent, Ret);
+ Value *Args[] = { Cond, Broken };
+ return CallInst::Create(IfBreak, Args, "", Insert);
} else {
llvm_unreachable("Unhandled loop condition!");
}
+ return 0;
}
/// \brief Handle a back edge (loop)
@@ -264,15 +264,11 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
BasicBlock *Target = Term->getSuccessor(1);
PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
- PhiInserter.Initialize(Int64, "");
- PhiInserter.AddAvailableValue(Target, Broken);
-
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
- handleLoopCondition(Cond);
+ Value *Arg = handleLoopCondition(Cond, Broken);
BasicBlock *BB = Term->getParent();
- Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB);
for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
PI != PE; ++PI) {
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 2cbce28..4d31a11 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -35,4 +35,54 @@ enum {
#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
+
+#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848
+#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0)
+#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F)
+#define C_00B848_VGPRS 0xFFFFFFC0
+#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6)
+#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F)
+#define C_00B848_SGPRS 0xFFFFFC3F
+#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10)
+#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03)
+#define C_00B848_PRIORITY 0xFFFFF3FF
+#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12)
+#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF)
+#define C_00B848_FLOAT_MODE 0xFFF00FFF
+#define S_00B848_PRIV(x) (((x) & 0x1) << 20)
+#define G_00B848_PRIV(x) (((x) >> 20) & 0x1)
+#define C_00B848_PRIV 0xFFEFFFFF
+#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21)
+#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1)
+#define C_00B848_DX10_CLAMP 0xFFDFFFFF
+#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22)
+#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1)
+#define C_00B848_DEBUG_MODE 0xFFBFFFFF
+#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23)
+#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1)
+#define C_00B848_IEEE_MODE 0xFF7FFFFF
+
+
+// Helpers for setting FLOAT_MODE
+#define FP_ROUND_ROUND_TO_NEAREST 0
+#define FP_ROUND_ROUND_TO_INF 1
+#define FP_ROUND_ROUND_TO_NEGINF 2
+#define FP_ROUND_ROUND_TO_ZERO 3
+
+// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double
+// precision.
+#define FP_ROUND_MODE_SP(x) ((x) & 0x3)
+#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2)
+
+#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0
+#define FP_DENORM_FLUSH_OUT 1
+#define FP_DENORM_FLUSH_IN 2
+#define FP_DENORM_FLUSH_NONE 3
+
+
+// Bits 7:4 control denormal handling. 5:4 control single precision, 7:6 double
+// precision.
+#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
+#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
+
#endif // SIDEFINES_H_
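The S_*/G_*/C_* triples follow the usual Radeon register-header convention: S_ shifts a field value into position, G_ extracts it, and C_ is the AND mask that clears it before a rewrite. A usage sketch, assuming SIDefines.h is on the include path; the field granularities (VGPRs in units of 4, SGPRs in units of 8) are an assumption about the SI programming model, not something this header states:

    #include <cstdint>
    #include "SIDefines.h" // Assumed include path.

    // Compose COMPUTE_PGM_RSRC1: round to nearest everywhere, flush single
    // precision denormals, keep double precision denormals.
    uint32_t buildRsrc1(unsigned NumVGPRs, unsigned NumSGPRs) {
      uint32_t FloatMode = FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
                           FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
                           FP_DENORM_MODE_SP(FP_DENORM_FLUSH_IN_FLUSH_OUT) |
                           FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE);
      return S_00B848_VGPRS((NumVGPRs - 1) / 4) | // Units of 4 (assumed).
             S_00B848_SGPRS((NumSGPRs - 1) / 8) | // Units of 8 (assumed).
             S_00B848_FLOAT_MODE(FloatMode);
    }

    // Reading a field back, or clearing it before rewriting:
    //   unsigned VGPRField = G_00B848_VGPRS(Rsrc1);
    //   Rsrc1 = (Rsrc1 & C_00B848_VGPRS) | S_00B848_VGPRS(NewVal);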
diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp
new file mode 100644
index 0000000..7d116ee
--- /dev/null
+++ b/lib/Target/R600/SIFixSGPRLiveRanges.cpp
@@ -0,0 +1,110 @@
+//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// SALU instructions ignore control flow, so we need to modify the live ranges
+/// of the registers they define.
+///
+/// The strategy is to view the entire program as if it were a single basic
+/// block and calculate the intervals accordingly. We implement this
+/// by walking the list of segments for each LiveRange and setting the
+/// end of each segment equal to the start of the segment that immediately
+/// follows it.
+
+#include "AMDGPU.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-sgpr-live-ranges"
+
+namespace {
+
+class SIFixSGPRLiveRanges : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIFixSGPRLiveRanges() : MachineFunctionPass(ID) {
+ initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) override;
+
+ virtual const char *getPassName() const override {
+ return "SI Fix SGPR live ranges";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE,
+ "SI Fix SGPR Live Ranges", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE,
+ "SI Fix SGPR Live Ranges", false, false)
+
+char SIFixSGPRLiveRanges::ID = 0;
+
+char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID;
+
+FunctionPass *llvm::createSIFixSGPRLiveRangesPass() {
+ return new SIFixSGPRLiveRanges();
+}
+
+bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
+ MF.getTarget().getRegisterInfo());
+ LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ MachineInstr &MI = *I;
+ MachineOperand *ExecUse = MI.findRegisterUseOperand(AMDGPU::EXEC);
+ if (ExecUse)
+ continue;
+
+ for (const MachineOperand &Def : MI.operands()) {
+      if (!Def.isReg() || !Def.isDef() ||
+          !TargetRegisterInfo::isVirtualRegister(Def.getReg()))
+ continue;
+
+ const TargetRegisterClass *RC = MRI.getRegClass(Def.getReg());
+
+ if (!TRI->isSGPRClass(RC))
+ continue;
+ LiveInterval &LI = LIS->getInterval(Def.getReg());
+ for (unsigned i = 0, e = LI.size() - 1; i != e; ++i) {
+ LiveRange::Segment &Seg = LI.segments[i];
+ LiveRange::Segment &Next = LI.segments[i + 1];
+ Seg.end = Next.start;
+ }
+ }
+ }
+ }
+
+ return false;
+}
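The segment walk at the end of runOnMachineFunction applies the single-basic-block view literally: every hole between consecutive segments of an SGPR interval is closed, so the value stays live across control flow that the SALU ignores. On plain data the transformation looks like this (a sketch over index pairs standing in for SlotIndexes):

    #include <utility>
    #include <vector>

    // Close holes between consecutive segments, as the pass does with
    // Seg.end = Next.start.
    static void flattenSegments(std::vector<std::pair<int, int>> &Segs) {
      for (size_t I = 0; I + 1 < Segs.size(); ++I)
        Segs[I].second = Segs[I + 1].first;
    }

    // {[4,10), [20,24), [30,31)} becomes {[4,20), [20,30), [30,31)}.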
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index c9e247c..b13c3b8 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -14,8 +14,8 @@
#include "SIISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
-#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"
+#include "llvm/ADT/SmallString.h"
using namespace llvm;
@@ -76,6 +77,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::ADD, MVT::i32, Legal);
setOperationAction(ISD::ADDC, MVT::i32, Legal);
setOperationAction(ISD::ADDE, MVT::i32, Legal);
+ setOperationAction(ISD::SUBC, MVT::i32, Legal);
+ setOperationAction(ISD::SUBE, MVT::i32, Legal);
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
@@ -88,14 +91,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
// We need to custom lower loads/stores from private memory
setOperationAction(ISD::LOAD, MVT::i32, Custom);
- setOperationAction(ISD::LOAD, MVT::i64, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
- setOperationAction(ISD::STORE, MVT::i64, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
@@ -105,18 +106,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
- setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
-
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
- setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
-
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
@@ -139,6 +136,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
@@ -215,9 +213,16 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FRINT, MVT::f64, Legal);
}
+  // FIXME: These should be removed and handled the same way as f32 fneg. Source
+ // modifiers also work for the double instructions.
+ setOperationAction(ISD::FNEG, MVT::f64, Expand);
+ setOperationAction(ISD::FABS, MVT::f64, Expand);
+
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
+
setSchedulingPreference(Sched::RegPressure);
}
@@ -265,8 +270,12 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
return VT.bitsGT(MVT::i32);
}
-bool SITargetLowering::shouldSplitVectorType(EVT VT) const {
- return VT.getScalarType().bitsLE(MVT::i16);
+TargetLoweringBase::LegalizeTypeAction
+SITargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
+ return TypeSplitVector;
+
+ return TargetLoweringBase::getPreferredVectorAction(VT);
}
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
@@ -482,19 +491,20 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MI->eraseFromParent();
break;
}
- case AMDGPU::V_SUB_F64:
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
- MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg())
- .addReg(MI->getOperand(2).getReg())
- .addImm(0) /* src2 */
- .addImm(0) /* ABS */
- .addImm(0) /* CLAMP */
- .addImm(0) /* OMOD */
- .addImm(2); /* NEG */
+ case AMDGPU::V_SUB_F64: {
+ unsigned DestReg = MI->getOperand(0).getReg();
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
+ .addImm(0) // SRC0 modifiers
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(1) // SRC1 modifiers
+ .addReg(MI->getOperand(2).getReg())
+ .addImm(0) // SRC2 modifiers
+ .addImm(0) // src2
+ .addImm(0) // CLAMP
+ .addImm(0); // OMOD
MI->eraseFromParent();
break;
-
+ }
case AMDGPU::SI_RegisterStorePseudo: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -595,27 +605,31 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+ EVT VT = Op.getValueType();
+
+ // These loads are legal.
+ if (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ VT.isVector() && VT.getVectorNumElements() == 2 &&
+ VT.getVectorElementType() == MVT::i32)
+ return SDValue();
+
if (Op.getValueType().isVector() &&
(Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
(Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
Op.getValueType().getVectorNumElements() > 4))) {
- SDValue MergedValues[2] = {
- SplitVectorLoad(Op, DAG),
- Load->getChain()
- };
- return DAG.getMergeValues(MergedValues, SDLoc(Op));
+ return SplitVectorLoad(Op, DAG);
} else {
- return LowerLOAD(Op, DAG);
+ SDValue Result = LowerLOAD(Op, DAG);
+ assert((!Result.getNode() ||
+ Result.getNode()->getNumValues() == 2) &&
+ "Load should return a value and a chain");
+ return Result;
}
}
case ISD::SELECT: return LowerSELECT(Op, DAG);
- case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
- case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
- case ISD::ANY_EXTEND: // Fall-through
- case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntrinsicID =
@@ -827,13 +841,9 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
- SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
- SDValue MergedValues[2];
- MergedValues[1] = Load->getChain();
- if (Ret.getNode()) {
- MergedValues[0] = Ret;
- return DAG.getMergeValues(MergedValues, DL);
- }
+ SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+ if (Lowered.getNode())
+ return Lowered;
if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
return SDValue();
@@ -846,25 +856,38 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
DAG.getConstant(2, MVT::i32));
- Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Load->getChain(), Ptr,
- DAG.getTargetConstant(0, MVT::i32),
- Op.getOperand(2));
+
+ // FIXME: REGISTER_LOAD should probably have a chain result.
+ SDValue Chain = Load->getChain();
+ SDValue LoLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, Ptr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
+
+ SDValue Ret = LoLoad.getValue(0);
if (MemVT.getSizeInBits() == 64) {
+ // TODO: This needs a test to make sure the right thing is happening with
+ // the chain. That is hard without general function support.
+
SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
DAG.getConstant(1, MVT::i32));
- SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Load->getChain(), IncPtr,
- DAG.getTargetConstant(0, MVT::i32),
- Op.getOperand(2));
+ SDValue HiLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, IncPtr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
- Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper);
+ Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoLoad, HiLoad);
+ // Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ // LoLoad.getValue(1), HiLoad.getValue(1));
}
- MergedValues[0] = Ret;
- return DAG.getMergeValues(MergedValues, DL);
+ SDValue Ops[] = {
+ Ret,
+ Chain
+ };
+ return DAG.getMergeValues(Ops, DL);
}
SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
@@ -903,39 +926,17 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
}
-SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- SDValue True = Op.getOperand(2);
- SDValue False = Op.getOperand(3);
- SDValue CC = Op.getOperand(4);
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
-
- SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
- return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
-}
-
-SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
-
- if (VT != MVT::i64) {
- return SDValue();
- }
-
- SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
- DAG.getConstant(31, MVT::i32));
-
- return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
-}
-
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
StoreSDNode *Store = cast<StoreSDNode>(Op);
EVT VT = Store->getMemoryVT();
+ // These stores are legal.
+ if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ VT.isVector() && VT.getVectorNumElements() == 2 &&
+ VT.getVectorElementType() == MVT::i32)
+ return SDValue();
+
SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
if (Ret.getNode())
return Ret;
@@ -1011,27 +1012,99 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return Chain;
}
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
+
+SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
+ DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ EVT ScalarVT = VT.getScalarType();
+ if (ScalarVT != MVT::f32)
+ return SDValue();
-SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
- if (VT != MVT::i64) {
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // TODO: We could try to match extracting the higher bytes, which would be
+ // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+ // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+ // about in practice.
+ if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+ if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
+ SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
+ DCI.AddToWorklist(Cvt.getNode());
+ return Cvt;
+ }
+ }
+
+ // We are primarily trying to catch operations on illegal vector types
+ // before they are expanded.
+ // For scalars, we can use the more flexible method of checking masked bits
+ // after legalization.
+ if (!DCI.isBeforeLegalize() ||
+ !SrcVT.isVector() ||
+ SrcVT.getVectorElementType() != MVT::i8) {
return SDValue();
}
- SDValue Src = Op.getOperand(0);
- if (Src.getValueType() != MVT::i32)
- Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
+ assert(DCI.isBeforeLegalize() && "Unexpected legal type");
- SDValue Zero = DAG.getConstant(0, MVT::i32);
- return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero);
-}
+ // Weird sized vectors are a pain to handle, but we know 3 is really the same
+ // size as 4.
+ unsigned NElts = SrcVT.getVectorNumElements();
+ if (!SrcVT.isSimple() && NElts != 3)
+ return SDValue();
-//===----------------------------------------------------------------------===//
-// Custom DAG optimizations
-//===----------------------------------------------------------------------===//
+ // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
+ // prevent a mess from expanding to v4i32 and repacking.
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+ EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
+ EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
+ EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
+
+ LoadSDNode *Load = cast<LoadSDNode>(Src);
+ SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
+ Load->getChain(),
+ Load->getBasePtr(),
+ LoadVT,
+ Load->getMemOperand());
+
+ // Make sure successors of the original load stay after it by updating
+ // them to use the new Chain.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
+
+ SmallVector<SDValue, 4> Elts;
+ if (RegVT.isVector())
+ DAG.ExtractVectorElements(NewLoad, Elts);
+ else
+ Elts.push_back(NewLoad);
+
+ SmallVector<SDValue, 4> Ops;
+
+ unsigned EltIdx = 0;
+ for (SDValue Elt : Elts) {
+ unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
+ for (unsigned I = 0; I < ComponentsInElt; ++I) {
+ unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
+ SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
+ DCI.AddToWorklist(Cvt.getNode());
+ Ops.push_back(Cvt);
+ }
+
+ ++EltIdx;
+ }
+
+ assert(Ops.size() == NElts);
+
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
+ }
+
+ return SDValue();
+}
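The combine above maps uint_to_fp onto the CVT_F32_UBYTEn nodes, whose scalar
semantics amount to "select byte n of a 32-bit value and convert it to float".
A minimal runnable sketch of that semantics and of the v4i8 case the load
rewrite targets (all names here are illustrative):

    #include <cstdint>
    #include <cstdio>

    // AMDGPUISD::CVT_F32_UBYTEn as a scalar function.
    static float cvtF32UByte(uint32_t Src, unsigned N) {
      return static_cast<float>((Src >> (8 * N)) & 0xffu);
    }

    int main() {
      // v4i8 <1, 2, 3, 4> viewed as the single i32 the combine loads.
      uint32_t Packed = 0x04030201;
      for (unsigned I = 0; I != 4; ++I)
        printf("elt %u -> %f\n", I, cvtF32UByte(Packed, I)); // 1, 2, 3, 4
      return 0;
    }

This is also why only byte Offset is demanded in the CVT_F32_UBYTEn case of
PerformDAGCombine below: bits outside that byte cannot affect the result.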
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -1074,6 +1147,31 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
+
+ case AMDGPUISD::CVT_F32_UBYTE0:
+ case AMDGPUISD::CVT_F32_UBYTE1:
+ case AMDGPUISD::CVT_F32_UBYTE2:
+ case AMDGPUISD::CVT_F32_UBYTE3: {
+ unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
+
+ SDValue Src = N->getOperand(0);
+ APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+ TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+
+ break;
+ }
+
+ case ISD::UINT_TO_FP: {
+ return performUCharToFloatCombine(N, DCI);
+ }
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -1297,7 +1395,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
bool HaveVSrc = false, HaveSSrc = false;
- // First figure out what we alread have in this instruction
+ // First figure out what we already have in this instruction.
for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
i != e && Op < NumOps; ++i, ++Op) {
@@ -1316,7 +1414,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
}
}
- // If we neither have VSrc nor SSrc it makes no sense to continue
+ // If we neither have VSrc nor SSrc, it makes no sense to continue.
if (!HaveVSrc && !HaveSSrc)
return Node;
@@ -1332,17 +1430,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
const SDValue &Operand = Node->getOperand(i);
Ops.push_back(Operand);
- // Already folded immediate ?
+ // Already folded immediate?
if (isa<ConstantSDNode>(Operand.getNode()) ||
isa<ConstantFPSDNode>(Operand.getNode()))
continue;
- // Is this a VSrc or SSrc operand ?
+ // Is this a VSrc or SSrc operand?
unsigned RegClass = Desc->OpInfo[Op].RegClass;
if (isVSrc(RegClass) || isSSrc(RegClass)) {
// Try to fold the immediates
if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
- // Folding didn't worked, make sure we don't hit the SReg limit
+ // Folding didn't work, make sure we don't hit the SReg limit.
ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
}
continue;
@@ -1371,7 +1469,6 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
continue;
if (DescE64) {
-
// Test if it makes sense to switch to e64 encoding
unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
@@ -1402,7 +1499,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
if (!DescE64)
continue;
Desc = DescE64;
- DescE64 = 0;
+ DescE64 = nullptr;
}
else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) {
Ops.pop_back();
@@ -1412,7 +1509,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
if (!DescE64)
continue;
Desc = DescE64;
- DescE64 = 0;
+ DescE64 = nullptr;
}
}
@@ -1535,7 +1632,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
}
}
-/// \brief Fold the instructions after slecting them
+/// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
const SIInstrInfo *TII =
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index c6eaa81..e25323a 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -27,10 +27,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
bool foldImm(SDValue &Operand, int32_t &Immediate,
@@ -46,11 +43,16 @@ class SITargetLowering : public AMDGPUTargetLowering {
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
+ static SDValue performUCharToFloatCombine(SDNode *N,
+ DAGCombinerInfo &DCI);
+
public:
SITargetLowering(TargetMachine &tm);
bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
bool *IsFast) const override;
- bool shouldSplitVectorType(EVT VT) const override;
+
+ TargetLoweringBase::LegalizeTypeAction
+ getPreferredVectorAction(EVT VT) const override;
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index a17fed7..1733326 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -341,6 +341,8 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
return Result;
}
+// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
+// around other non-memory instructions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
bool Changes = false;
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 168eff2..7cae9fc 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -51,6 +51,16 @@ class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> :
let Size = 8;
}
+class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
+ Enc64 <outs, ins, asm, pattern> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let VOP3 = 1;
+}
+
//===----------------------------------------------------------------------===//
// Scalar operations
//===----------------------------------------------------------------------===//
@@ -207,7 +217,7 @@ class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
}
class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
- Enc64 <outs, ins, asm, pattern> {
+ VOP3Common <outs, ins, asm, pattern> {
bits<8> dst;
bits<2> src0_modifiers;
@@ -233,16 +243,11 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{61} = src0_modifiers{0};
let Inst{62} = src1_modifiers{0};
let Inst{63} = src2_modifiers{0};
-
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let VOP3 = 1;
+
}
class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
- Enc64 <outs, ins, asm, pattern> {
+ VOP3Common <outs, ins, asm, pattern> {
bits<8> dst;
bits<2> src0_modifiers;
@@ -266,11 +271,6 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{62} = src1_modifiers{0};
let Inst{63} = src2_modifiers{0};
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let VOP3 = 1;
}
class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 4a9e346..455c890 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -19,13 +19,14 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
#include "llvm/MC/MCInstrDesc.h"
using namespace llvm;
-SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
- : AMDGPUInstrInfo(tm),
- RI(tm) { }
+SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
+ : AMDGPUInstrInfo(st),
+ RI(st) { }
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -187,18 +188,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ MachineFunction *MF = MBB.getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned KillFlag = isKill ? RegState::Kill : 0;
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
- unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent());
+ if (RI.hasVGPRs(RC)) {
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!");
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0)
+ .addReg(SrcReg);
+ } else if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
+ unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MF);
+ unsigned TgtReg = MFI->SpillTracker.LaneVGPR;
- BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), MFI->SpillTracker.LaneVGPR)
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), TgtReg)
.addReg(SrcReg, KillFlag)
.addImm(Lane);
- MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane);
+ MFI->SpillTracker.addSpilledReg(FrameIndex, TgtReg, Lane);
} else if (RI.isSGPRClass(RC)) {
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for vector
@@ -207,8 +215,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// Reserve a spot in the spill tracker for each sub-register of
// the vector register.
unsigned NumSubRegs = RC->getSize() / 4;
- unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent(),
- NumSubRegs);
+ unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MF, NumSubRegs);
MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR,
FirstLane);
@@ -234,19 +241,19 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ MachineFunction *MF = MBB.getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
DebugLoc DL = MBB.findDebugLoc(MI);
- if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) {
- SIMachineFunctionInfo::SpilledReg Spill =
- MFI->SpillTracker.getSpilledReg(FrameIndex);
- assert(Spill.VGPR);
- BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg)
- .addReg(Spill.VGPR)
- .addImm(Spill.Lane);
- insertNOPs(MI, 3);
+
+ if (RI.hasVGPRs(RC)) {
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Can't retrieve spilled VGPR!");
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+ .addImm(0);
} else if (RI.isSGPRClass(RC)){
unsigned Opcode;
switch(RC->getSize() * 8) {
+ case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break;
case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
@@ -260,7 +267,6 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addReg(Spill.VGPR)
.addImm(FrameIndex);
- insertNOPs(MI, 3);
} else {
llvm_unreachable("VGPR spilling not supported");
}
@@ -281,6 +287,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S64_RESTORE:
return 2;
+ case AMDGPU::SI_SPILL_S32_RESTORE:
+ return 1;
default: llvm_unreachable("Invalid spill opcode");
}
}
@@ -334,7 +342,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
- case AMDGPU::SI_SPILL_S64_RESTORE: {
+ case AMDGPU::SI_SPILL_S64_RESTORE:
+ case AMDGPU::SI_SPILL_S32_RESTORE: {
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
@@ -348,6 +357,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
.addReg(MI->getOperand(1).getReg())
.addImm(Spill.Lane + i);
}
+ insertNOPs(MI, 3);
MI->eraseFromParent();
break;
}
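A standalone model of the SGPR spill scheme the changes above build on: each
spilled SGPR occupies one lane of a dedicated VGPR, written by V_WRITELANE_B32
and read back by V_READLANE_B32 at Spill.Lane + i. The struct and helpers are
invented for illustration:

    #include <cstdint>

    // One spill VGPR holds up to 64 spilled 32-bit scalars, one per lane.
    struct SpillVGPR {
      uint32_t Lane[64];
    };

    // V_WRITELANE_B32: store a scalar into the given lane.
    static void writeLane(SpillVGPR &V, uint32_t SVal, unsigned LaneIdx) {
      V.Lane[LaneIdx] = SVal;
    }

    // V_READLANE_B32: read the scalar back from the given lane.
    static uint32_t readLane(const SpillVGPR &V, unsigned LaneIdx) {
      return V.Lane[LaneIdx];
    }

A wide SGPR spill simply reserves NumSubRegs consecutive lanes starting at
FirstLane, one per 32-bit sub-register.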
@@ -514,6 +524,23 @@ bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const {
return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
}
+static bool compareMachineOp(const MachineOperand &Op0,
+ const MachineOperand &Op1) {
+ if (Op0.getType() != Op1.getType())
+ return false;
+
+ switch (Op0.getType()) {
+ case MachineOperand::MO_Register:
+ return Op0.getReg() == Op1.getReg();
+ case MachineOperand::MO_Immediate:
+ return Op0.getImm() == Op1.getImm();
+ case MachineOperand::MO_FPImmediate:
+ return Op0.getFPImm() == Op1.getFPImm();
+ default:
+ llvm_unreachable("Didn't expect to be comparing these operand types");
+ }
+}
+
bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI->getOpcode();
@@ -532,7 +559,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Make sure the register classes are correct
for (unsigned i = 0, e = Desc.getNumOperands(); i != e; ++i) {
switch (Desc.OpInfo[i].OperandType) {
- case MCOI::OPERAND_REGISTER:
+ case MCOI::OPERAND_REGISTER: {
+ int RegClass = Desc.OpInfo[i].RegClass;
+ if (!RI.regClassCanUseImmediate(RegClass) &&
+ (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm())) {
+ ErrInfo = "Expected register, but got immediate";
+ return false;
+ }
+ }
break;
case MCOI::OPERAND_IMMEDIATE:
if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) {
@@ -620,6 +654,24 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
return false;
}
}
+
+ // Verify misc. restrictions on specific instructions.
+ if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
+ Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
+
+ const MachineOperand &Src0 = MI->getOperand(2);
+ const MachineOperand &Src1 = MI->getOperand(3);
+ const MachineOperand &Src2 = MI->getOperand(4);
+ if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
+ if (!compareMachineOp(Src0, Src1) &&
+ !compareMachineOp(Src0, Src2)) {
+ ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
+ return false;
+ }
+ }
+ }
+
return true;
}
@@ -654,7 +706,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
+ case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
+ case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
@@ -667,6 +721,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+ case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32;
+ case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
+ case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
}
}
@@ -731,8 +788,8 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
unsigned SubReg = MRI.createVirtualRegister(SubRC);
// Just in case the super register is itself a sub-register, copy it to a new
- // value so we don't need to wory about merging its subreg index with the
- // SubIdx passed to this function. The register coalescer should be able to
+ // value so we don't need to worry about merging its subreg index with the
+ // SubIdx passed to this function. The register coalescer should be able to
// eliminate this extra copy.
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
NewSuperReg)
@@ -1157,22 +1214,27 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
continue;
}
case AMDGPU::S_AND_B64:
- splitScalar64BitOp(Worklist, Inst, AMDGPU::S_AND_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32);
Inst->eraseFromParent();
continue;
case AMDGPU::S_OR_B64:
- splitScalar64BitOp(Worklist, Inst, AMDGPU::S_OR_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32);
Inst->eraseFromParent();
continue;
case AMDGPU::S_XOR_B64:
- splitScalar64BitOp(Worklist, Inst, AMDGPU::S_XOR_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32);
Inst->eraseFromParent();
continue;
case AMDGPU::S_NOT_B64:
- splitScalar64BitOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+ Inst->eraseFromParent();
+ continue;
+
+ case AMDGPU::S_BCNT1_I32_B64:
+ splitScalar64BitBCNT(Worklist, Inst);
Inst->eraseFromParent();
continue;
@@ -1217,6 +1279,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// 3 to not hit an assertion later in MCInstLower.
Inst->addOperand(MachineOperand::CreateImm(0));
Inst->addOperand(MachineOperand::CreateImm(0));
+ } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+ // The VALU version adds the second operand to the result, so insert an
+ // extra 0 operand.
+ Inst->addOperand(MachineOperand::CreateImm(0));
}
addDescImplicitUseDef(NewDesc, Inst);
@@ -1297,9 +1363,62 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
return &AMDGPU::VReg_32RegClass;
}
-void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst,
- unsigned Opcode) const {
+void SIInstrInfo::splitScalar64BitUnaryOp(
+ SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst->getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineOperand &Dest = Inst->getOperand(0);
+ MachineOperand &Src0 = Inst->getOperand(1);
+ DebugLoc DL = Inst->getDebugLoc();
+
+ MachineBasicBlock::iterator MII = Inst;
+
+ const MCInstrDesc &InstDesc = get(Opcode);
+ const TargetRegisterClass *Src0RC = Src0.isReg() ?
+ MRI.getRegClass(Src0.getReg()) :
+ &AMDGPU::SGPR_32RegClass;
+
+ const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+
+ MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub0, Src0SubRC);
+
+ const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+ const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+
+ unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
+ MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+ .addOperand(SrcReg0Sub0);
+
+ MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub1, Src0SubRC);
+
+ unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
+ MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+ .addOperand(SrcReg0Sub1);
+
+ unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+ // Try to legalize the operands in case we need to swap the order to keep it
+ // valid.
+ Worklist.push_back(LoHalf);
+ Worklist.push_back(HiHalf);
+}
+
+void SIInstrInfo::splitScalar64BitBinaryOp(
+ SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst,
+ unsigned Opcode) const {
MachineBasicBlock &MBB = *Inst->getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -1360,6 +1479,46 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
Worklist.push_back(HiHalf);
}
+void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst) const {
+ MachineBasicBlock &MBB = *Inst->getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineBasicBlock::iterator MII = Inst;
+ DebugLoc DL = Inst->getDebugLoc();
+
+ MachineOperand &Dest = Inst->getOperand(0);
+ MachineOperand &Src = Inst->getOperand(1);
+
+ const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32);
+ const TargetRegisterClass *SrcRC = Src.isReg() ?
+ MRI.getRegClass(Src.getReg()) :
+ &AMDGPU::SGPR_32RegClass;
+
+ unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
+
+ MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+ AMDGPU::sub0, SrcSubRC);
+ MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+ AMDGPU::sub1, SrcSubRC);
+
+ MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
+ .addOperand(SrcRegSub0)
+ .addImm(0);
+
+ MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
+ .addOperand(SrcRegSub1)
+ .addReg(MidReg);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+
+ Worklist.push_back(First);
+ Worklist.push_back(Second);
+}
+
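splitScalar64BitBCNT relies on V_BCNT_U32_B32 computing popcount(src0) plus
src1 (the comment in moveToVALU notes the VALU version adds the second operand
to the result). A runnable C++20 sketch of the expansion, with illustrative
names:

    #include <bit>
    #include <cstdint>

    // V_BCNT_U32_B32: population count of src0, plus src1.
    static uint32_t vBcntU32B32(uint32_t Src0, uint32_t Src1) {
      return static_cast<uint32_t>(std::popcount(Src0)) + Src1;
    }

    // The expansion above: count sub0 into MidReg with a zero addend,
    // then count sub1 and accumulate MidReg into ResultReg.
    static uint32_t bcnt1B64(uint64_t Src) {
      uint32_t Mid = vBcntU32B32(static_cast<uint32_t>(Src), 0);
      return vBcntU32B32(static_cast<uint32_t>(Src >> 32), Mid);
    }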
void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
MachineInstr *Inst) const {
  // Add the implicit and explicit register definitions.
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 7b31a81..4c204d8 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -44,13 +44,19 @@ private:
const TargetRegisterClass *RC,
const MachineOperand &Op) const;
- void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist,
- MachineInstr *Inst, unsigned Opcode) const;
+ void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst, unsigned Opcode) const;
+
+ void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst, unsigned Opcode) const;
+
+ void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst) const;
void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
public:
- explicit SIInstrInfo(AMDGPUTargetMachine &tm);
+ explicit SIInstrInfo(const AMDGPUSubtarget &st);
const SIRegisterInfo &getRegisterInfo() const override {
return RI;
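The split helpers declared above share one shape: extract sub0 and sub1, apply
the 32-bit opcode to each half, then rebuild the 64-bit result with
REG_SEQUENCE. A standalone model for the S_NOT_B64 (unary) and S_AND_B64
(binary) cases, with invented helper names:

    #include <cstdint>

    // Unary split: S_NOT_B64 -> a 32-bit NOT per half.
    static uint64_t splitNot64(uint64_t Src) {
      uint32_t Lo = ~static_cast<uint32_t>(Src);       // sub0
      uint32_t Hi = ~static_cast<uint32_t>(Src >> 32); // sub1
      return (uint64_t(Hi) << 32) | Lo;                // REG_SEQUENCE
    }

    // Binary split: S_AND_B64 -> a 32-bit AND per matching half pair.
    static uint64_t splitAnd64(uint64_t A, uint64_t B) {
      uint32_t Lo = static_cast<uint32_t>(A) & static_cast<uint32_t>(B);
      uint32_t Hi = static_cast<uint32_t>(A >> 32) &
                    static_cast<uint32_t>(B >> 32);
      return (uint64_t(Hi) << 32) | Lo;
    }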
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 2242e6d..774c9d1 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -147,6 +147,12 @@ def FRAMEri32 : Operand<iPTR> {
}
//===----------------------------------------------------------------------===//
+// Complex patterns
+//===----------------------------------------------------------------------===//
+
+def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
+
+//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
@@ -187,6 +193,12 @@ class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
opName#" $dst, $src0", pattern
>;
+// 64-bit input, 32-bit output.
+class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
+ op, (outs SReg_32:$dst), (ins SSrc_64:$src0),
+ opName#" $dst, $src0", pattern
+>;
+
class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
opName#" $dst, $src0, $src1", pattern
@@ -260,7 +272,7 @@ class SIMCInstr <string pseudo, int subtarget> {
multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern,
string opName> {
- def "" : InstSI <outs, ins, "", pattern>, VOP <opName>,
+ def "" : VOP3Common <outs, ins, "", pattern>, VOP <opName>,
           SIMCInstr<opName, SISubtarget.NONE> {
let isPseudo = 1;
}
@@ -357,12 +369,13 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
}
multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
- string opName, ValueType vt, PatLeaf cond> {
-
+ string opName, ValueType vt, PatLeaf cond, bit defExec = 0> {
def _e32 : VOPC <
op, (ins arc:$src0, vrc:$src1),
opName#"_e32 $dst, $src0, $src1", []
- >, VOP <opName>;
+ >, VOP <opName> {
+ let Defs = !if(defExec, [VCC, EXEC], [VCC]);
+ }
def _e64 : VOP3 <
{0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
@@ -375,6 +388,7 @@ multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
[(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
)
>, VOP <opName> {
+ let Defs = !if(defExec, [EXEC], []);
let src2 = SIOperand.ZERO;
let src2_modifiers = 0;
}
@@ -388,6 +402,14 @@ multiclass VOPC_64 <bits<8> op, string opName,
ValueType vt = untyped, PatLeaf cond = COND_NULL>
: VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>;
+multiclass VOPCX_32 <bits<8> op, string opName,
+ ValueType vt = untyped, PatLeaf cond = COND_NULL>
+ : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond, 1>;
+
+multiclass VOPCX_64 <bits<8> op, string opName,
+ ValueType vt = untyped, PatLeaf cond = COND_NULL>
+ : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond, 1>;
+
multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m <
op, (outs VReg_32:$dst),
(ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers,
@@ -396,7 +418,7 @@ multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m <
opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName
>;
-class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
+class VOP3_64_32 <bits <9> op, string opName, list<dag> pattern> : VOP3 <
op, (outs VReg_64:$dst),
(ins VSrc_64:$src0, VSrc_32:$src1),
opName#" $dst, $src0, $src1", pattern
@@ -410,11 +432,29 @@ class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
op, (outs VReg_64:$dst),
- (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2,
+ (ins InputMods:$src0_modifiers, VSrc_64:$src0,
+ InputMods:$src1_modifiers, VSrc_64:$src1,
+ InputMods:$src2_modifiers, VSrc_64:$src2,
+ InstFlag:$clamp, InstFlag:$omod),
+ opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern
+>, VOP <opName>;
+
+
+class VOP3b_Helper <bits<9> op, RegisterClass vrc, RegisterClass arc,
+ string opName, list<dag> pattern> : VOP3 <
+ op, (outs vrc:$dst0, SReg_64:$dst1),
+ (ins arc:$src0, arc:$src1, arc:$src2,
InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
- opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
+ opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
>, VOP <opName>;
+
+class VOP3b_64 <bits<9> op, string opName, list<dag> pattern> :
+ VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
+
+class VOP3b_32 <bits<9> op, string opName, list<dag> pattern> :
+ VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>;
+
//===----------------------------------------------------------------------===//
// Vector I/O classes
//===----------------------------------------------------------------------===//
@@ -475,10 +515,11 @@ class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A
let vdst = 0;
}
+// 1 address, 1 data.
class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
op,
(outs rc:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, u16imm:$offset),
+ (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
asm#" $vdst, $addr, $data0, $offset, [M0]",
[]> {
@@ -487,6 +528,41 @@ class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
let mayLoad = 1;
}
+// 1 address, 2 data.
+class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+ op,
+ (outs rc:$vdst),
+ (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
+ asm#" $vdst, $addr, $data0, $data1, $offset, [M0]",
+ []> {
+ let mayStore = 1;
+ let mayLoad = 1;
+}
+
+// 1 address, 2 data.
+class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+ op,
+ (outs),
+ (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
+ asm#" $addr, $data0, $data1, $offset, [M0]",
+ []> {
+ let mayStore = 1;
+ let mayLoad = 1;
+}
+
+// 1 address, 1 data.
+class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+ op,
+ (outs),
+ (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
+ asm#" $addr, $data0, $offset, [M0]",
+ []> {
+
+ let data1 = 0;
+ let mayStore = 1;
+ let mayLoad = 1;
+}
+
class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
op,
(outs),
@@ -500,7 +576,9 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
let mayLoad = 0;
}
-multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
+multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
+ ValueType load_vt = i32,
+ SDPatternOperator ld = null_frag> {
let lds = 0, mayLoad = 1 in {
@@ -542,16 +620,19 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in {
def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
(ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
- asm#" $vdata, $srsrc + $vaddr + $offset", []>;
+ asm#" $vdata, $srsrc + $vaddr + $offset",
+ [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
+ i64:$vaddr, u16imm:$offset)))]>;
}
}
}
-class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
+class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
+ ValueType store_vt, SDPatternOperator st> :
MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
u16imm:$offset),
name#" $vdata, $srsrc + $vaddr + $offset",
- []> {
+ [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
let mayLoad = 0;
let mayStore = 1;
@@ -658,6 +739,53 @@ multiclass MIMG_Sampler <bits<7> op, string asm> {
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>;
}
+class MIMG_Gather_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ RegisterClass src_rc> : MIMG <
+ op,
+ (outs dst_rc:$vdata),
+ (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
+ SReg_256:$srsrc, SReg_128:$ssamp),
+ asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
+ #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
+ []> {
+ let mayLoad = 1;
+ let mayStore = 0;
+
+ // DMASK was repurposed for GATHER4. 4 components are always
+ // returned and DMASK works like a swizzle - it selects
+ // the component to fetch. The only useful DMASK values are
+ // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+ // (red,red,red,red) etc.) The ISA document doesn't mention
+ // this.
+ // Therefore, disable all code which updates DMASK by setting these two:
+ let MIMG = 0;
+ let hasPostISelHook = 0;
+}
+
+multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ int channels> {
+ def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_32>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128>,
+ MIMG_Mask<asm#"_V4", channels>;
+ def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256>,
+ MIMG_Mask<asm#"_V8", channels>;
+ def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512>,
+ MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Gather <bits<7> op, string asm> {
+ defm _V1 : MIMG_Gather_Src_Helper<op, asm, VReg_32, 1>;
+ defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>;
+ defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>;
+ defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>;
+}
+
//===----------------------------------------------------------------------===//
// Vector instruction mappings
//===----------------------------------------------------------------------===//
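The VOPCX_* multiclasses above use Defs = !if(defExec, [VCC, EXEC], [VCC]), so
the V_CMPX compare family is modeled as clobbering EXEC in addition to VCC. A
per-wavefront C++ sketch of what that means (64-lane wave; the struct and
function names are invented):

    #include <cstdint>

    struct WaveState {
      uint64_t VCC;
      uint64_t EXEC;
    };

    // V_CMP writes the per-lane result to VCC; the V_CMPX forms also write
    // it to EXEC, disabling every lane whose compare failed.
    template <typename T, typename Cmp>
    static void vCmp(WaveState &W, const T (&Src0)[64], const T (&Src1)[64],
                     Cmp C, bool DefExec) {
      uint64_t Result = 0;
      for (unsigned Lane = 0; Lane != 64; ++Lane)
        if (C(Src0[Lane], Src1[Lane]))
          Result |= uint64_t(1) << Lane;
      W.VCC = Result;
      if (DefExec) // the VOPCX variants
        W.EXEC = Result;
    }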
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 500fa78..b3b44e2 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -96,22 +96,35 @@ def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32",
[(set i32:$dst, (not i32:$src0))]
>;
-def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>;
+def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64",
+ [(set i64:$dst, (not i64:$src0))]
+>;
def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
-def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>;
+def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32",
+ [(set i32:$dst, (AMDGPUbrev i32:$src0))]
+>;
def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
} // End neverHasSideEffects = 1
////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
-////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>;
-////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>;
-////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>;
+def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32",
+ [(set i32:$dst, (ctpop i32:$src0))]
+>;
+def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>;
+
+////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "S_FF0_I32_B32", []>;
////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
-////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>;
+def S_FF1_I32_B32 : SOP1_32 <0x00000013, "S_FF1_I32_B32",
+ [(set i32:$dst, (cttz_zero_undef i32:$src0))]
+>;
////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
-//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>;
+
+def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32",
+ [(set i32:$dst, (ctlz_zero_undef i32:$src0))]
+>;
+
//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
@@ -320,7 +333,7 @@ def S_CMPK_EQ_I32 : SOPK <
>;
*/
-let isCompare = 1 in {
+let isCompare = 1, Defs = [SCC] in {
def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
@@ -332,7 +345,7 @@ def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
-} // End isCompare = 1
+} // End isCompare = 1, Defs = [SCC]
let Defs = [SCC], isCommutable = 1 in {
def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
@@ -467,26 +480,26 @@ defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>;
defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">;
defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32">;
-defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32">;
-defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32">;
-defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32">;
-defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32">;
-defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32">;
-defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32">;
-defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32">;
-defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32">;
-defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32">;
-defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32">;
-defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32">;
-defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32">;
-defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32">;
-defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32">;
-defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32">;
+defm V_CMPX_F_F32 : VOPCX_32 <0x00000010, "V_CMPX_F_F32">;
+defm V_CMPX_LT_F32 : VOPCX_32 <0x00000011, "V_CMPX_LT_F32">;
+defm V_CMPX_EQ_F32 : VOPCX_32 <0x00000012, "V_CMPX_EQ_F32">;
+defm V_CMPX_LE_F32 : VOPCX_32 <0x00000013, "V_CMPX_LE_F32">;
+defm V_CMPX_GT_F32 : VOPCX_32 <0x00000014, "V_CMPX_GT_F32">;
+defm V_CMPX_LG_F32 : VOPCX_32 <0x00000015, "V_CMPX_LG_F32">;
+defm V_CMPX_GE_F32 : VOPCX_32 <0x00000016, "V_CMPX_GE_F32">;
+defm V_CMPX_O_F32 : VOPCX_32 <0x00000017, "V_CMPX_O_F32">;
+defm V_CMPX_U_F32 : VOPCX_32 <0x00000018, "V_CMPX_U_F32">;
+defm V_CMPX_NGE_F32 : VOPCX_32 <0x00000019, "V_CMPX_NGE_F32">;
+defm V_CMPX_NLG_F32 : VOPCX_32 <0x0000001a, "V_CMPX_NLG_F32">;
+defm V_CMPX_NGT_F32 : VOPCX_32 <0x0000001b, "V_CMPX_NGT_F32">;
+defm V_CMPX_NLE_F32 : VOPCX_32 <0x0000001c, "V_CMPX_NLE_F32">;
+defm V_CMPX_NEQ_F32 : VOPCX_32 <0x0000001d, "V_CMPX_NEQ_F32">;
+defm V_CMPX_NLT_F32 : VOPCX_32 <0x0000001e, "V_CMPX_NLT_F32">;
+defm V_CMPX_TRU_F32 : VOPCX_32 <0x0000001f, "V_CMPX_TRU_F32">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">;
defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_OLT>;
@@ -505,26 +518,26 @@ defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>;
defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">;
defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64">;
-defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64">;
-defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64">;
-defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64">;
-defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64">;
-defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64">;
-defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64">;
-defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64">;
-defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64">;
-defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64">;
-defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64">;
-defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64">;
-defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64">;
-defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64">;
-defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64">;
-defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64">;
+defm V_CMPX_F_F64 : VOPCX_64 <0x00000030, "V_CMPX_F_F64">;
+defm V_CMPX_LT_F64 : VOPCX_64 <0x00000031, "V_CMPX_LT_F64">;
+defm V_CMPX_EQ_F64 : VOPCX_64 <0x00000032, "V_CMPX_EQ_F64">;
+defm V_CMPX_LE_F64 : VOPCX_64 <0x00000033, "V_CMPX_LE_F64">;
+defm V_CMPX_GT_F64 : VOPCX_64 <0x00000034, "V_CMPX_GT_F64">;
+defm V_CMPX_LG_F64 : VOPCX_64 <0x00000035, "V_CMPX_LG_F64">;
+defm V_CMPX_GE_F64 : VOPCX_64 <0x00000036, "V_CMPX_GE_F64">;
+defm V_CMPX_O_F64 : VOPCX_64 <0x00000037, "V_CMPX_O_F64">;
+defm V_CMPX_U_F64 : VOPCX_64 <0x00000038, "V_CMPX_U_F64">;
+defm V_CMPX_NGE_F64 : VOPCX_64 <0x00000039, "V_CMPX_NGE_F64">;
+defm V_CMPX_NLG_F64 : VOPCX_64 <0x0000003a, "V_CMPX_NLG_F64">;
+defm V_CMPX_NGT_F64 : VOPCX_64 <0x0000003b, "V_CMPX_NGT_F64">;
+defm V_CMPX_NLE_F64 : VOPCX_64 <0x0000003c, "V_CMPX_NLE_F64">;
+defm V_CMPX_NEQ_F64 : VOPCX_64 <0x0000003d, "V_CMPX_NEQ_F64">;
+defm V_CMPX_NLT_F64 : VOPCX_64 <0x0000003e, "V_CMPX_NLT_F64">;
+defm V_CMPX_TRU_F64 : VOPCX_64 <0x0000003f, "V_CMPX_TRU_F64">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">;
defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">;
@@ -543,26 +556,26 @@ defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">;
defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">;
defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32">;
-defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32">;
-defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32">;
-defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32">;
-defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32">;
-defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32">;
-defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32">;
-defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32">;
-defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32">;
-defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32">;
-defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32">;
-defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32">;
-defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32">;
-defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32">;
-defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32">;
-defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32">;
+defm V_CMPSX_F_F32 : VOPCX_32 <0x00000050, "V_CMPSX_F_F32">;
+defm V_CMPSX_LT_F32 : VOPCX_32 <0x00000051, "V_CMPSX_LT_F32">;
+defm V_CMPSX_EQ_F32 : VOPCX_32 <0x00000052, "V_CMPSX_EQ_F32">;
+defm V_CMPSX_LE_F32 : VOPCX_32 <0x00000053, "V_CMPSX_LE_F32">;
+defm V_CMPSX_GT_F32 : VOPCX_32 <0x00000054, "V_CMPSX_GT_F32">;
+defm V_CMPSX_LG_F32 : VOPCX_32 <0x00000055, "V_CMPSX_LG_F32">;
+defm V_CMPSX_GE_F32 : VOPCX_32 <0x00000056, "V_CMPSX_GE_F32">;
+defm V_CMPSX_O_F32 : VOPCX_32 <0x00000057, "V_CMPSX_O_F32">;
+defm V_CMPSX_U_F32 : VOPCX_32 <0x00000058, "V_CMPSX_U_F32">;
+defm V_CMPSX_NGE_F32 : VOPCX_32 <0x00000059, "V_CMPSX_NGE_F32">;
+defm V_CMPSX_NLG_F32 : VOPCX_32 <0x0000005a, "V_CMPSX_NLG_F32">;
+defm V_CMPSX_NGT_F32 : VOPCX_32 <0x0000005b, "V_CMPSX_NGT_F32">;
+defm V_CMPSX_NLE_F32 : VOPCX_32 <0x0000005c, "V_CMPSX_NLE_F32">;
+defm V_CMPSX_NEQ_F32 : VOPCX_32 <0x0000005d, "V_CMPSX_NEQ_F32">;
+defm V_CMPSX_NLT_F32 : VOPCX_32 <0x0000005e, "V_CMPSX_NLT_F32">;
+defm V_CMPSX_TRU_F32 : VOPCX_32 <0x0000005f, "V_CMPSX_TRU_F32">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">;
defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">;
@@ -611,18 +624,18 @@ defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>;
defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>;
defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32">;
-defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32">;
-defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32">;
-defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32">;
-defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32">;
-defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32">;
-defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32">;
-defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32">;
+defm V_CMPX_F_I32 : VOPCX_32 <0x00000090, "V_CMPX_F_I32">;
+defm V_CMPX_LT_I32 : VOPCX_32 <0x00000091, "V_CMPX_LT_I32">;
+defm V_CMPX_EQ_I32 : VOPCX_32 <0x00000092, "V_CMPX_EQ_I32">;
+defm V_CMPX_LE_I32 : VOPCX_32 <0x00000093, "V_CMPX_LE_I32">;
+defm V_CMPX_GT_I32 : VOPCX_32 <0x00000094, "V_CMPX_GT_I32">;
+defm V_CMPX_NE_I32 : VOPCX_32 <0x00000095, "V_CMPX_NE_I32">;
+defm V_CMPX_GE_I32 : VOPCX_32 <0x00000096, "V_CMPX_GE_I32">;
+defm V_CMPX_T_I32 : VOPCX_32 <0x00000097, "V_CMPX_T_I32">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">;
defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>;
@@ -633,18 +646,18 @@ defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>;
defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>;
defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64">;
-defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64">;
-defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64">;
-defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64">;
-defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64">;
-defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64">;
-defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64">;
-defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64">;
+defm V_CMPX_F_I64 : VOPCX_64 <0x000000b0, "V_CMPX_F_I64">;
+defm V_CMPX_LT_I64 : VOPCX_64 <0x000000b1, "V_CMPX_LT_I64">;
+defm V_CMPX_EQ_I64 : VOPCX_64 <0x000000b2, "V_CMPX_EQ_I64">;
+defm V_CMPX_LE_I64 : VOPCX_64 <0x000000b3, "V_CMPX_LE_I64">;
+defm V_CMPX_GT_I64 : VOPCX_64 <0x000000b4, "V_CMPX_GT_I64">;
+defm V_CMPX_NE_I64 : VOPCX_64 <0x000000b5, "V_CMPX_NE_I64">;
+defm V_CMPX_GE_I64 : VOPCX_64 <0x000000b6, "V_CMPX_GE_I64">;
+defm V_CMPX_T_I64 : VOPCX_64 <0x000000b7, "V_CMPX_T_I64">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">;
defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>;
@@ -655,18 +668,18 @@ defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>;
defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>;
defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32">;
-defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32">;
-defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32">;
-defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32">;
-defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32">;
-defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32">;
-defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32">;
-defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32">;
+defm V_CMPX_F_U32 : VOPCX_32 <0x000000d0, "V_CMPX_F_U32">;
+defm V_CMPX_LT_U32 : VOPCX_32 <0x000000d1, "V_CMPX_LT_U32">;
+defm V_CMPX_EQ_U32 : VOPCX_32 <0x000000d2, "V_CMPX_EQ_U32">;
+defm V_CMPX_LE_U32 : VOPCX_32 <0x000000d3, "V_CMPX_LE_U32">;
+defm V_CMPX_GT_U32 : VOPCX_32 <0x000000d4, "V_CMPX_GT_U32">;
+defm V_CMPX_NE_U32 : VOPCX_32 <0x000000d5, "V_CMPX_NE_U32">;
+defm V_CMPX_GE_U32 : VOPCX_32 <0x000000d6, "V_CMPX_GE_U32">;
+defm V_CMPX_T_U32 : VOPCX_32 <0x000000d7, "V_CMPX_T_U32">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">;
defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>;
@@ -677,30 +690,30 @@ defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>;
defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>;
defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64">;
-defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64">;
-defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64">;
-defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64">;
-defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64">;
-defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64">;
-defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64">;
-defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64">;
+defm V_CMPX_F_U64 : VOPCX_64 <0x000000f0, "V_CMPX_F_U64">;
+defm V_CMPX_LT_U64 : VOPCX_64 <0x000000f1, "V_CMPX_LT_U64">;
+defm V_CMPX_EQ_U64 : VOPCX_64 <0x000000f2, "V_CMPX_EQ_U64">;
+defm V_CMPX_LE_U64 : VOPCX_64 <0x000000f3, "V_CMPX_LE_U64">;
+defm V_CMPX_GT_U64 : VOPCX_64 <0x000000f4, "V_CMPX_GT_U64">;
+defm V_CMPX_NE_U64 : VOPCX_64 <0x000000f5, "V_CMPX_NE_U64">;
+defm V_CMPX_GE_U64 : VOPCX_64 <0x000000f6, "V_CMPX_GE_U64">;
+defm V_CMPX_T_U64 : VOPCX_64 <0x000000f7, "V_CMPX_T_U64">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">;
-let hasSideEffects = 1, Defs = [EXEC] in {
-defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+let hasSideEffects = 1 in {
+defm V_CMPX_CLASS_F32 : VOPCX_32 <0x00000098, "V_CMPX_CLASS_F32">;
+} // End hasSideEffects = 1
defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">;
-let hasSideEffects = 1, Defs = [EXEC] in {
-defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+let hasSideEffects = 1 in {
+defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">;
+} // End hasSideEffects = 1
} // End isCompare = 1
@@ -708,8 +721,97 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
// DS Instructions
//===----------------------------------------------------------------------===//
-def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>;
-def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>;
+
+def DS_ADD_U32 : DS_1A1D_NORET <0x0, "DS_ADD_U32", VReg_32>;
+def DS_SUB_U32 : DS_1A1D_NORET <0x1, "DS_SUB_U32", VReg_32>;
+def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "DS_RSUB_U32", VReg_32>;
+def DS_INC_U32 : DS_1A1D_NORET <0x3, "DS_INC_U32", VReg_32>;
+def DS_DEC_U32 : DS_1A1D_NORET <0x4, "DS_DEC_U32", VReg_32>;
+def DS_MIN_I32 : DS_1A1D_NORET <0x5, "DS_MIN_I32", VReg_32>;
+def DS_MAX_I32 : DS_1A1D_NORET <0x6, "DS_MAX_I32", VReg_32>;
+def DS_MIN_U32 : DS_1A1D_NORET <0x7, "DS_MIN_U32", VReg_32>;
+def DS_MAX_U32 : DS_1A1D_NORET <0x8, "DS_MAX_U32", VReg_32>;
+def DS_AND_B32 : DS_1A1D_NORET <0x9, "DS_AND_B32", VReg_32>;
+def DS_OR_B32 : DS_1A1D_NORET <0xa, "DS_OR_B32", VReg_32>;
+def DS_XOR_B32 : DS_1A1D_NORET <0xb, "DS_XOR_B32", VReg_32>;
+def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "DS_MSKOR_B32", VReg_32>;
+def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "DS_CMPST_B32", VReg_32>;
+def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "DS_CMPST_F32", VReg_32>;
+def DS_MIN_F32 : DS_1A1D_NORET <0x12, "DS_MIN_F32", VReg_32>;
+def DS_MAX_F32 : DS_1A1D_NORET <0x13, "DS_MAX_F32", VReg_32>;
+
+def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "DS_ADD_RTN_U32", VReg_32>;
+def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "DS_SUB_RTN_U32", VReg_32>;
+def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "DS_RSUB_RTN_U32", VReg_32>;
+def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "DS_INC_RTN_U32", VReg_32>;
+def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "DS_DEC_RTN_U32", VReg_32>;
+def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "DS_MIN_RTN_I32", VReg_32>;
+def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "DS_MAX_RTN_I32", VReg_32>;
+def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "DS_MIN_RTN_U32", VReg_32>;
+def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "DS_MAX_RTN_U32", VReg_32>;
+def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "DS_AND_RTN_B32", VReg_32>;
+def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "DS_OR_RTN_B32", VReg_32>;
+def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "DS_XOR_RTN_B32", VReg_32>;
+def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "DS_MSKOR_RTN_B32", VReg_32>;
+def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "DS_WRXCHG_RTN_B32", VReg_32>;
+//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "DS_WRXCHG2_RTN_B32", VReg_32>;
+//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "DS_WRXCHG2_RTN_B32", VReg_32>;
+def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "DS_CMPST_RTN_B32", VReg_32>;
+def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "DS_CMPST_RTN_F32", VReg_32>;
+def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "DS_MIN_RTN_F32", VReg_32>;
+def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "DS_MAX_RTN_F32", VReg_32>;
+
+let SubtargetPredicate = isCI in {
+def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "DS_WRAP_RTN_F32", VReg_32>;
+} // End isCI
+
+
+def DS_ADD_U64 : DS_1A1D_NORET <0x40, "DS_ADD_U64", VReg_64>;
+def DS_SUB_U64 : DS_1A1D_NORET <0x41, "DS_SUB_U64", VReg_64>;
+def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "DS_RSUB_U64", VReg_64>;
+def DS_INC_U64 : DS_1A1D_NORET <0x43, "DS_INC_U64", VReg_64>;
+def DS_DEC_U64 : DS_1A1D_NORET <0x44, "DS_DEC_U64", VReg_64>;
+def DS_MIN_I64 : DS_1A1D_NORET <0x45, "DS_MIN_I64", VReg_64>;
+def DS_MAX_I64 : DS_1A1D_NORET <0x46, "DS_MAX_I64", VReg_64>;
+def DS_MIN_U64 : DS_1A1D_NORET <0x47, "DS_MIN_U64", VReg_64>;
+def DS_MAX_U64 : DS_1A1D_NORET <0x48, "DS_MAX_U64", VReg_64>;
+def DS_AND_B64 : DS_1A1D_NORET <0x49, "DS_AND_B64", VReg_64>;
+def DS_OR_B64 : DS_1A1D_NORET <0x4a, "DS_OR_B64", VReg_64>;
+def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "DS_XOR_B64", VReg_64>;
+def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "DS_MSKOR_B64", VReg_64>;
+def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "DS_CMPST_B64", VReg_64>;
+def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "DS_CMPST_F64", VReg_64>;
+def DS_MIN_F64 : DS_1A1D_NORET <0x52, "DS_MIN_F64", VReg_64>;
+def DS_MAX_F64 : DS_1A1D_NORET <0x53, "DS_MAX_F64", VReg_64>;
+
+def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "DS_ADD_RTN_U64", VReg_64>;
+def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "DS_SUB_RTN_U64", VReg_64>;
+def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "DS_RSUB_RTN_U64", VReg_64>;
+def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "DS_INC_RTN_U64", VReg_64>;
+def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "DS_DEC_RTN_U64", VReg_64>;
+def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "DS_MIN_RTN_I64", VReg_64>;
+def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "DS_MAX_RTN_I64", VReg_64>;
+def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "DS_MIN_RTN_U64", VReg_64>;
+def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "DS_MAX_RTN_U64", VReg_64>;
+def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "DS_AND_RTN_B64", VReg_64>;
+def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "DS_OR_RTN_B64", VReg_64>;
+def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "DS_XOR_RTN_B64", VReg_64>;
+def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "DS_MSKOR_RTN_B64", VReg_64>;
+def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "DS_WRXCHG_RTN_B64", VReg_64>;
+//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "DS_WRXCHG2_RTN_B64", VReg_64>;
+//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "DS_WRXCHG2ST64_RTN_B64", VReg_64>;
+def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "DS_CMPST_RTN_B64", VReg_64>;
+def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "DS_CMPST_RTN_F64", VReg_64>;
+def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "DS_MIN_RTN_F64", VReg_64>;
+def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_RTN_F64", VReg_64>;
+
+//let SubtargetPredicate = isCI in {
+// DS_CONDXCHG32_RTN_B64
+// DS_CONDXCHG32_RTN_B128
+//} // End isCI
+
+// TODO: _SRC2_* forms
+
def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>;
def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>;
@@ -744,32 +846,46 @@ defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMA
//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <0x00000008, "BUFFER_LOAD_UBYTE", VReg_32>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <0x00000009, "BUFFER_LOAD_SBYTE", VReg_32>;
-defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <0x0000000a, "BUFFER_LOAD_USHORT", VReg_32>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32>;
-defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
+ 0x00000008, "BUFFER_LOAD_UBYTE", VReg_32, i32, az_extloadi8_global
+>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
+ 0x00000009, "BUFFER_LOAD_SBYTE", VReg_32, i32, sextloadi8_global
+>;
+defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
+ 0x0000000a, "BUFFER_LOAD_USHORT", VReg_32, i32, az_extloadi16_global
+>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
+ 0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32, i32, sextloadi16_global
+>;
+defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
+ 0x0000000c, "BUFFER_LOAD_DWORD", VReg_32, i32, global_load
+>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
+ 0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64, v2i32, global_load
+>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
+ 0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load
+>;
def BUFFER_STORE_BYTE : MUBUF_Store_Helper <
- 0x00000018, "BUFFER_STORE_BYTE", VReg_32
+ 0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global
>;
def BUFFER_STORE_SHORT : MUBUF_Store_Helper <
- 0x0000001a, "BUFFER_STORE_SHORT", VReg_32
+ 0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global
>;
def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
- 0x0000001c, "BUFFER_STORE_DWORD", VReg_32
+ 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store
>;
def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
- 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64
+ 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store
>;
def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
- 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128
+ 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store
>;
//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
@@ -885,31 +1001,31 @@ defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>;
//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>;
//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>;
-//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>;
-//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>;
-//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>;
-//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>;
-//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>;
-//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>;
-//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>;
-//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>;
-//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>;
-//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>;
-//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>;
-//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>;
-//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>;
-//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>;
-//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>;
-//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>;
-//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>;
-//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>;
-//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>;
-//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>;
-//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>;
-//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>;
-//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>;
-//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>;
-//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>;
+defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "IMAGE_GATHER4">;
+defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "IMAGE_GATHER4_CL">;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "IMAGE_GATHER4_L">;
+defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "IMAGE_GATHER4_B">;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "IMAGE_GATHER4_B_CL">;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "IMAGE_GATHER4_LZ">;
+defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "IMAGE_GATHER4_C">;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "IMAGE_GATHER4_C_CL">;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "IMAGE_GATHER4_C_L">;
+defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "IMAGE_GATHER4_C_B">;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "IMAGE_GATHER4_C_B_CL">;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "IMAGE_GATHER4_C_LZ">;
+defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "IMAGE_GATHER4_O">;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "IMAGE_GATHER4_CL_O">;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "IMAGE_GATHER4_L_O">;
+defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "IMAGE_GATHER4_B_O">;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "IMAGE_GATHER4_B_CL_O">;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "IMAGE_GATHER4_LZ_O">;
+defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "IMAGE_GATHER4_C_O">;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "IMAGE_GATHER4_C_CL_O">;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "IMAGE_GATHER4_C_L_O">;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "IMAGE_GATHER4_C_B_O">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "IMAGE_GATHER4_C_B_CL_O">;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "IMAGE_GATHER4_C_LZ_O">;
+defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "IMAGE_GET_LOD">;
//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>;
//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>;
//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>;
@@ -962,8 +1078,12 @@ defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
[(set i32:$dst, (fp_to_sint f32:$src0))]
>;
defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
-////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>;
-//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>;
+defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32",
+ [(set i32:$dst, (f32_to_f16 f32:$src0))]
+>;
+defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16",
+ [(set f32:$dst, (f16_to_f32 i32:$src0))]
+>;
//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
@@ -973,10 +1093,18 @@ defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64",
defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
[(set f64:$dst, (fextend f32:$src0))]
>;
-//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
-//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
-//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
-//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>;
+defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0",
+ [(set f32:$dst, (AMDGPUcvt_f32_ubyte0 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1",
+ [(set f32:$dst, (AMDGPUcvt_f32_ubyte1 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2",
+ [(set f32:$dst, (AMDGPUcvt_f32_ubyte2 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3",
+ [(set f32:$dst, (AMDGPUcvt_f32_ubyte3 i32:$src0))]
+>;
defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64",
[(set i32:$dst, (fp_to_uint f64:$src0))]
>;
@@ -988,7 +1116,7 @@ defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
[(set f32:$dst, (AMDGPUfract f32:$src0))]
>;
defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32",
- [(set f32:$dst, (int_AMDGPU_trunc f32:$src0))]
+ [(set f32:$dst, (ftrunc f32:$src0))]
>;
defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32",
[(set f32:$dst, (fceil f32:$src0))]
@@ -1006,24 +1134,33 @@ defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32",
[(set f32:$dst, (flog2 f32:$src0))]
>;
+
defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
- [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
+ [(set f32:$dst, (AMDGPUrcp f32:$src0))]
>;
defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
-defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
+defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32",
+ [(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))]
+>;
defm V_RSQ_LEGACY_F32 : VOP1_32 <
0x0000002d, "V_RSQ_LEGACY_F32",
- [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
+ [(set f32:$dst, (AMDGPUrsq_legacy f32:$src0))]
+>;
+defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32",
+ [(set f32:$dst, (AMDGPUrsq f32:$src0))]
>;
-defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>;
defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64",
- [(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
+ [(set f64:$dst, (AMDGPUrcp f64:$src0))]
>;
defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
-defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>;
-defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>;
+defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64",
+ [(set f64:$dst, (AMDGPUrsq f64:$src0))]
+>;
+defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64",
+ [(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))]
+>;
defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32",
[(set f32:$dst, (fsqrt f32:$src0))]
>;
@@ -1211,7 +1348,7 @@ defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32",
defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
-//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
+defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
@@ -1303,16 +1440,20 @@ defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
-defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
-def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
+defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32",
+ [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64",
+ [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))]
+>;
-def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64",
+def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64",
[(set i64:$dst, (shl i64:$src0, i32:$src1))]
>;
-def V_LSHR_B64 : VOP3_64_Shift <0x00000162, "V_LSHR_B64",
+def V_LSHR_B64 : VOP3_64_32 <0x00000162, "V_LSHR_B64",
[(set i64:$dst, (srl i64:$src0, i32:$src1))]
>;
-def V_ASHR_I64 : VOP3_64_Shift <0x00000163, "V_ASHR_I64",
+def V_ASHR_I64 : VOP3_64_32 <0x00000163, "V_ASHR_I64",
[(set i64:$dst, (sra i64:$src0, i32:$src1))]
>;
@@ -1336,14 +1477,23 @@ defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
} // isCommutable = 1
-defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
-def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
-defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
-def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
+def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
+
+// Double precision division pre-scale.
+def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
+
+defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32",
+ [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64",
+ [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))]
+>;
//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
-def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
+def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64",
+ [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))]
+>;
//===----------------------------------------------------------------------===//
// Pseudo Instructions
@@ -1500,7 +1650,7 @@ let usesCustomInserter = 1 in {
// constant that can be used with the ADDR64 MUBUF instructions.
def SI_ADDR64_RSRC : InstSI <
(outs SReg_128:$srsrc),
- (ins SReg_64:$ptr),
+ (ins SSrc_64:$ptr),
"", []
>;
@@ -1508,7 +1658,7 @@ def V_SUB_F64 : InstSI <
(outs VReg_64:$dst),
(ins VReg_64:$src0, VReg_64:$src1),
"V_SUB_F64 $dst, $src0, $src1",
- []
+ [(set f64:$dst, (fsub f64:$src0, f64:$src1))]
>;
} // end usesCustomInserter
@@ -1529,6 +1679,7 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
}
+defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
@@ -1552,7 +1703,7 @@ def : Pat <
/* int_SI_vs_load_input */
def : Pat<
- (SIload_input v4i32:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
+ (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
(BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0)
>;
@@ -1564,11 +1715,6 @@ def : Pat <
$src0, $src1, $src2, $src3)
>;
-def : Pat <
- (f64 (fsub f64:$src0, f64:$src1)),
- (V_SUB_F64 $src0, $src1)
->;
-
//===----------------------------------------------------------------------===//
// SMRD Patterns
//===----------------------------------------------------------------------===//
@@ -1596,7 +1742,6 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
@@ -1615,6 +1760,24 @@ def : Pat <
(S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
>;
+} // End Predicates = [isSI]
+
+//===----------------------------------------------------------------------===//
+// SOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isSI, isCFDepth0] in {
+
+def : Pat <
+ (i64 (ctpop i64:$src)),
+ (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (S_BCNT1_I32_B64 $src), sub0),
+ (S_MOV_B32 0), sub1)
+>;
+
+} // Predicates = [isSI, isCFDepth0]
+
+let Predicates = [isSI] in {
//===----------------------------------------------------------------------===//
// SOP2 Patterns
//===----------------------------------------------------------------------===//
@@ -1625,18 +1788,39 @@ def : Pat <
>;
//===----------------------------------------------------------------------===//
-// VOP2 Patterns
+// SOPP Patterns
//===----------------------------------------------------------------------===//
def : Pat <
- (or i64:$src0, i64:$src1),
+ (int_AMDGPU_barrier_global),
+ (S_BARRIER)
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+def : RcpPat<V_RCP_F32_e32, f32>;
+def : RcpPat<V_RCP_F64_e32, f64>;
+defm : RsqPat<V_RSQ_F32_e32, f32>;
+defm : RsqPat<V_RSQ_F64_e32, f64>;
+
+//===----------------------------------------------------------------------===//
+// VOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+class BinOp64Pat <SDNode node, Instruction inst> : Pat <
+ (node i64:$src0, i64:$src1),
(INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub0),
+ (inst (EXTRACT_SUBREG i64:$src0, sub0),
(EXTRACT_SUBREG i64:$src1, sub0)), sub0),
- (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub1),
+ (inst (EXTRACT_SUBREG i64:$src0, sub1),
(EXTRACT_SUBREG i64:$src1, sub1)), sub1)
>;
+def : BinOp64Pat <or, V_OR_B32_e32>;
+def : BinOp64Pat <xor, V_XOR_B32_e32>;
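
For illustration, BinOp64Pat <or, V_OR_B32_e32> expands to the same pattern
this hunk removes above: the 64-bit operation is split into two 32-bit ops on
the sub0/sub1 halves (a sketch of the generated pattern, not an additional
definition):

def : Pat <
  (or i64:$src0, i64:$src1),
  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
    (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub0),
                  (EXTRACT_SUBREG i64:$src1, sub0)), sub0),
    (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub1),
                  (EXTRACT_SUBREG i64:$src1, sub1)), sub1)
>;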
+
class SextInReg <ValueType vt, int ShiftAmt> : Pat <
(sext_inreg i32:$src0, vt),
(V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0))
@@ -1645,10 +1829,82 @@ class SextInReg <ValueType vt, int ShiftAmt> : Pat <
def : SextInReg <i8, 24>;
def : SextInReg <i16, 16>;
+def : Pat <
+ (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
+ (V_BCNT_U32_B32_e32 $popcnt, $val)
+>;
+
+def : Pat <
+ (i32 (ctpop i32:$popcnt)),
+ (V_BCNT_U32_B32_e64 $popcnt, 0, 0, 0)
+>;
+
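// Per the SI ISA, V_BCNT_U32_B32 computes bcnt(S0) + S1, so the 64-bit
// pattern below chains the high-half population count onto the low-half
// result and then zero-fills the high dword of the destination.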
+def : Pat <
+ (i64 (ctpop i64:$src)),
+ (INSERT_SUBREG
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (V_BCNT_U32_B32_e32 (EXTRACT_SUBREG $src, sub1),
+ (V_BCNT_U32_B32_e64 (EXTRACT_SUBREG $src, sub0), 0, 0, 0)),
+ sub0),
+ (V_MOV_B32_e32 0), sub1)
+>;
+
/********** ======================= **********/
/********** Image sampling patterns **********/
/********** ======================= **********/
+class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, i32:$dmask, i32:$unorm,
+ i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
+ (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da),
+ (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
+ $addr, $rsrc, $sampler)
+>;
+
+// Only the variants which make sense are defined.
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
+
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
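
For illustration, SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>
above expands to roughly the following (a sketch; the _V4_V4 suffix is the
vdata/vaddr size variant produced by the MIMG_Gather multiclass):

def : Pat <
  (int_SI_gather4 v4i32:$addr, v32i8:$rsrc, v16i8:$sampler, i32:$dmask,
                  i32:$unorm, i32:$r128, i32:$da, i32:$glc, i32:$slc,
                  i32:$tfe, i32:$lwe),
  (IMAGE_GATHER4_V4_V4 (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc),
                       (as_i1imm $da), (as_i1imm $r128), (as_i1imm $tfe),
                       (as_i1imm $lwe), (as_i1imm $slc), $addr, $rsrc, $sampler)
>;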
+
/* SIsample for simple 1D texture lookup */
def : Pat <
(SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
@@ -1864,7 +2120,10 @@ def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
-
+def : BitConvert <v2f32, i64, VReg_64>;
+def : BitConvert <i64, v2f32, VReg_64>;
+def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v4f32, v4i32, VReg_128>;
def : BitConvert <v4i32, v4f32, VReg_128>;
@@ -1894,7 +2153,7 @@ def FCLAMP_SI : AMDGPUShaderInst <
}
def : Pat <
- (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
+ (AMDGPUclamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
(FCLAMP_SI f32:$src)
>;
@@ -2106,7 +2365,7 @@ def : Pat <
(V_MUL_HI_I32 $src0, $src1, (i32 0))
>;
-defm : BFIPatterns <V_BFI_B32>;
+defm : BFIPatterns <V_BFI_B32, S_MOV_B32>;
def : ROTRPattern <V_ALIGNBIT_B32>;
/********** ======================= **********/
@@ -2130,7 +2389,7 @@ defm : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>;
defm : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
defm : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
defm : DSReadPat <DS_READ_B32, i32, local_load>;
-defm : DSReadPat <DS_READ_B64, i64, local_load>;
+defm : DSReadPat <DS_READ_B64, v2i32, local_load>;
multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> {
def : Pat <
@@ -2139,48 +2398,109 @@ multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> {
>;
def : Pat <
- (frag vt:$src1, i32:$src0),
- (inst 0, $src0, $src1, 0)
+ (frag vt:$val, i32:$ptr),
+ (inst 0, $ptr, $val, 0)
>;
}
defm : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
defm : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
defm : DSWritePat <DS_WRITE_B32, i32, local_store>;
-defm : DSWritePat <DS_WRITE_B64, i64, local_store>;
+defm : DSWritePat <DS_WRITE_B64, v2i32, local_store>;
-def : Pat <(atomic_load_add_local i32:$ptr, i32:$val),
- (DS_ADD_U32_RTN 0, $ptr, $val, 0)>;
-
-def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val),
- (DS_SUB_U32_RTN 0, $ptr, $val, 0)>;
+multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> {
+ def : Pat <
+ (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$value),
+ (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+ >;
-//===----------------------------------------------------------------------===//
-// MUBUF Patterns
-//===----------------------------------------------------------------------===//
+ def : Pat <
+ (frag i32:$ptr, vt:$val),
+ (inst 0, $ptr, $val, 0)
+ >;
+}
-multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
- PatFrag global_ld, PatFrag constant_ld> {
+// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
+//
+// We need to use something for the data0, so we set a register to
+// -1. For the non-rtn variants, the manual says it does
+// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max
+// will always do the increment so I'm assuming it's the same.
+//
+// We also load this -1 with s_mov_b32 / s_mov_b64 even though this
+// needs to be a VGPR. The SGPR copy pass will fix this, and it's
+// easier since there is no v_mov_b64.
+multiclass DSAtomicIncRetPat<DS inst, ValueType vt,
+ Instruction LoadImm, PatFrag frag> {
def : Pat <
- (vt (global_ld (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset))),
- (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset))
+ (frag (add i32:$ptr, (i32 IMM16bit:$offset)), (vt 1)),
+ (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset))
>;
def : Pat <
- (vt (global_ld (add i64:$ptr, (i64 IMM12bit:$offset)))),
- (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset))
+ (frag i32:$ptr, (vt 1)),
+ (inst 0, $ptr, (LoadImm (vt -1)), 0)
>;
+}
+multiclass DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> {
def : Pat <
- (vt (global_ld i64:$ptr)),
- (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
+ (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$cmp, vt:$swap),
+ (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset))
>;
def : Pat <
- (vt (global_ld (add i64:$ptr, i64:$offset))),
- (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+ (frag i32:$ptr, vt:$cmp, vt:$swap),
+ (inst 0, $ptr, $cmp, $swap, 0)
>;
+}
+
+
+// 32-bit atomics.
+defm : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
+ S_MOV_B32, atomic_load_add_local>;
+defm : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
+ S_MOV_B32, atomic_load_sub_local>;
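
For illustration, the DS_INC_RTN_U32 defm above yields, for the no-offset
case, a pattern along these lines (a sketch of what the multiclass generates;
the all-ones data operand comes from S_MOV_B32 as described in the comment
above):

def : Pat <
  (atomic_load_add_local i32:$ptr, (i32 1)),
  (DS_INC_RTN_U32 0, $ptr, (S_MOV_B32 (i32 -1)), 0)
>;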
+
+defm : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>;
+defm : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>;
+defm : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>;
+defm : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>;
+defm : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>;
+defm : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
+
+defm : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
+
+// 64-bit atomics.
+defm : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
+ S_MOV_B64, atomic_load_add_local>;
+defm : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
+ S_MOV_B64, atomic_load_sub_local>;
+
+defm : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
+defm : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>;
+defm : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>;
+defm : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>;
+defm : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>;
+defm : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>;
+
+defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
+
+//===----------------------------------------------------------------------===//
+// MUBUF Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
+ PatFrag constant_ld> {
def : Pat <
(vt (constant_ld (add i64:$ptr, i64:$offset))),
(Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
@@ -2188,53 +2508,19 @@ multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
}
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32,
- sextloadi8_global, sextloadi8_constant>;
+ sextloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32,
- az_extloadi8_global, az_extloadi8_constant>;
+ az_extloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32,
- sextloadi16_global, sextloadi16_constant>;
+ sextloadi16_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32,
- az_extloadi16_global, az_extloadi16_constant>;
+ az_extloadi16_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
- global_load, constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, i64,
- global_load, constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, i64,
- az_extloadi32_global, az_extloadi32_constant>;
+ constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32,
- global_load, constant_load>;
+ constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
- global_load, constant_load>;
-
-multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> {
-
- def : Pat <
- (st vt:$value, (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset)),
- (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset))
- >;
-
- def : Pat <
- (st vt:$value, (add i64:$ptr, IMM12bit:$offset)),
- (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset))
- >;
-
- def : Pat <
- (st vt:$value, i64:$ptr),
- (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
- >;
-
- def : Pat <
- (st vt:$value, (add i64:$ptr, i64:$offset)),
- (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, 0)
- >;
-}
-
-defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE, i32, truncstorei8_global>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT, i32, truncstorei16_global>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORD, i32, global_store>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64, global_store>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, v2i32, global_store>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32, global_store>;
+ constant_load>;
// BUFFER_LOAD_DWORD*, addr64=0
multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
@@ -2301,7 +2587,7 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
-let Predicates = [isCI] in {
+let SubtargetPredicate = isCI in {
// Sea Islands new arithmetic instructions
let neverHasSideEffects = 1 in {
@@ -2348,7 +2634,7 @@ def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>;
// BUFFER_LOAD_DWORDX3
// BUFFER_STORE_DWORDX3
-} // End Predicates = [isCI]
+} // End SubtargetPredicate = isCI
/********** ====================== **********/
@@ -2360,13 +2646,13 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST I
// 1. Extract with offset
def : Pat<
(vector_extract vt:$vec, (add i32:$idx, imm:$off)),
- (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
+ (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
>;
// 2. Extract without offset
def : Pat<
(vector_extract vt:$vec, i32:$idx),
- (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
+ (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
>;
// 3. Insert with offset
@@ -2392,20 +2678,6 @@ defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>;
defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>;
defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>;
-/********** =============== **********/
-/********** Conditions **********/
-/********** =============== **********/
-
-def : Pat<
- (i1 (setcc f32:$src0, f32:$src1, SETO)),
- (V_CMP_O_F32_e64 $src0, $src1)
->;
-
-def : Pat<
- (i1 (setcc f32:$src0, f32:$src1, SETUO)),
- (V_CMP_U_F32_e64 $src0, $src1)
->;
-
//===----------------------------------------------------------------------===//
// Conversion Patterns
//===----------------------------------------------------------------------===//
@@ -2439,6 +2711,62 @@ def : Pat <
(S_MOV_B32 -1), sub1)
>;
+class ZExt_i64_i32_Pat <SDNode ext> : Pat <
+ (i64 (ext i32:$src)),
+ (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
+ (S_MOV_B32 0), sub1)
+>;
+
+class ZExt_i64_i1_Pat <SDNode ext> : Pat <
+ (i64 (ext i1:$src)),
+ (INSERT_SUBREG
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0),
+ (S_MOV_B32 0), sub1)
+>;
+
+
+def : ZExt_i64_i32_Pat<zext>;
+def : ZExt_i64_i32_Pat<anyext>;
+def : ZExt_i64_i1_Pat<zext>;
+def : ZExt_i64_i1_Pat<anyext>;
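// The i1 sources above and below live in a 64-bit condition-mask register
// class on SI, so V_CNDMASK_B32_e64 is used to materialize them as 0 or 1
// (or 0 / -1 for sign extension) in a VGPR before widening.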
+
+def : Pat <
+ (i64 (sext i32:$src)),
+ (INSERT_SUBREG
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
+ (S_ASHR_I32 $src, 31), sub1)
+>;
+
+def : Pat <
+ (i64 (sext i1:$src)),
+ (INSERT_SUBREG
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)),
+ (V_CNDMASK_B32_e64 0, -1, $src), sub0),
+ (V_CNDMASK_B32_e64 0, -1, $src), sub1)
+>;
+
+def : Pat <
+ (f32 (sint_to_fp i1:$src)),
+ (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
+>;
+
+def : Pat <
+ (f32 (uint_to_fp i1:$src)),
+ (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src)
+>;
+
+def : Pat <
+ (f64 (sint_to_fp i1:$src)),
+ (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+>;
+
+def : Pat <
+ (f64 (uint_to_fp i1:$src)),
+ (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
+>;
+
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 00e32c0..df690a4 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -56,11 +56,61 @@ let TargetPrefix = "SI", isTarget = 1 in {
class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+ // Fully-flexible SAMPLE instruction.
+ class SampleRaw : Intrinsic <
+ [llvm_v4f32_ty], // vdata(VGPR)
+ [llvm_anyint_ty, // vaddr(VGPR)
+ llvm_v32i8_ty, // rsrc(SGPR)
+ llvm_v16i8_ty, // sampler(SGPR)
+ llvm_i32_ty, // dmask(imm)
+ llvm_i32_ty, // unorm(imm)
+ llvm_i32_ty, // r128(imm)
+ llvm_i32_ty, // da(imm)
+ llvm_i32_ty, // glc(imm)
+ llvm_i32_ty, // slc(imm)
+ llvm_i32_ty, // tfe(imm)
+ llvm_i32_ty], // lwe(imm)
+ [IntrNoMem]>;
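  // The immediate operands above are matched onto the MIMG encoding fields
  // by the SampleRawPattern class in SIInstructions.td.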
+
def int_SI_sample : Sample;
def int_SI_sampleb : Sample;
def int_SI_sampled : Sample;
def int_SI_samplel : Sample;
+ // Basic gather4
+ def int_SI_gather4 : SampleRaw;
+ def int_SI_gather4_cl : SampleRaw;
+ def int_SI_gather4_l : SampleRaw;
+ def int_SI_gather4_b : SampleRaw;
+ def int_SI_gather4_b_cl : SampleRaw;
+ def int_SI_gather4_lz : SampleRaw;
+
+ // Gather4 with comparison
+ def int_SI_gather4_c : SampleRaw;
+ def int_SI_gather4_c_cl : SampleRaw;
+ def int_SI_gather4_c_l : SampleRaw;
+ def int_SI_gather4_c_b : SampleRaw;
+ def int_SI_gather4_c_b_cl : SampleRaw;
+ def int_SI_gather4_c_lz : SampleRaw;
+
+ // Gather4 with offsets
+ def int_SI_gather4_o : SampleRaw;
+ def int_SI_gather4_cl_o : SampleRaw;
+ def int_SI_gather4_l_o : SampleRaw;
+ def int_SI_gather4_b_o : SampleRaw;
+ def int_SI_gather4_b_cl_o : SampleRaw;
+ def int_SI_gather4_lz_o : SampleRaw;
+
+ // Gather4 with comparison and offsets
+ def int_SI_gather4_c_o : SampleRaw;
+ def int_SI_gather4_c_cl_o : SampleRaw;
+ def int_SI_gather4_c_l_o : SampleRaw;
+ def int_SI_gather4_c_b_o : SampleRaw;
+ def int_SI_gather4_c_b_cl_o : SampleRaw;
+ def int_SI_gather4_c_lz_o : SampleRaw;
+
+ def int_SI_getlod : SampleRaw;
+
def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 6601f2a..9f5ff29 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -86,6 +86,7 @@ private:
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
+ void InitM0ForLDS(MachineBasicBlock::iterator MI);
void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
void IndirectSrc(MachineInstr &MI);
void IndirectDst(MachineInstr &MI);
@@ -320,6 +321,14 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
MI.eraseFromParent();
}
+/// The m0 register stores the maximum allowable address for LDS reads and
+/// writes. Its value must be at least the size in bytes of LDS allocated by
+/// the shader. For simplicity, we set it to the maximum possible value.
+void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
+ AMDGPU::M0).addImm(0xffffffff);
+}
+
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
MachineBasicBlock &MBB = *MI.getParent();
@@ -333,52 +342,57 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addReg(Idx);
MBB.insert(I, MovRel);
- MI.eraseFromParent();
- return;
- }
+ } else {
- assert(AMDGPU::SReg_64RegClass.contains(Save));
- assert(AMDGPU::VReg_32RegClass.contains(Idx));
+ assert(AMDGPU::SReg_64RegClass.contains(Save));
+ assert(AMDGPU::VReg_32RegClass.contains(Idx));
- // Save the EXEC mask
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
- .addReg(AMDGPU::EXEC);
+ // Save the EXEC mask
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+ .addReg(AMDGPU::EXEC);
- // Read the next variant into VCC (lower 32 bits) <- also loop target
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- AMDGPU::VCC_LO)
- .addReg(Idx);
+ // Read the next variant into VCC (lower 32 bits) <- also loop target
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ AMDGPU::VCC_LO)
+ .addReg(Idx);
- // Move index from VCC into M0
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(AMDGPU::VCC_LO);
+ // Move index from VCC into M0
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(AMDGPU::VCC_LO);
- // Compare the just read M0 value to all possible Idx values
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
- .addReg(AMDGPU::M0)
- .addReg(Idx);
+ // Compare the just read M0 value to all possible Idx values
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
+ .addReg(AMDGPU::M0)
+ .addReg(Idx);
- // Update EXEC, save the original EXEC value to VCC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
+ // Update EXEC, save the original EXEC value to VCC
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
- // Do the actual move
- MBB.insert(I, MovRel);
+ // Do the actual move
+ MBB.insert(I, MovRel);
- // Update EXEC, switch all done bits to 0 and all todo bits to 1
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::VCC);
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
- // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(-7)
- .addReg(AMDGPU::EXEC);
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addImm(-7)
+ .addReg(AMDGPU::EXEC);
- // Restore EXEC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addReg(Save);
+ // Restore EXEC
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(Save);
+ }
+ // FIXME: Are there any values other than the LDS address clamp that need to
+ // be stored in the m0 register and may be live for more than a few
+ // instructions? If so, we should save the m0 register at the beginning
+ // of this function and restore it here.
+ // FIXME: Add support for LDS direct loads.
+ InitM0ForLDS(&MI);
MI.eraseFromParent();
}
@@ -523,8 +537,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock &MBB = MF.front();
// Initialize M0 to a value that won't cause LDS access to be discarded
// due to offset clamping
- BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_MOV_B32),
- AMDGPU::M0).addImm(0xffffffff);
+ InitM0ForLDS(MBB.getFirstNonPHI());
}
if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) {
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index af60995..e2df950 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -62,8 +62,10 @@ static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
return VGPR;
}
}
- MF->getFunction()->getContext().emitError(
- "Could not found S_ENGPGM instrtuction.");
+
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("Could not find S_ENDPGM instruction.");
+
return VGPR;
}
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index c72d549..d0b677a 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -14,21 +14,20 @@
#include "SIRegisterInfo.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
using namespace llvm;
-SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm)
-: AMDGPURegisterInfo(tm),
- TM(tm)
+SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st)
+: AMDGPURegisterInfo(st)
{ }
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::EXEC);
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
- const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(TM.getInstrInfo());
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
TII->reserveIndirectRegisters(Reserved, MF);
return Reserved;
}
@@ -38,15 +37,6 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return RC->getNumRegs();
}
-const TargetRegisterClass *
-SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
- switch (rc->getID()) {
- case AMDGPU::GPRF32RegClassID:
- return &AMDGPU::VReg_32RegClass;
- default: return rc;
- }
-}
-
const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
MVT VT) const {
switch(VT.SimpleTy) {
@@ -135,3 +125,19 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
unsigned Index = getHWRegIndex(Reg);
return SubRC->getRegister(Index + Channel);
}
+
+bool SIRegisterInfo::regClassCanUseImmediate(int RCID) const {
+ switch (RCID) {
+ default: return false;
+ case AMDGPU::SSrc_32RegClassID:
+ case AMDGPU::SSrc_64RegClassID:
+ case AMDGPU::VSrc_32RegClassID:
+ case AMDGPU::VSrc_64RegClassID:
+ return true;
+ }
+}
+
+bool SIRegisterInfo::regClassCanUseImmediate(
+ const TargetRegisterClass *RC) const {
+ return regClassCanUseImmediate(RC->getID());
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index 36b4fcd..c9305fb 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -20,24 +20,15 @@
namespace llvm {
-class AMDGPUTargetMachine;
-
struct SIRegisterInfo : public AMDGPURegisterInfo {
- AMDGPUTargetMachine &TM;
- SIRegisterInfo(AMDGPUTargetMachine &tm);
+ SIRegisterInfo(const AMDGPUSubtarget &st);
BitVector getReservedRegs(const MachineFunction &MF) const override;
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
- /// \param RC is an AMDIL reg class.
- ///
- /// \returns the SI register class that is equivalent to \p RC.
- const TargetRegisterClass *
- getISARegClass(const TargetRegisterClass *RC) const override;
-
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
@@ -69,6 +60,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
/// \returns The sub-register of Reg that is in Channel.
unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC,
unsigned Channel) const;
+
+ /// \returns True if operands defined with this register class can accept
+ /// inline immediates.
+ bool regClassCanUseImmediate(int RCID) const;
+
+ /// \returns True if operands defined with this register class can accept
+ /// inline immediates.
+ bool regClassCanUseImmediate(const TargetRegisterClass *RC) const;
};
} // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index f1f01de..8974b63 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -168,7 +168,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
(add SGPR_64Regs, VCCReg, EXECReg)
>;
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
index a0b6907..367963a 100644
--- a/lib/Target/R600/SITypeRewriter.cpp
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -119,8 +119,7 @@ void SITypeRewriter::visitCallInst(CallInst &I) {
Type::getInt32Ty(I.getContext())){
Type *ElementTy = Arg->getType()->getVectorElementType();
std::string TypeName = "i32";
- InsertElementInst *Def = dyn_cast<InsertElementInst>(Arg);
- assert(Def);
+ InsertElementInst *Def = cast<InsertElementInst>(Arg);
Args.push_back(Def->getOperand(1));
Types.push_back(ElementTy);
std::string VecTypeName = "v1" + TypeName;