Diffstat (limited to 'lib/Target/R600')
55 files changed, 3204 insertions, 866 deletions
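A note on the RSRC1 packing in the AMDGPUAsmPrinter.cpp hunk below: the SI path emits S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), i.e. register usage is reported to the driver in hardware granules of 4 VGPRs and 8 SGPRs. A minimal standalone sketch of that packing, assuming the usual field layout (VGPRS in bits 5:0, SGPRS in bits 9:6); the real macros live in SIDefines.h, which is not part of this diff:

    #include <cstdint>
    #include <cstdio>

    // Assumed RSRC1 field layout (mirrors the S_00B028_* macros in SIDefines.h).
    static uint32_t S_00B028_VGPRS(uint32_t x) { return (x & 0x3F) << 0; }
    static uint32_t S_00B028_SGPRS(uint32_t x) { return (x & 0x0F) << 6; }

    int main() {
      unsigned MaxVGPR = 23, MaxSGPR = 14; // hypothetical per-function maxima
      // Usage is encoded in granules: one unit per 4 VGPRs / 8 SGPRs.
      uint32_t Rsrc1 = S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8);
      printf("RSRC1 = 0x%08x\n", Rsrc1); // 0x00000045: 5 VGPR granules, 1 SGPR granule
      return 0;
    }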
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index e099a9f..9792bd8 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -23,6 +23,9 @@ class AMDGPUTargetMachine; // R600 Passes FunctionPass* createR600KernelParametersPass(const DataLayout *TD); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); +FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm); +FunctionPass *createR600Packetizer(TargetMachine &tm); +FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); // SI Passes FunctionPass *createSIAnnotateControlFlowPass(); diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index f600144..c915f50 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -19,9 +19,15 @@ #include "AMDGPUAsmPrinter.h" #include "AMDGPU.h" +#include "SIDefines.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -50,15 +56,57 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (OutStreamer.hasRawTextSupport()) { OutStreamer.EmitRawText("@" + MF.getName() + ":"); } - OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); + + const MCSectionELF *ConfigSection = getObjFileLowering().getContext() + .getELFSection(".AMDGPU.config", + ELF::SHT_PROGBITS, 0, + SectionKind::getReadOnly()); + OutStreamer.SwitchSection(ConfigSection); if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { - EmitProgramInfo(MF); + EmitProgramInfoSI(MF); + } else { + EmitProgramInfoR600(MF); } + OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); EmitFunctionBody(); return false; } -void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { +void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { + unsigned MaxGPR = 0; + bool killPixel = false; + const R600RegisterInfo * RI = + static_cast<const R600RegisterInfo*>(TM.getRegisterInfo()); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + if (MI.getOpcode() == AMDGPU::KILLGT) + killPixel = true; + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + MachineOperand & MO = MI.getOperand(op_idx); + if (!MO.isReg()) + continue; + unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; + + // Registers with a value > 127 aren't GPRs + if (HWReg > 127) + continue; + MaxGPR = std::max(MaxGPR, HWReg); + } + } + } + OutStreamer.EmitIntValue(MaxGPR + 1, 4); + OutStreamer.EmitIntValue(MFI->StackSize, 4); + OutStreamer.EmitIntValue(killPixel, 4); +} + +void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) { unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; @@ -107,6 +155,9 @@ void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { } else if (AMDGPU::VReg_64RegClass.contains(reg)) { isSGPR = false; width = 2; + } else if (AMDGPU::VReg_96RegClass.contains(reg)) { + isSGPR = false; + width = 3; } else if (AMDGPU::SReg_128RegClass.contains(reg)) { isSGPR = true;
width = 4; @@ -139,7 +190,19 @@ void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { MaxSGPR += 2; } SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>(); - OutStreamer.EmitIntValue(MaxSGPR + 1, 4); - OutStreamer.EmitIntValue(MaxVGPR + 1, 4); - OutStreamer.EmitIntValue(MFI->PSInputAddr, 4); + unsigned RsrcReg; + switch (MFI->ShaderType) { + default: // Fall through + case ShaderType::COMPUTE: RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break; + case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break; + case ShaderType::PIXEL: RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break; + case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break; + } + + OutStreamer.EmitIntValue(RsrcReg, 4); + OutStreamer.EmitIntValue(S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), 4); + if (MFI->ShaderType == ShaderType::PIXEL) { + OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); + OutStreamer.EmitIntValue(MFI->PSInputAddr, 4); + } } diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index 3812282..f425ef4 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -33,7 +33,8 @@ public: /// \brief Emit register usage information so that the GPU driver /// can correctly setup the GPU state. - void EmitProgramInfo(MachineFunction &MF); + void EmitProgramInfoR600(MachineFunction &MF); + void EmitProgramInfoSI(MachineFunction &MF); /// Implemented in AMDGPUMCInstLower.cpp virtual void EmitInstruction(const MachineInstr *MI); diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td index 45ae37e..9c30515 100644 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -32,8 +32,14 @@ def CC_SI : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 - ]>>> + ]>>>, + // This is the default for i64 values. + // XXX: We should change this once clang understands the CC_AMDGPU. 
+ CCIfType<[i64], CCAssignToRegWithShadow< + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + >> ]>; def CC_AMDGPU : CallingConv<[ diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 5995b6f..a266df5 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -60,6 +60,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Custom); setOperationAction(ISD::UREM, MVT::i32, Expand); diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index f31b646..c2a79ea 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -116,6 +116,7 @@ enum { BRANCH_COND, // End AMDIL ISD Opcodes BITALIGN, + BUFFER_STORE, DWORDADDR, FRACT, FMAX, diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index e740348..83e1359 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -94,6 +94,7 @@ class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; +int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding } def CONST : Constants; @@ -115,21 +116,21 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "CLAMP $dst, $src0", - [(set rc:$dst, (int_AMDIL_clamp rc:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] + [(set f32:$dst, (int_AMDIL_clamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] >; class FABS <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "FABS $dst, $src0", - [(set rc:$dst, (fabs rc:$src0))] + [(set f32:$dst, (fabs f32:$src0))] >; class FNEG <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "FNEG $dst, $src0", - [(set rc:$dst, (fneg rc:$src0))] + [(set f32:$dst, (fneg f32:$src0))] >; } // usesCustomInserter = 1 @@ -140,8 +141,7 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, (outs dstClass:$dst), (ins addrClass:$addr, i32imm:$chan), "RegisterLoad $dst, $addr", - [(set (i32 dstClass:$dst), (AMDGPUregister_load addrPat:$addr, - (i32 timm:$chan)))] + [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))] > { let isRegisterLoad = 1; } @@ -150,7 +150,7 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, (outs), (ins dstClass:$val, addrClass:$addr, i32imm:$chan), "RegisterStore $val, $addr", - [(AMDGPUregister_store (i32 dstClass:$val), addrPat:$addr, (i32 timm:$chan))] + [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))] > { let isRegisterStore = 1; } @@ -161,105 +161,121 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, /* Generic helper patterns for intrinsics */ /* -------------------------------------- */ -class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul, - RegisterClass rc> : Pat < - (fpow rc:$src0, rc:$src1), - (exp_ieee (mul rc:$src1, (log_ieee rc:$src0))) +class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul> + : Pat < + (fpow f32:$src0, f32:$src1), + (exp_ieee (mul f32:$src1, (log_ieee f32:$src0))) >; /* Other helper patterns */ /* --------------------- */ /* 
Extract element pattern */ -class Extract_Element <ValueType sub_type, ValueType vec_type, - RegisterClass vec_class, int sub_idx, - SubRegIndex sub_reg>: Pat< - (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)), - (EXTRACT_SUBREG vec_class:$src, sub_reg) +class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, + SubRegIndex sub_reg> + : Pat< + (sub_type (vector_extract vec_type:$src, sub_idx)), + (EXTRACT_SUBREG $src, sub_reg) >; /* Insert element pattern */ class Insert_Element <ValueType elem_type, ValueType vec_type, - RegisterClass elem_class, RegisterClass vec_class, - int sub_idx, SubRegIndex sub_reg> : Pat < - - (vec_type (vector_insert (vec_type vec_class:$vec), - (elem_type elem_class:$elem), sub_idx)), - (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg) + int sub_idx, SubRegIndex sub_reg> + : Pat < + (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), + (INSERT_SUBREG $vec, $elem, sub_reg) >; // Vector Build pattern -class Vector1_Build <ValueType vecType, RegisterClass vectorClass, - ValueType elemType, RegisterClass elemClass> : Pat < - (vecType (build_vector (elemType elemClass:$src))), - (vecType elemClass:$src) +class Vector1_Build <ValueType vecType, ValueType elemType, + RegisterClass rc> : Pat < + (vecType (build_vector elemType:$src)), + (vecType (COPY_TO_REGCLASS $src, rc)) >; -class Vector2_Build <ValueType vecType, RegisterClass vectorClass, - ValueType elemType, RegisterClass elemClass> : Pat < - (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1))), +class Vector2_Build <ValueType vecType, ValueType elemType> : Pat < + (vecType (build_vector elemType:$sub0, elemType:$sub1)), (INSERT_SUBREG (INSERT_SUBREG - (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1) + (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1) >; -class Vector4_Build <ValueType vecType, RegisterClass vectorClass, - ValueType elemType, RegisterClass elemClass> : Pat < - (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y), - (elemType elemClass:$z), (elemType elemClass:$w))), +class Vector4_Build <ValueType vecType, ValueType elemType> : Pat < + (vecType (build_vector elemType:$x, elemType:$y, elemType:$z, elemType:$w)), (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG - (vecType (IMPLICIT_DEF)), elemClass:$x, sub0), elemClass:$y, sub1), - elemClass:$z, sub2), elemClass:$w, sub3) + (vecType (IMPLICIT_DEF)), $x, sub0), $y, sub1), $z, sub2), $w, sub3) >; -class Vector8_Build <ValueType vecType, RegisterClass vectorClass, - ValueType elemType, RegisterClass elemClass> : Pat < - (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1), - (elemType elemClass:$sub2), (elemType elemClass:$sub3), - (elemType elemClass:$sub4), (elemType elemClass:$sub5), - (elemType elemClass:$sub6), (elemType elemClass:$sub7))), - (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG +class Vector8_Build <ValueType vecType, ValueType elemType> : Pat < + (vecType (build_vector elemType:$sub0, elemType:$sub1, + elemType:$sub2, elemType:$sub3, + elemType:$sub4, elemType:$sub5, + elemType:$sub6, elemType:$sub7)), (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG - (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1), - elemClass:$sub2, sub2), elemClass:$sub3, sub3), - elemClass:$sub4, sub4), elemClass:$sub5, sub5), - elemClass:$sub6, sub6), elemClass:$sub7, sub7) + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (vecType 
(IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1), + $sub2, sub2), $sub3, sub3), + $sub4, sub4), $sub5, sub5), + $sub6, sub6), $sub7, sub7) >; -class Vector16_Build <ValueType vecType, RegisterClass vectorClass, - ValueType elemType, RegisterClass elemClass> : Pat < - (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1), - (elemType elemClass:$sub2), (elemType elemClass:$sub3), - (elemType elemClass:$sub4), (elemType elemClass:$sub5), - (elemType elemClass:$sub6), (elemType elemClass:$sub7), - (elemType elemClass:$sub8), (elemType elemClass:$sub9), - (elemType elemClass:$sub10), (elemType elemClass:$sub11), - (elemType elemClass:$sub12), (elemType elemClass:$sub13), - (elemType elemClass:$sub14), (elemType elemClass:$sub15))), - (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG +class Vector16_Build <ValueType vecType, ValueType elemType> : Pat < + (vecType (build_vector elemType:$sub0, elemType:$sub1, + elemType:$sub2, elemType:$sub3, + elemType:$sub4, elemType:$sub5, + elemType:$sub6, elemType:$sub7, + elemType:$sub8, elemType:$sub9, + elemType:$sub10, elemType:$sub11, + elemType:$sub12, elemType:$sub13, + elemType:$sub14, elemType:$sub15)), (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG - (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG - (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG - (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1), - elemClass:$sub2, sub2), elemClass:$sub3, sub3), - elemClass:$sub4, sub4), elemClass:$sub5, sub5), - elemClass:$sub6, sub6), elemClass:$sub7, sub7), - elemClass:$sub8, sub8), elemClass:$sub9, sub9), - elemClass:$sub10, sub10), elemClass:$sub11, sub11), - elemClass:$sub12, sub12), elemClass:$sub13, sub13), - elemClass:$sub14, sub14), elemClass:$sub15, sub15) + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1), + $sub2, sub2), $sub3, sub3), + $sub4, sub4), $sub5, sub5), + $sub6, sub6), $sub7, sub7), + $sub8, sub8), $sub9, sub9), + $sub10, sub10), $sub11, sub11), + $sub12, sub12), $sub13, sub13), + $sub14, sub14), $sub15, sub15) >; +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. // bitconvert pattern class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat < (dt (bitconvert (st rc:$src0))), (dt rc:$src0) >; +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. 
class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat < (vt (AMDGPUdwordaddr (vt rc:$addr))), (vt rc:$addr) >; +// BFI_INT patterns + +multiclass BFIPatterns <Instruction BFI_INT> { + + // Definition from ISA doc: + // (y & x) | (z & ~x) + def : Pat < + (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), + (BFI_INT $x, $y, $z) + >; + + // SHA-256 Ch function + // z ^ (x & (y ^ z)) + def : Pat < + (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), + (BFI_INT $x, $y, $z) + >; + +} + include "R600Instructions.td" include "SIInstrInfo.td" diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp new file mode 100644 index 0000000..0461025 --- /dev/null +++ b/lib/Target/R600/AMDGPUMachineFunction.cpp @@ -0,0 +1,24 @@ +#include "AMDGPUMachineFunction.h" +#include "AMDGPU.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" + +namespace llvm { + +const char *AMDGPUMachineFunction::ShaderTypeAttribute = "ShaderType"; + +AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : + MachineFunctionInfo() { + ShaderType = ShaderType::COMPUTE; + AttributeSet Set = MF.getFunction()->getAttributes(); + Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, + ShaderTypeAttribute); + + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + if (Str.getAsInteger(0, ShaderType)) + llvm_unreachable("Can't parse shader type!"); + } +} + +} diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h new file mode 100644 index 0000000..21c8c51 --- /dev/null +++ b/lib/Target/R600/AMDGPUMachineFunction.h @@ -0,0 +1,29 @@ +//===-- AMDGPUMachineFunction.h - AMDGPU Machine Function Info ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUMACHINEFUNCTION_H +#define AMDGPUMACHINEFUNCTION_H + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +class AMDGPUMachineFunction : public MachineFunctionInfo { +private: + static const char *ShaderTypeAttribute; +public: + AMDGPUMachineFunction(const MachineFunction &MF); + unsigned ShaderType; +}; + +} +#endif // AMDGPUMACHINEFUNCTION_H diff --git a/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/lib/Target/R600/AMDGPUStructurizeCFG.cpp index b723433..dea43b8 100644 --- a/lib/Target/R600/AMDGPUStructurizeCFG.cpp +++ b/lib/Target/R600/AMDGPUStructurizeCFG.cpp @@ -17,6 +17,7 @@ #include "AMDGPU.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/MapVector.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" @@ -40,13 +41,14 @@ typedef SmallVector<BBValuePair, 2> BBValueVector; typedef SmallPtrSet<BasicBlock *, 8> BBSet; -typedef DenseMap<PHINode *, BBValueVector> PhiMap; +typedef MapVector<PHINode *, BBValueVector> PhiMap; +typedef MapVector<BasicBlock *, BBVector> BB2BBVecMap; + typedef DenseMap<DomTreeNode *, unsigned> DTN2UnsignedMap; typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap; typedef DenseMap<BasicBlock *, Value *> BBPredicates; typedef DenseMap<BasicBlock *, BBPredicates> PredMap; typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap; -typedef DenseMap<BasicBlock *, BBVector> BB2BBVecMap; // The name for newly created blocks.
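The BFIPatterns multiclass above maps both the raw ISA form and the SHA-256 Ch form onto the same BFI_INT instruction, which is only sound if the two expressions agree bit for bit. A quick standalone check of that identity (not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Exhaustively verify z ^ (x & (y ^ z)) == (y & x) | (z & ~x) on 8-bit
    // values; both sides are bitwise, so this covers i32 bit by bit.
    int main() {
      for (uint32_t x = 0; x < 256; ++x)
        for (uint32_t y = 0; y < 256; ++y)
          for (uint32_t z = 0; z < 256; ++z)
            assert((z ^ (x & (y ^ z))) == ((y & x) | (z & ~x)));
      return 0;
    }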
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index 0f356a1..a7e1d7b 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -33,6 +33,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : DefaultSize[0] = 64; DefaultSize[1] = 1; DefaultSize[2] = 1; + HasVertexCache = false; ParseSubtargetFeatures(GPU, FS); DevName = GPU; Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit); @@ -53,6 +54,10 @@ AMDGPUSubtarget::is64bit() const { return Is64bit; } bool +AMDGPUSubtarget::hasVertexCache() const { + return HasVertexCache; +} +bool AMDGPUSubtarget::isTargetELF() const { return false; } diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 1973fc6..b6501a4 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -36,6 +36,7 @@ private: bool Is32on64bit; bool DumpCode; bool R600ALUInst; + bool HasVertexCache; InstrItineraryData InstrItins; @@ -48,6 +49,7 @@ public: bool isOverride(AMDGPUDeviceInfo::Caps) const; bool is64bit() const; + bool hasVertexCache() const; // Helper functions to simplify if statements bool isTargetELF() const; diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 0185747..0ec67ce 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -151,8 +151,11 @@ bool AMDGPUPassConfig::addPreEmitPass() { if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { addPass(createAMDGPUCFGPreparationPass(*TM)); addPass(createAMDGPUCFGStructurizerPass(*TM)); + addPass(createR600EmitClauseMarkers(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); addPass(&FinalizeMachineBundlesID); + addPass(createR600Packetizer(*TM)); + addPass(createR600ControlFlowFinalizer(*TM)); } else { addPass(createSILowerControlFlowPass(*TM)); } diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td index c12cedc..e221110 100644 --- a/lib/Target/R600/AMDILBase.td +++ b/lib/Target/R600/AMDILBase.td @@ -74,6 +74,10 @@ def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", "false", "Older version of ALU instructions encoding.">; +def FeatureVertexCache : SubtargetFeature<"HasVertexCache", + "HasVertexCache", + "true", + "Specify use of dedicated vertex cache.">; //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp index 9605fbe..1787959 100644 --- a/lib/Target/R600/AMDILDeviceInfo.cpp +++ b/lib/Target/R600/AMDILDeviceInfo.cpp @@ -44,7 +44,7 @@ AMDGPUDevice* getDeviceFromName(const std::string &deviceName, " on 32bit pointers!"); #endif return new AMDGPUEvergreenDevice(ptr); - } else if (deviceName == "redwood") { + } else if (deviceName == "redwood" || deviceName == "sumo") { #if DEBUG assert(!is64bit && "This device does not support 64bit pointers!"); assert(!is64on32bit && "This device does not support 64bit" @@ -79,7 +79,9 @@ AMDGPUDevice* getDeviceFromName(const std::string &deviceName, " on 32bit pointers!"); #endif return new AMDGPUNIDevice(ptr); - } else if (deviceName == "SI") { + } else if (deviceName == "SI" || + deviceName == "tahiti" || deviceName == "pitcairn" || + deviceName == "verde" || deviceName == "oland") { return new AMDGPUSIDevice(ptr); } else { #if DEBUG diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp 
b/lib/Target/R600/AMDILISelDAGToDAG.cpp index fa8f62d..ba75a44 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -191,6 +191,29 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs, 2 * N->getNumOperands() + 1); } + case ISD::BUILD_PAIR: { + SDValue RC, SubReg0, SubReg1; + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + break; + } + if (N->getValueType(0) == MVT::i128) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32); + } else if (N->getValueType(0) == MVT::i64) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32); + } else { + llvm_unreachable("Unhandled value type for BUILD_PAIR"); + } + const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, + N->getOperand(1), SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + N->getDebugLoc(), N->getValueType(0), Ops); + } + case ISD::ConstantFP: case ISD::Constant: { const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 63c59e1..2ad2047 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -27,6 +27,7 @@ add_llvm_target(R600CodeGen AMDGPUFrameLowering.cpp AMDGPUIndirectAddressing.cpp AMDGPUMCInstLower.cpp + AMDGPUMachineFunction.cpp AMDGPUSubtarget.cpp AMDGPUStructurizeCFG.cpp AMDGPUTargetMachine.cpp @@ -34,11 +35,14 @@ add_llvm_target(R600CodeGen AMDGPUConvertToISA.cpp AMDGPUInstrInfo.cpp AMDGPURegisterInfo.cpp + R600ControlFlowFinalizer.cpp + R600EmitClauseMarkers.cpp R600ExpandSpecialInstrs.cpp R600InstrInfo.cpp R600ISelLowering.cpp R600MachineFunctionInfo.cpp R600MachineScheduler.cpp + R600Packetizer.cpp R600RegisterInfo.cpp SIAnnotateControlFlow.cpp SIInsertWaits.cpp diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp index 98fca43..a3397f3 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -44,7 +44,6 @@ public: AMDGPUAsmBackend(const Target &T) : MCAsmBackend() {} - virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const; virtual unsigned getNumFixupKinds() const { return 0; }; virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const; @@ -71,16 +70,6 @@ void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm, } } -MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT, - StringRef CPU) { - return new AMDGPUAsmBackend(T); -} - -AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter( - raw_ostream &OS) const { - return new AMDGPUMCObjectWriter(OS); -} - void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const { @@ -88,3 +77,21 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, assert(Fixup.getKind() == FK_PCRel_4); *Dst = (Value - 4) / 4; } + +//===----------------------------------------------------------------------===// +// ELFAMDGPUAsmBackend class 
+//===----------------------------------------------------------------------===// + +class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { +public: + ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createAMDGPUELFObjectWriter(OS); + } +}; + +MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT, + StringRef CPU) { + return new ELFAMDGPUAsmBackend(T); +} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp new file mode 100644 index 0000000..48fac9f --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -0,0 +1,39 @@ +//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" + +using namespace llvm; + +namespace { + +class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { +public: + AMDGPUELFObjectWriter(); +protected: + virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel, bool IsRelocWithSymbol, + int64_t Addend) const { + llvm_unreachable("Not implemented"); + } + +}; + + +} // End anonymous namespace + +AMDGPUELFObjectWriter::AMDGPUELFObjectWriter() + : MCELFObjectTargetWriter(false, 0, 0, false) { } + +MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_ostream &OS) { + MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(); + return createELFObjectWriter(MOTW, OS, true); +} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 4d3d3e7..2aae26a 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -68,10 +68,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() { //===--- Dwarf Emission Directives -----------------------------------===// HasLEB128 = true; SupportsDebugInformation = true; - ExceptionsType = ExceptionHandling::None; - DwarfUsesInlineInfoSection = false; - DwarfSectionOffsetDirective = ".offset"; - } const char* diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 072ee49..45d009c 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -88,7 +88,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, MCCodeEmitter *_Emitter, bool RelaxAll, bool NoExecStack) { - return createPureStreamer(Ctx, MAB, _OS, _Emitter); + return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false); } extern "C" void LLVMInitializeR600TargetMC() { diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h index 363a4af..09d0d5b 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -23,9 +23,11 @@ class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; +class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; class Target; +class raw_ostream; extern Target TheAMDGPUTarget; @@ -41,6 +43,8 @@ MCCodeEmitter *createSIMCCodeEmitter(const 
MCInstrInfo &MCII, MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT, StringRef CPU); + +MCObjectWriter *createAMDGPUELFObjectWriter(raw_ostream &OS); } // End llvm namespace #define GET_REGINFO_ENUM diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt index 37e714c..3ccdf42 100644 --- a/lib/Target/R600/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/R600/MCTargetDesc/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMR600Desc AMDGPUAsmBackend.cpp + AMDGPUELFObjectWriter.cpp AMDGPUMCTargetDesc.cpp AMDGPUMCAsmInfo.cpp R600MCCodeEmitter.cpp diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index d207160..7c83d86 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -66,8 +66,6 @@ private: void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx, raw_ostream &OS) const; void EmitDst(const MCInst &MI, raw_ostream &OS) const; - void EmitTexInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, - raw_ostream &OS) const; void EmitFCInstr(const MCInst &MI, raw_ostream &OS) const; void EmitNullBytes(unsigned int byteCount, raw_ostream &OS) const; @@ -103,7 +101,8 @@ enum InstrTypes { INSTR_FC, INSTR_NATIVE, INSTR_VTX, - INSTR_EXPORT + INSTR_EXPORT, + INSTR_CFALU }; enum FCInstr { @@ -140,11 +139,11 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups) const { - if (isTexOp(MI.getOpcode())) { - EmitTexInstr(MI, Fixups, OS); - } else if (isFCOp(MI.getOpcode())){ + if (isFCOp(MI.getOpcode())){ EmitFCInstr(MI, OS); } else if (MI.getOpcode() == AMDGPU::RETURN || + MI.getOpcode() == AMDGPU::FETCH_CLAUSE || + MI.getOpcode() == AMDGPU::ALU_CLAUSE || MI.getOpcode() == AMDGPU::BUNDLE || MI.getOpcode() == AMDGPU::KILL) { return; @@ -169,24 +168,136 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, case AMDGPU::TEX_VTX_TEXBUF : { uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset + InstWord2 |= 1 << 19; - EmitByte(INSTR_VTX, OS); + EmitByte(INSTR_NATIVE, OS); Emit(InstWord01, OS); + EmitByte(INSTR_NATIVE, OS); Emit(InstWord2, OS); + Emit((u_int32_t) 0, OS); + break; + } + case AMDGPU::TEX_LD: + case AMDGPU::TEX_GET_TEXTURE_RESINFO: + case AMDGPU::TEX_SAMPLE: + case AMDGPU::TEX_SAMPLE_C: + case AMDGPU::TEX_SAMPLE_L: + case AMDGPU::TEX_SAMPLE_C_L: + case AMDGPU::TEX_SAMPLE_LB: + case AMDGPU::TEX_SAMPLE_C_LB: + case AMDGPU::TEX_SAMPLE_G: + case AMDGPU::TEX_SAMPLE_C_G: + case AMDGPU::TEX_GET_GRADIENTS_H: + case AMDGPU::TEX_GET_GRADIENTS_V: + case AMDGPU::TEX_SET_GRADIENTS_H: + case AMDGPU::TEX_SET_GRADIENTS_V: { + unsigned Opcode = MI.getOpcode(); + bool HasOffsets = (Opcode == AMDGPU::TEX_LD); + unsigned OpOffset = HasOffsets ? 
3 : 0; + int64_t Sampler = MI.getOperand(OpOffset + 3).getImm(); + int64_t TextureType = MI.getOperand(OpOffset + 4).getImm(); + + uint32_t SrcSelect[4] = {0, 1, 2, 3}; + uint32_t Offsets[3] = {0, 0, 0}; + uint64_t CoordType[4] = {1, 1, 1, 1}; + + if (HasOffsets) + for (unsigned i = 0; i < 3; i++) { + int SignedOffset = MI.getOperand(i + 2).getImm(); + Offsets[i] = (SignedOffset & 0x1F); + } + + + if (TextureType == TEXTURE_RECT || + TextureType == TEXTURE_SHADOWRECT) { + CoordType[ELEMENT_X] = 0; + CoordType[ELEMENT_Y] = 0; + } + + if (TextureType == TEXTURE_1D_ARRAY || + TextureType == TEXTURE_SHADOW1D_ARRAY) { + if (Opcode == AMDGPU::TEX_SAMPLE_C_L || + Opcode == AMDGPU::TEX_SAMPLE_C_LB) { + CoordType[ELEMENT_Y] = 0; + } else { + CoordType[ELEMENT_Z] = 0; + SrcSelect[ELEMENT_Z] = ELEMENT_Y; + } + } else if (TextureType == TEXTURE_2D_ARRAY || + TextureType == TEXTURE_SHADOW2D_ARRAY) { + CoordType[ELEMENT_Z] = 0; + } + + + if ((TextureType == TEXTURE_SHADOW1D || + TextureType == TEXTURE_SHADOW2D || + TextureType == TEXTURE_SHADOWRECT || + TextureType == TEXTURE_SHADOW1D_ARRAY) && + Opcode != AMDGPU::TEX_SAMPLE_C_L && + Opcode != AMDGPU::TEX_SAMPLE_C_LB) { + SrcSelect[ELEMENT_W] = ELEMENT_Z; + } + + uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups) | + CoordType[ELEMENT_X] << 60 | CoordType[ELEMENT_Y] << 61 | + CoordType[ELEMENT_Z] << 62 | CoordType[ELEMENT_W] << 63; + uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 | + SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 | + SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 | + Offsets[2] << 10; + + EmitByte(INSTR_NATIVE, OS); + Emit(Word01, OS); + EmitByte(INSTR_NATIVE, OS); + Emit(Word2, OS); + Emit((u_int32_t) 0, OS); break; } + case AMDGPU::CF_ALU: + case AMDGPU::CF_ALU_PUSH_BEFORE: { + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); + EmitByte(INSTR_NATIVE, OS); + Emit(Inst, OS); + break; + } + case AMDGPU::CF_CALL_FS_EG: + case AMDGPU::CF_CALL_FS_R600: + return; + case AMDGPU::CF_TC_EG: + case AMDGPU::CF_VC_EG: + case AMDGPU::CF_TC_R600: + case AMDGPU::CF_VC_R600: + case AMDGPU::WHILE_LOOP_EG: + case AMDGPU::END_LOOP_EG: + case AMDGPU::LOOP_BREAK_EG: + case AMDGPU::CF_CONTINUE_EG: + case AMDGPU::CF_JUMP_EG: + case AMDGPU::CF_ELSE_EG: + case AMDGPU::POP_EG: + case AMDGPU::WHILE_LOOP_R600: + case AMDGPU::END_LOOP_R600: + case AMDGPU::LOOP_BREAK_R600: + case AMDGPU::CF_CONTINUE_R600: + case AMDGPU::CF_JUMP_R600: + case AMDGPU::CF_ELSE_R600: + case AMDGPU::POP_R600: case AMDGPU::EG_ExportSwz: case AMDGPU::R600_ExportSwz: case AMDGPU::EG_ExportBuf: - case AMDGPU::R600_ExportBuf: { + case AMDGPU::R600_ExportBuf: + case AMDGPU::PAD: + case AMDGPU::CF_END_R600: + case AMDGPU::CF_END_EG: + case AMDGPU::CF_END_CM: { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); - EmitByte(INSTR_EXPORT, OS); + EmitByte(INSTR_NATIVE, OS); Emit(Inst, OS); break; } - default: - EmitALUInstr(MI, Fixups, OS); + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); + EmitByte(INSTR_NATIVE, OS); + Emit(Inst, OS); break; } } @@ -320,7 +431,7 @@ void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, } if (Reg == AMDGPU::ALU_LITERAL_X) { - unsigned ImmOpIndex = MI.getNumOperands() - 1; + unsigned ImmOpIndex = MI.getNumOperands() - 2; MCOperand ImmOp = MI.getOperand(ImmOpIndex); if (ImmOp.isFPImm()) { InlineConstant.f = ImmOp.getFPImm(); @@ -334,99 +445,6 @@ void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, Emit(InlineConstant.i, OS); } -void R600MCCodeEmitter::EmitTexInstr(const MCInst &MI, - 
SmallVectorImpl<MCFixup> &Fixups, - raw_ostream &OS) const { - - unsigned Opcode = MI.getOpcode(); - bool hasOffsets = (Opcode == AMDGPU::TEX_LD); - unsigned OpOffset = hasOffsets ? 3 : 0; - int64_t Resource = MI.getOperand(OpOffset + 2).getImm(); - int64_t Sampler = MI.getOperand(OpOffset + 3).getImm(); - int64_t TextureType = MI.getOperand(OpOffset + 4).getImm(); - unsigned srcSelect[4] = {0, 1, 2, 3}; - - // Emit instruction type - EmitByte(1, OS); - - // Emit instruction - EmitByte(getBinaryCodeForInstr(MI, Fixups), OS); - - // Emit resource id - EmitByte(Resource, OS); - - // Emit source register - EmitByte(getHWReg(MI.getOperand(1).getReg()), OS); - - // XXX: Emit src isRelativeAddress - EmitByte(0, OS); - - // Emit destination register - EmitByte(getHWReg(MI.getOperand(0).getReg()), OS); - - // XXX: Emit dst isRealtiveAddress - EmitByte(0, OS); - - // XXX: Emit dst select - EmitByte(0, OS); // X - EmitByte(1, OS); // Y - EmitByte(2, OS); // Z - EmitByte(3, OS); // W - - // XXX: Emit lod bias - EmitByte(0, OS); - - // XXX: Emit coord types - unsigned coordType[4] = {1, 1, 1, 1}; - - if (TextureType == TEXTURE_RECT - || TextureType == TEXTURE_SHADOWRECT) { - coordType[ELEMENT_X] = 0; - coordType[ELEMENT_Y] = 0; - } - - if (TextureType == TEXTURE_1D_ARRAY - || TextureType == TEXTURE_SHADOW1D_ARRAY) { - if (Opcode == AMDGPU::TEX_SAMPLE_C_L || Opcode == AMDGPU::TEX_SAMPLE_C_LB) { - coordType[ELEMENT_Y] = 0; - } else { - coordType[ELEMENT_Z] = 0; - srcSelect[ELEMENT_Z] = ELEMENT_Y; - } - } else if (TextureType == TEXTURE_2D_ARRAY - || TextureType == TEXTURE_SHADOW2D_ARRAY) { - coordType[ELEMENT_Z] = 0; - } - - for (unsigned i = 0; i < 4; i++) { - EmitByte(coordType[i], OS); - } - - // XXX: Emit offsets - if (hasOffsets) - for (unsigned i = 2; i < 5; i++) - EmitByte(MI.getOperand(i).getImm()<<1, OS); - else - EmitNullBytes(3, OS); - - // Emit sampler id - EmitByte(Sampler, OS); - - // XXX:Emit source select - if ((TextureType == TEXTURE_SHADOW1D - || TextureType == TEXTURE_SHADOW2D - || TextureType == TEXTURE_SHADOWRECT - || TextureType == TEXTURE_SHADOW1D_ARRAY) - && Opcode != AMDGPU::TEX_SAMPLE_C_L - && Opcode != AMDGPU::TEX_SAMPLE_C_LB) { - srcSelect[ELEMENT_W] = ELEMENT_Z; - } - - for (unsigned i = 0; i < 4; i++) { - EmitByte(srcSelect[i], OS); - } -} - void R600MCCodeEmitter::EmitFCInstr(const MCInst &MI, raw_ostream &OS) const { // Emit instruction type diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index e27abcc..5af8320 100644 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -39,8 +39,6 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; - const MCSubtargetInfo &STI; - MCContext &Ctx; /// \brief Can this operand also contain immediate values? 
bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; @@ -51,7 +49,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, const MCSubtargetInfo &sti, MCContext &ctx) - : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { } + : MCII(mcii), MRI(mri) { } ~SIMCCodeEmitter() { } diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index 868810c..e024e66 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -13,18 +13,39 @@ class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features> : Processor<Name, itin, Features>; -def : Proc<"", R600_EG_Itin, [FeatureR600ALUInst]>; -def : Proc<"r600", R600_EG_Itin, [FeatureR600ALUInst]>; -def : Proc<"rv710", R600_EG_Itin, []>; -def : Proc<"rv730", R600_EG_Itin, []>; -def : Proc<"rv770", R600_EG_Itin, [FeatureFP64]>; -def : Proc<"cedar", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"redwood", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"juniper", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"cypress", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>; -def : Proc<"barts", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"turks", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"caicos", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"cayman", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>; -def : Proc<"SI", SI_Itin, [Feature64BitPtr]>; - +def : Proc<"", R600_VLIW5_Itin, + [FeatureR600ALUInst, FeatureVertexCache]>; +def : Proc<"r600", R600_VLIW5_Itin, + [FeatureR600ALUInst , FeatureVertexCache]>; +def : Proc<"rs880", R600_VLIW5_Itin, + [FeatureR600ALUInst]>; +def : Proc<"rv670", R600_VLIW5_Itin, + [FeatureR600ALUInst, FeatureFP64, FeatureVertexCache]>; +def : Proc<"rv710", R600_VLIW5_Itin, + [FeatureVertexCache]>; +def : Proc<"rv730", R600_VLIW5_Itin, + [FeatureVertexCache]>; +def : Proc<"rv770", R600_VLIW5_Itin, + [FeatureFP64, FeatureVertexCache]>; +def : Proc<"cedar", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; +def : Proc<"redwood", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; +def : Proc<"sumo", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages]>; +def : Proc<"juniper", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; +def : Proc<"cypress", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureFP64, FeatureVertexCache]>; +def : Proc<"barts", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; +def : Proc<"turks", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; +def : Proc<"caicos", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages]>; +def : Proc<"cayman", R600_VLIW4_Itin, + [FeatureByteAddress, FeatureImages, FeatureFP64]>;def : Proc<"SI", SI_Itin, [Feature64BitPtr, FeatureFP64]>; +def : Proc<"tahiti", SI_Itin, [Feature64BitPtr, FeatureFP64]>; +def : Proc<"pitcairn", SI_Itin, [Feature64BitPtr, FeatureFP64]>; +def : Proc<"verde", SI_Itin, [Feature64BitPtr, FeatureFP64]>; +def : Proc<"oland", SI_Itin, [Feature64BitPtr, FeatureFP64]>; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp new file mode 100644 index 0000000..0995795 --- /dev/null +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -0,0 +1,496 @@ +//===-- R600ControlFlowFinalizer.cpp - Finalize 
Control Flow Inst----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass turns all control flow pseudo instructions into native ones, +/// computing their addresses on the fly; it also sets the STACK_SIZE info. +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "r600cf" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include "AMDGPU.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +namespace llvm { + +class R600ControlFlowFinalizer : public MachineFunctionPass { + +private: + typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile; + + enum ControlFlowInstruction { + CF_TC, + CF_VC, + CF_CALL_FS, + CF_WHILE_LOOP, + CF_END_LOOP, + CF_LOOP_BREAK, + CF_LOOP_CONTINUE, + CF_JUMP, + CF_ELSE, + CF_POP, + CF_END + }; + + static char ID; + const R600InstrInfo *TII; + const R600RegisterInfo &TRI; + unsigned MaxFetchInst; + const AMDGPUSubtarget &ST; + + bool IsTrivialInst(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::KILL: + case AMDGPU::RETURN: + return true; + default: + return false; + } + } + + const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { + unsigned Opcode = 0; + bool isEg = (ST.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX); + switch (CFI) { + case CF_TC: + Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; + break; + case CF_VC: + Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; + break; + case CF_CALL_FS: + Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; + break; + case CF_WHILE_LOOP: + Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; + break; + case CF_END_LOOP: + Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; + break; + case CF_LOOP_BREAK: + Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; + break; + case CF_LOOP_CONTINUE: + Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; + break; + case CF_JUMP: + Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; + break; + case CF_ELSE: + Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; + break; + case CF_POP: + Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; + break; + case CF_END: + if (ST.device()->getDeviceFlag() == OCL_DEVICE_CAYMAN) { + Opcode = AMDGPU::CF_END_CM; + break; + } + Opcode = isEg ?
AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; + break; + } + assert (Opcode && "No opcode selected"); + return TII->get(Opcode); + } + + bool isCompatibleWithClause(const MachineInstr *MI, + std::set<unsigned> &DstRegs, std::set<unsigned> &SrcRegs) const { + unsigned DstMI, SrcMI; + for (MachineInstr::const_mop_iterator I = MI->operands_begin(), + E = MI->operands_end(); I != E; ++I) { + const MachineOperand &MO = *I; + if (!MO.isReg()) + continue; + if (MO.isDef()) + DstMI = MO.getReg(); + if (MO.isUse()) { + unsigned Reg = MO.getReg(); + if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + SrcMI = Reg; + else + SrcMI = TRI.getMatchingSuperReg(Reg, + TRI.getSubRegFromChannel(TRI.getHWRegChan(Reg)), + &AMDGPU::R600_Reg128RegClass); + } + } + if ((DstRegs.find(SrcMI) == DstRegs.end()) && + (SrcRegs.find(DstMI) == SrcRegs.end())) { + SrcRegs.insert(SrcMI); + DstRegs.insert(DstMI); + return true; + } else + return false; + } + + ClauseFile + MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { + MachineBasicBlock::iterator ClauseHead = I; + std::vector<MachineInstr *> ClauseContent; + unsigned AluInstCount = 0; + bool IsTex = TII->usesTextureCache(ClauseHead); + std::set<unsigned> DstRegs, SrcRegs; + for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { + if (IsTrivialInst(I)) + continue; + if (AluInstCount > MaxFetchInst) + break; + if ((IsTex && !TII->usesTextureCache(I)) || + (!IsTex && !TII->usesVertexCache(I))) + break; + if (!isCompatibleWithClause(I, DstRegs, SrcRegs)) + break; + AluInstCount ++; + ClauseContent.push_back(I); + } + MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), + getHWInstrDesc(IsTex?CF_TC:CF_VC)) + .addImm(0) // ADDR + .addImm(AluInstCount - 1); // COUNT + return ClauseFile(MIb, ClauseContent); + } + + void getLiteral(MachineInstr *MI, std::vector<unsigned> &Lits) const { + unsigned LiteralRegs[] = { + AMDGPU::ALU_LITERAL_X, + AMDGPU::ALU_LITERAL_Y, + AMDGPU::ALU_LITERAL_Z, + AMDGPU::ALU_LITERAL_W + }; + for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + if (MO.getReg() != AMDGPU::ALU_LITERAL_X) + continue; + unsigned ImmIdx = TII->getOperandIdx(MI->getOpcode(), R600Operands::IMM); + int64_t Imm = MI->getOperand(ImmIdx).getImm(); + std::vector<unsigned>::iterator It = + std::find(Lits.begin(), Lits.end(), Imm); + if (It != Lits.end()) { + unsigned Index = It - Lits.begin(); + MO.setReg(LiteralRegs[Index]); + } else { + assert(Lits.size() < 4 && "Too many literals in Instruction Group"); + MO.setReg(LiteralRegs[Lits.size()]); + Lits.push_back(Imm); + } + } + } + + MachineBasicBlock::iterator insertLiterals( + MachineBasicBlock::iterator InsertPos, + const std::vector<unsigned> &Literals) const { + MachineBasicBlock *MBB = InsertPos->getParent(); + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned LiteralPair0 = Literals[i]; + unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; + InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(LiteralPair0) + .addImm(LiteralPair1); + } + return InsertPos; + } + + ClauseFile + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { + MachineBasicBlock::iterator ClauseHead = I; + std::vector<MachineInstr *> ClauseContent; + I++; + for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { + if (IsTrivialInst(I)) { + ++I; + continue; + } + if (!I->isBundle() && !TII->isALUInstr(I->getOpcode())) + 
break; + std::vector<unsigned> Literals; + if (I->isBundle()) { + MachineInstr *DeleteMI = I; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + while (++BI != E && BI->isBundledWithPred()) { + BI->unbundleFromPred(); + for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = BI->getOperand(i); + if (MO.isReg() && MO.isInternalRead()) + MO.setIsInternalRead(false); + } + getLiteral(BI, Literals); + ClauseContent.push_back(BI); + } + I = BI; + DeleteMI->eraseFromParent(); + } else { + getLiteral(I, Literals); + ClauseContent.push_back(I); + I++; + } + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned literal0 = Literals[i]; + unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; + MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(literal0) + .addImm(literal2); + ClauseContent.push_back(MILit); + } + } + ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); + return ClauseFile(ClauseHead, ClauseContent); + } + + void + EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += 2 * Clause.second.size(); + } + + void + EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += Clause.second.size(); + } + + void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { + MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); + } + void CounterPropagateAddr(std::set<MachineInstr *> MIs, unsigned Addr) + const { + for (std::set<MachineInstr *>::iterator It = MIs.begin(), E = MIs.end(); + It != E; ++It) { + MachineInstr *MI = *It; + CounterPropagateAddr(MI, Addr); + } + } + + unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const { + switch (ST.device()->getGeneration()) { + case AMDGPUDeviceInfo::HD4XXX: + if (hasPush) + StackSubEntry += 2; + break; + case AMDGPUDeviceInfo::HD5XXX: + if (hasPush) + StackSubEntry ++; + case AMDGPUDeviceInfo::HD6XXX: + StackSubEntry += 2; + break; + } + return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4 + } + +public: + R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), + TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())), + TRI(TII->getRegisterInfo()), + ST(tm.getSubtarget<AMDGPUSubtarget>()) { + const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX) + MaxFetchInst = 8; + else + MaxFetchInst = 16; + } + + virtual bool runOnMachineFunction(MachineFunction &MF) { + unsigned MaxStack = 0; + unsigned CurrentStack = 0; + bool hasPush; + for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; + ++MB) { + MachineBasicBlock &MBB = *MB; + unsigned CfCount = 0; + std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; + std::vector<MachineInstr * > IfThenElseStack; + R600MachineFunctionInfo *MFI = 
MF.getInfo<R600MachineFunctionInfo>(); + if (MFI->ShaderType == 1) { + BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), + getHWInstrDesc(CF_CALL_FS)); + CfCount++; + } + std::vector<ClauseFile> FetchClauses, AluClauses; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E;) { + if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { + DEBUG(dbgs() << CfCount << ":"; I->dump();); + FetchClauses.push_back(MakeFetchClause(MBB, I)); + CfCount++; + continue; + } + + MachineBasicBlock::iterator MI = I; + I++; + switch (MI->getOpcode()) { + case AMDGPU::CF_ALU_PUSH_BEFORE: + CurrentStack++; + MaxStack = std::max(MaxStack, CurrentStack); + hasPush = true; + case AMDGPU::CF_ALU: + I = MI; + AluClauses.push_back(MakeALUClause(MBB, I)); + case AMDGPU::EG_ExportBuf: + case AMDGPU::EG_ExportSwz: + case AMDGPU::R600_ExportBuf: + case AMDGPU::R600_ExportSwz: + case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: + DEBUG(dbgs() << CfCount << ":"; MI->dump();); + CfCount++; + break; + case AMDGPU::WHILELOOP: { + CurrentStack+=4; + MaxStack = std::max(MaxStack, CurrentStack); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_WHILE_LOOP)) + .addImm(1); + std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, + std::set<MachineInstr *>()); + Pair.second.insert(MIb); + LoopStack.push_back(Pair); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ENDLOOP: { + CurrentStack-=4; + std::pair<unsigned, std::set<MachineInstr *> > Pair = + LoopStack.back(); + LoopStack.pop_back(); + CounterPropagateAddr(Pair.second, CfCount); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) + .addImm(Pair.first + 1); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::IF_PREDICATE_SET: { + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_JUMP)) + .addImm(0) + .addImm(0); + IfThenElseStack.push_back(MIb); + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ELSE: { + MachineInstr * JumpInst = IfThenElseStack.back(); + IfThenElseStack.pop_back(); + CounterPropagateAddr(JumpInst, CfCount); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_ELSE)) + .addImm(0) + .addImm(1); + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + IfThenElseStack.push_back(MIb); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ENDIF: { + CurrentStack--; + MachineInstr *IfOrElseInst = IfThenElseStack.back(); + IfThenElseStack.pop_back(); + CounterPropagateAddr(IfOrElseInst, CfCount + 1); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_POP)) + .addImm(CfCount + 1) + .addImm(1); + (void)MIb; + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::PREDICATED_BREAK: { + CurrentStack--; + CfCount += 3; + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP)) + .addImm(CfCount) + .addImm(1); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_LOOP_BREAK)) + .addImm(0); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_POP)) + .addImm(CfCount) + .addImm(1); + LoopStack.back().second.insert(MIb); + MI->eraseFromParent(); + break; + } + case AMDGPU::CONTINUE: { + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_LOOP_CONTINUE)) + .addImm(0); + LoopStack.back().second.insert(MIb); + MI->eraseFromParent(); + CfCount++; + 
break; + } + case AMDGPU::RETURN: { + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); + CfCount++; + MI->eraseFromParent(); + if (CfCount % 2) { + BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); + CfCount++; + } + for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) + EmitFetchClause(I, FetchClauses[i], CfCount); + for (unsigned i = 0, e = AluClauses.size(); i < e; i++) + EmitALUClause(I, AluClauses[i], CfCount); + } + default: + break; + } + } + MFI->StackSize = getHWStackSize(MaxStack, hasPush); + } + + return false; + } + + const char *getPassName() const { + return "R600 Control Flow Finalizer Pass"; + } +}; + +char R600ControlFlowFinalizer::ID = 0; + +} + + +llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { + return new R600ControlFlowFinalizer(TM); +} diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index 16cfcf5..303ca73 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -39,7 +39,9 @@ namespace R600_InstFlag { //FlagOperand bits 7, 8 NATIVE_OPERANDS = (1 << 9), OP1 = (1 << 10), - OP2 = (1 << 11) + OP2 = (1 << 11), + VTX_INST = (1 << 12), + TEX_INST = (1 << 13) }; } @@ -78,6 +80,7 @@ namespace R600Operands { LAST, PRED_SEL, IMM, + BANK_SWIZZLE, COUNT }; @@ -85,11 +88,11 @@ namespace R600Operands { // W C S S S S S S S S S S S // R O D L S R R R R S R R R R S R R R L P // D U I M R A R C C C C R C C C C R C C C A R I -// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M -// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M - {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12}, - {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19}, - {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17} +// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M B +// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M S + {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12,13}, + {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19,20}, + {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17,18} }; } diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp new file mode 100644 index 0000000..3fdc678 --- /dev/null +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -0,0 +1,255 @@ +//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Add CF_ALU. R600 ALU instructions are grouped into clauses, each of which +/// can hold up to 128 ALU instructions; these instructions can access up to +/// 4 prefetched lines of 16 registers from constant buffers. Such ALU clauses +/// are initiated by CF_ALU instructions.
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +namespace llvm { + +class R600EmitClauseMarkersPass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + unsigned OccupiedDwords(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT4_eg_pseudo: + case AMDGPU::DOT4_r600_pseudo: + return 4; + case AMDGPU::KILL: + return 0; + default: + break; + } + + if (TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode())) + return 4; + + unsigned NumLiteral = 0; + for (MachineInstr::mop_iterator It = MI->operands_begin(), + E = MI->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + ++NumLiteral; + } + return 1 + NumLiteral; + } + + bool isALU(const MachineInstr *MI) const { + if (TII->isALUInstr(MI->getOpcode())) + return true; + if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) + return true; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::COPY: + case AMDGPU::DOT4_eg_pseudo: + case AMDGPU::DOT4_r600_pseudo: + return true; + default: + return false; + } + } + + bool IsTrivialInst(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::KILL: + case AMDGPU::RETURN: + return true; + default: + return false; + } + } + + // Pairs of (source operand index, const Sel value) + std::vector<std::pair<unsigned, unsigned> > ExtractConstRead(MachineInstr *MI) + const { + const R600Operands::Ops OpTable[3][2] = { + {R600Operands::SRC0, R600Operands::SRC0_SEL}, + {R600Operands::SRC1, R600Operands::SRC1_SEL}, + {R600Operands::SRC2, R600Operands::SRC2_SEL}, + }; + std::vector<std::pair<unsigned, unsigned> > Result; + + if (!TII->isALUInstr(MI->getOpcode())) + return Result; + for (unsigned j = 0; j < 3; j++) { + int SrcIdx = TII->getOperandIdx(MI->getOpcode(), OpTable[j][0]); + if (SrcIdx < 0) + break; + if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) { + unsigned Const = MI->getOperand( + TII->getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); + Result.push_back(std::pair<unsigned, unsigned>(SrcIdx, Const)); + } + } + return Result; + } + + std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const { + // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2 + // (See also R600ISelLowering.cpp) + // ConstIndex value is in [0, 4095]; + return std::pair<unsigned, unsigned>( + ((Sel >> 2) - 512) >> 12, // KC_BANK + // Line number of ConstIndex: + // a line contains 16 constant registers; however, a KC bank can lock + // two lines at a time, so we want an even line number. The line number + // could be retrieved with (>>4); using ((>>5) << 1) instead always + // yields an even one.
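+ // Worked example (illustrative numbers): kc_bank = 1, ConstIndex = 40
+ // gives Sel = (512 + (1 << 12) + 40) << 2 = 18592; then
+ // ((Sel >> 2) - 512) >> 12 = 1 selects bank 1, and
+ // ((((Sel >> 2) - 512) & 4095) >> 5) << 1 = 2 is the even line that
+ // covers constants 32..63, i.e. the two lockable lines of 16.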
+ ((((Sel >> 2) - 512) & 4095) >> 5) << 1); + } + + bool SubstituteKCacheBank(MachineInstr *MI, + std::vector<std::pair<unsigned, unsigned> > &CachedConsts) const { + std::vector<std::pair<unsigned, unsigned> > UsedKCache; + std::vector<std::pair<unsigned, unsigned> > Consts = ExtractConstRead(MI); + assert(TII->isALUInstr(MI->getOpcode()) && "Can't assign Const"); + for (unsigned i = 0, n = Consts.size(); i < n; ++i) { + unsigned Sel = Consts[i].second; + unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; + unsigned KCacheIndex = Index * 4 + Chan; + const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel); + if (CachedConsts.empty()) { + CachedConsts.push_back(BankLine); + UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); + continue; + } + if (CachedConsts[0] == BankLine) { + UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); + continue; + } + if (CachedConsts.size() == 1) { + CachedConsts.push_back(BankLine); + UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); + continue; + } + if (CachedConsts[1] == BankLine) { + UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); + continue; + } + return false; + } + + for (unsigned i = 0, n = Consts.size(); i < n; ++i) { + switch(UsedKCache[i].first) { + case 0: + MI->getOperand(Consts[i].first).setReg( + AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[i].second)); + break; + case 1: + MI->getOperand(Consts[i].first).setReg( + AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[i].second)); + break; + default: + llvm_unreachable("Wrong Cache Line"); + } + } + return true; + } + + MachineBasicBlock::iterator + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { + MachineBasicBlock::iterator ClauseHead = I; + std::vector<std::pair<unsigned, unsigned> > KCacheBanks; + bool PushBeforeModifier = false; + unsigned AluInstCount = 0; + for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { + if (IsTrivialInst(I)) + continue; + if (!isALU(I)) + break; + if (AluInstCount > TII->getMaxAlusPerClause()) + break; + if (I->getOpcode() == AMDGPU::PRED_X) { + if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) + PushBeforeModifier = true; + AluInstCount ++; + continue; + } + if (I->getOpcode() == AMDGPU::KILLGT) { + I++; + break; + } + if (TII->isALUInstr(I->getOpcode()) && + !SubstituteKCacheBank(I, KCacheBanks)) + break; + AluInstCount += OccupiedDwords(I); + } + unsigned Opcode = PushBeforeModifier ? 
+ AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; + BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) + .addImm(0) // ADDR + .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 + .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1 + .addImm(KCacheBanks.empty()?0:2) // KM0 + .addImm((KCacheBanks.size() < 2)?0:2) // KM1 + .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0 + .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1 + .addImm(AluInstCount); // COUNT + return I; + } + +public: + R600EmitClauseMarkersPass(TargetMachine &tm) : MachineFunctionPass(ID), + TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { } + + virtual bool runOnMachineFunction(MachineFunction &MF) { + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(); + if (I->getOpcode() == AMDGPU::CF_ALU) + continue; // BB was already parsed + for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { + if (isALU(I)) + I = MakeALUClause(MBB, I); + else + ++I; + } + } + return false; + } + + const char *getPassName() const { + return "R600 Emit Clause Markers Pass"; + } +}; + +char R600EmitClauseMarkersPass::ID = 0; + +} + + +llvm::FunctionPass *llvm::createR600EmitClauseMarkers(TargetMachine &TM) { + return new R600EmitClauseMarkersPass(TM); +} + diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index a73691d..a66baca 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -28,7 +28,6 @@ using namespace llvm; R600TargetLowering::R600TargetLowering(TargetMachine &TM) : AMDGPUTargetLowering(TM), TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) { - setOperationAction(ISD::MUL, MVT::i64, Expand); addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); @@ -58,7 +57,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); - setOperationAction(ISD::FPOW, MVT::f32, Custom); setOperationAction(ISD::ROTL, MVT::i32, Custom); @@ -95,6 +93,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::SELECT_CC); setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); setSchedulingPreference(Sched::VLIW); } @@ -316,7 +315,6 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); - case ISD::FPOW: return LowerFPOW(Op, DAG); case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); @@ -918,15 +916,6 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const return DAG.getMergeValues(Ops, 2, DL); } -SDValue R600TargetLowering::LowerFPOW(SDValue Op, - SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0)); - SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase); - return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase); -} 
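The LowerFPOW removed above expanded pow through the exp2/log2 identity; the same expansion reappears later in this patch as TableGen POW_Common patterns (LOG_IEEE, MUL, EXP_IEEE). A minimal scalar sketch of that identity, illustrative only and defined for x > 0:

#include <cmath>

// pow(x, y) == exp2(y * log2(x)) -- the FLOG2/FMUL/FEXP2 chain deleted above.
static float PowViaExp2Log2(float x, float y) {
  return std::exp2(y * std::log2(x));
}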
- /// XXX Only kernel functions are supported, so we can assume for now that /// every function is a kernel function, but in the future we should use /// separate calling conventions for kernel and non-kernel functions. diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index 5cb4b91..2c09acb 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -59,7 +59,6 @@ private: SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 0865098..8fd8385 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "R600InstrInfo.h" +#include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "R600Defines.h" @@ -29,7 +30,8 @@ using namespace llvm; R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm) : AMDGPUInstrInfo(tm), - RI(tm, *this) + RI(tm, *this), + ST(tm.getSubtarget<AMDGPUSubtarget>()) { } const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { @@ -139,6 +141,34 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const { (TargetFlags & R600_InstFlag::OP3)); } +bool R600InstrInfo::isTransOnly(unsigned Opcode) const { + return (get(Opcode).TSFlags & R600_InstFlag::TRANS_ONLY); +} + +bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const { + return isTransOnly(MI->getOpcode()); +} + +bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { + return ST.hasVertexCache() && get(Opcode).TSFlags & R600_InstFlag::VTX_INST; +} + +bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { + const R600MachineFunctionInfo *MFI = MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>(); + return MFI->ShaderType != ShaderType::COMPUTE && usesVertexCache(MI->getOpcode()); +} + +bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { + return (!ST.hasVertexCache() && get(Opcode).TSFlags & R600_InstFlag::VTX_INST) || + (get(Opcode).TSFlags & R600_InstFlag::TEX_INST); +} + +bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { + const R600MachineFunctionInfo *MFI = MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>(); + return (MFI->ShaderType == ShaderType::COMPUTE && usesVertexCache(MI->getOpcode())) || + usesTextureCache(MI->getOpcode()); +} + bool R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts) const { @@ -183,10 +213,19 @@ R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const { int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); if (SrcIdx < 0) break; - if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) { + unsigned Reg = MI->getOperand(SrcIdx).getReg(); + if (Reg == AMDGPU::ALU_CONST) { unsigned Const = MI->getOperand( getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); Consts.push_back(Const); + continue; + } + if (AMDGPU::R600_KC0RegClass.contains(Reg) || + AMDGPU::R600_KC1RegClass.contains(Reg)) { + unsigned Index = RI.getEncodingValue(Reg) & 0xff; + unsigned Chan = RI.getHWRegChan(Reg); + Consts.push_back((Index << 2) | Chan); + continue; } } } @@ -645,6 +684,9 
@@ const TargetRegisterClass *R600InstrInfo::getSuperIndirectRegClass() const { return &AMDGPU::IndirectRegRegClass; } +unsigned R600InstrInfo::getMaxAlusPerClause() const { + return 115; +} MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -681,7 +723,8 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB //scheduling to the backend, we can change the default to 0. MIB.addImm(1) // $last .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel - .addImm(0); // $literal + .addImm(0) // $literal + .addImm(0); // $bank_swizzle return MIB; } diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index bf9569e..babe4b8 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -33,6 +33,7 @@ namespace llvm { class R600InstrInfo : public AMDGPUInstrInfo { private: const R600RegisterInfo RI; + const AMDGPUSubtarget &ST; int getBranchInstr(const MachineOperand &op) const; @@ -53,6 +54,14 @@ namespace llvm { /// \returns true if this \p Opcode represents an ALU instruction. bool isALUInstr(unsigned Opcode) const; + bool isTransOnly(unsigned Opcode) const; + bool isTransOnly(const MachineInstr *MI) const; + + bool usesVertexCache(unsigned Opcode) const; + bool usesVertexCache(const MachineInstr *MI) const; + bool usesTextureCache(unsigned Opcode) const; + bool usesTextureCache(const MachineInstr *MI) const; + bool fitsConstReadLimitations(const std::vector<unsigned>&) const; bool canBundle(const std::vector<MachineInstr *> &) const; @@ -145,6 +154,7 @@ namespace llvm { virtual const TargetRegisterClass *getSuperIndirectRegClass() const; + unsigned getMaxAlusPerClause() const; ///buildDefaultInstruction - This function returns a MachineInstr with /// all the instruction modifiers initialized to their default values. 
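The usesVertexCache/usesTextureCache pair declared here (implemented in R600InstrInfo.cpp above) encodes a routing rule: a VTX fetch goes through the vertex cache only when the subtarget has one and the shader is not a compute shader; otherwise it is issued through the texture cache, which also serves every TEX instruction. A condensed sketch of that predicate pair (function names are illustrative, not part of the patch):

// Mirrors R600InstrInfo::usesVertexCache / usesTextureCache from this patch.
static bool FetchesViaVertexCache(bool HasVCache, bool IsCompute, bool IsVTX) {
  return HasVCache && IsVTX && !IsCompute;
}
static bool FetchesViaTextureCache(bool HasVCache, bool IsCompute, bool IsVTX,
                                   bool IsTEX) {
  return IsTEX || (IsVTX && (!HasVCache || IsCompute));
}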
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 8c50d54..1060b0a 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -13,11 +13,12 @@ include "R600Intrinsics.td" -class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern, +class InstR600 <dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin> : AMDGPUInst <outs, ins, asm, pattern> { field bits<64> Inst; + bit TransOnly = 0; bit Trig = 0; bit Op3 = 0; bit isVector = 0; @@ -25,9 +26,9 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern, bit Op1 = 0; bit Op2 = 0; bit HasNativeOperands = 0; + bit VTXInst = 0; + bit TEXInst = 0; - bits<11> op_code = inst; - //let Inst = inst; let Namespace = "AMDGPU"; let OutOperandList = outs; let InOperandList = ins; @@ -35,6 +36,7 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern, let Pattern = pattern; let Itinerary = itin; + let TSFlags{0} = TransOnly; let TSFlags{4} = Trig; let TSFlags{5} = Op3; @@ -45,11 +47,12 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern, let TSFlags{9} = HasNativeOperands; let TSFlags{10} = Op1; let TSFlags{11} = Op2; + let TSFlags{12} = VTXInst; + let TSFlags{13} = TEXInst; } class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> : - AMDGPUInst <outs, ins, asm, pattern> { - field bits<64> Inst; + InstR600 <outs, ins, asm, pattern, NullALU> { let Namespace = "AMDGPU"; } @@ -74,6 +77,9 @@ class InstFlag<string PM = "printOperand", int Default = 0> def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> { let PrintMethod = "printSel"; } +def BANK_SWIZZLE : OperandWithDefaultOps <i32, (ops (i32 0))> { + let PrintMethod = "printSel"; +} def LITERAL : InstFlag<"printLiteral">; @@ -137,7 +143,7 @@ class R600ALU_Word1 { field bits<32> Word1; bits<11> dst; - bits<3> bank_swizzle = 0; + bits<3> bank_swizzle; bits<1> dst_rel; bits<1> clamp; @@ -234,6 +240,80 @@ class VTX_WORD1_GPR { let Word1{31} = SRF_MODE_ALL; } +class TEX_WORD0 { + field bits<32> Word0; + + bits<5> TEX_INST; + bits<2> INST_MOD; + bits<1> FETCH_WHOLE_QUAD; + bits<8> RESOURCE_ID; + bits<7> SRC_GPR; + bits<1> SRC_REL; + bits<1> ALT_CONST; + bits<2> RESOURCE_INDEX_MODE; + bits<2> SAMPLER_INDEX_MODE; + + let Word0{4-0} = TEX_INST; + let Word0{6-5} = INST_MOD; + let Word0{7} = FETCH_WHOLE_QUAD; + let Word0{15-8} = RESOURCE_ID; + let Word0{22-16} = SRC_GPR; + let Word0{23} = SRC_REL; + let Word0{24} = ALT_CONST; + let Word0{26-25} = RESOURCE_INDEX_MODE; + let Word0{28-27} = SAMPLER_INDEX_MODE; +} + +class TEX_WORD1 { + field bits<32> Word1; + + bits<7> DST_GPR; + bits<1> DST_REL; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + bits<3> DST_SEL_W; + bits<7> LOD_BIAS; + bits<1> COORD_TYPE_X; + bits<1> COORD_TYPE_Y; + bits<1> COORD_TYPE_Z; + bits<1> COORD_TYPE_W; + + let Word1{6-0} = DST_GPR; + let Word1{7} = DST_REL; + let Word1{11-9} = DST_SEL_X; + let Word1{14-12} = DST_SEL_Y; + let Word1{17-15} = DST_SEL_Z; + let Word1{20-18} = DST_SEL_W; + let Word1{27-21} = LOD_BIAS; + let Word1{28} = COORD_TYPE_X; + let Word1{29} = COORD_TYPE_Y; + let Word1{30} = COORD_TYPE_Z; + let Word1{31} = COORD_TYPE_W; +} + +class TEX_WORD2 { + field bits<32> Word2; + + bits<5> OFFSET_X; + bits<5> OFFSET_Y; + bits<5> OFFSET_Z; + bits<5> SAMPLER_ID; + bits<3> SRC_SEL_X; + bits<3> SRC_SEL_Y; + bits<3> SRC_SEL_Z; + bits<3> SRC_SEL_W; + + let Word2{4-0} = OFFSET_X; + let Word2{9-5} = OFFSET_Y; + let 
Word2{14-10} = OFFSET_Z; + let Word2{19-15} = SAMPLER_ID; + let Word2{22-20} = SRC_SEL_X; + let Word2{25-23} = SRC_SEL_Y; + let Word2{28-26} = SRC_SEL_Z; + let Word2{31-29} = SRC_SEL_W; +} + /* XXX: R600 subtarget uses a slightly different encoding than the other subtargets. We currently handle this in R600MCCodeEmitter, but we may @@ -272,14 +352,14 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { // and R600InstrInfo::getOperandIdx(). class R600_1OP <bits<11> inst, string opName, list<dag> pattern, InstrItinClass itin = AnyALU> : - InstR600 <0, - (outs R600_Reg32:$dst), + InstR600 <(outs R600_Reg32:$dst), (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), - !strconcat(opName, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, "$clamp $dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, " + "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " "$literal $pred_sel$last"), pattern, itin>, @@ -311,17 +391,17 @@ class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node, // R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx(). class R600_2OP <bits<11> inst, string opName, list<dag> pattern, InstrItinClass itin = AnyALU> : - InstR600 <inst, - (outs R600_Reg32:$dst), + InstR600 <(outs R600_Reg32:$dst), (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), - !strconcat(opName, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, " - "$src1_neg$src1_abs$src1$src1_sel$src1_abs$src1_rel, " + "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " + "$src1_neg$src1_abs$src1$src1_abs$src1_rel, " "$literal $pred_sel$last"), pattern, itin>, @@ -349,17 +429,17 @@ class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node, // R600InstrInfo::getOperandIdx(). 
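As a cross-check on the TEX_WORD0 layout introduced above, hand-packing the same bit ranges into a 32-bit word looks like this (a sketch, not part of the patch; TableGen derives the identical encoding from the let Word0{...} assignments):

#include <cstdint>

// Packs TEX_WORD0: {4-0}=TEX_INST, {6-5}=INST_MOD, {7}=FETCH_WHOLE_QUAD,
// {15-8}=RESOURCE_ID, {22-16}=SRC_GPR, {23}=SRC_REL, {24}=ALT_CONST,
// {26-25}=RESOURCE_INDEX_MODE, {28-27}=SAMPLER_INDEX_MODE.
static uint32_t PackTexWord0(uint32_t TexInst, uint32_t InstMod,
                             uint32_t FetchWholeQuad, uint32_t ResourceId,
                             uint32_t SrcGpr, uint32_t SrcRel,
                             uint32_t AltConst, uint32_t ResIdxMode,
                             uint32_t SampIdxMode) {
  return (TexInst & 0x1F) | ((InstMod & 0x3) << 5) |
         ((FetchWholeQuad & 0x1) << 7) | ((ResourceId & 0xFF) << 8) |
         ((SrcGpr & 0x7F) << 16) | ((SrcRel & 0x1) << 23) |
         ((AltConst & 0x1) << 24) | ((ResIdxMode & 0x3) << 25) |
         ((SampIdxMode & 0x3) << 27);
}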
class R600_3OP <bits<5> inst, string opName, list<dag> pattern, InstrItinClass itin = AnyALU> : - InstR600 <0, - (outs R600_Reg32:$dst), + InstR600 <(outs R600_Reg32:$dst), (ins REL:$dst_rel, CLAMP:$clamp, R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), - !strconcat(opName, "$clamp $dst$dst_rel, " - "$src0_neg$src0$src0_sel$src0_rel, " - "$src1_neg$src1$src1_sel$src1_rel, " - "$src2_neg$src2$src2_sel$src2_rel, " + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, "$clamp $dst$dst_rel, " + "$src0_neg$src0$src0_rel, " + "$src1_neg$src1$src1_rel, " + "$src2_neg$src2$src2_rel, " + "$literal $pred_sel$last"), pattern, itin>, @@ -376,8 +456,7 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern, class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern, InstrItinClass itin = VecALU> : - InstR600 <inst, - (outs R600_Reg32:$dst), + InstR600 <(outs R600_Reg32:$dst), ins, asm, pattern, @@ -385,13 +464,35 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern, class R600_TEX <bits<11> inst, string opName, list<dag> pattern, InstrItinClass itin = AnyALU> : - InstR600 <inst, - (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), - !strconcat(opName, "$dst, $src0, $resourceId, $samplerId, $textureTarget"), + InstR600 <(outs R600_Reg128:$DST_GPR), + (ins R600_Reg128:$SRC_GPR, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, i32imm:$textureTarget), + !strconcat(opName, "$DST_GPR, $SRC_GPR, $RESOURCE_ID, $SAMPLER_ID, $textureTarget"), pattern, - itin>{ - let Inst {10-0} = inst; + itin>, TEX_WORD0, TEX_WORD1, TEX_WORD2 { + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let TEX_INST = inst{4-0}; + let SRC_REL = 0; + let DST_REL = 0; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let LOD_BIAS = 0; + + let INST_MOD = 0; + let FETCH_WHOLE_QUAD = 0; + let ALT_CONST = 0; + let SAMPLER_INDEX_MODE = 0; + let RESOURCE_INDEX_MODE = 0; + + let COORD_TYPE_X = 0; + let COORD_TYPE_Y = 0; + let COORD_TYPE_Z = 0; + let COORD_TYPE_W = 0; + + let TEXInst = 1; } } // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 @@ -644,7 +745,9 @@ multiclass SteamOutputExportPattern<Instruction ExportInst, 4095, imm:$mask, buf3inst, 0)>; } -let usesCustomInserter = 1 in { +// Export instructions should not be duplicated by the TailDuplication pass +// (which assumes that duplicable instructions are affected by the exec mask) +let usesCustomInserter = 1, isNotDuplicable = 1 in { class ExportSwzInst : InstR600ISA<( outs), @@ -671,6 +774,197 @@ class ExportBufInst : InstR600ISA<( let Inst{63-32} = Word1; } +//===----------------------------------------------------------------------===// +// Control Flow Instructions +//===----------------------------------------------------------------------===// + +class CF_ALU_WORD0 { + field bits<32> Word0; + + bits<22> ADDR; + bits<4> KCACHE_BANK0; + bits<4> KCACHE_BANK1; + bits<2> KCACHE_MODE0; + + let Word0{21-0} = ADDR; + let Word0{25-22} = KCACHE_BANK0; + let Word0{29-26} = KCACHE_BANK1; + let Word0{31-30} = KCACHE_MODE0; +} + +class CF_ALU_WORD1 { + field bits<32> Word1; + + bits<2> KCACHE_MODE1; + bits<8> KCACHE_ADDR0; + bits<8> KCACHE_ADDR1; + bits<7> COUNT; + bits<1> ALT_CONST; + bits<4> CF_INST; + bits<1>
WHOLE_QUAD_MODE; + bits<1> BARRIER; + + let Word1{1-0} = KCACHE_MODE1; + let Word1{9-2} = KCACHE_ADDR0; + let Word1{17-10} = KCACHE_ADDR1; + let Word1{24-18} = COUNT; + let Word1{25} = ALT_CONST; + let Word1{29-26} = CF_INST; + let Word1{30} = WHOLE_QUAD_MODE; + let Word1{31} = BARRIER; +} + +class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs), +(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, i32imm:$KCACHE_MODE0, i32imm:$KCACHE_MODE1, +i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, i32imm:$COUNT), +!strconcat(OpName, " $COUNT, @$ADDR, " +"KC0[CB$KCACHE_BANK0:$KCACHE_ADDR0-$KCACHE_ADDR0+32]" +", KC1[CB$KCACHE_BANK1:$KCACHE_ADDR1-$KCACHE_ADDR1+32]"), +[] >, CF_ALU_WORD0, CF_ALU_WORD1 { + field bits<64> Inst; + + let CF_INST = inst; + let ALT_CONST = 0; + let WHOLE_QUAD_MODE = 0; + let BARRIER = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class CF_WORD0_R600 { + field bits<32> Word0; + + bits<32> ADDR; + + let Word0 = ADDR; +} + +class CF_WORD1_R600 { + field bits<32> Word1; + + bits<3> POP_COUNT; + bits<5> CF_CONST; + bits<2> COND; + bits<3> COUNT; + bits<6> CALL_COUNT; + bits<1> COUNT_3; + bits<1> END_OF_PROGRAM; + bits<1> VALID_PIXEL_MODE; + bits<7> CF_INST; + bits<1> WHOLE_QUAD_MODE; + bits<1> BARRIER; + + let Word1{2-0} = POP_COUNT; + let Word1{7-3} = CF_CONST; + let Word1{9-8} = COND; + let Word1{12-10} = COUNT; + let Word1{18-13} = CALL_COUNT; + let Word1{19} = COUNT_3; + let Word1{21} = END_OF_PROGRAM; + let Word1{22} = VALID_PIXEL_MODE; + let Word1{29-23} = CF_INST; + let Word1{30} = WHOLE_QUAD_MODE; + let Word1{31} = BARRIER; +} + +class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 { + field bits<64> Inst; + + let CF_INST = inst; + let BARRIER = 1; + let CF_CONST = 0; + let VALID_PIXEL_MODE = 0; + let COND = 0; + let CALL_COUNT = 0; + let COUNT_3 = 0; + let END_OF_PROGRAM = 0; + let WHOLE_QUAD_MODE = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class CF_WORD0_EG { + field bits<32> Word0; + + bits<24> ADDR; + bits<3> JUMPTABLE_SEL; + + let Word0{23-0} = ADDR; + let Word0{26-24} = JUMPTABLE_SEL; +} + +class CF_WORD1_EG { + field bits<32> Word1; + + bits<3> POP_COUNT; + bits<5> CF_CONST; + bits<2> COND; + bits<6> COUNT; + bits<1> VALID_PIXEL_MODE; + bits<1> END_OF_PROGRAM; + bits<8> CF_INST; + bits<1> BARRIER; + + let Word1{2-0} = POP_COUNT; + let Word1{7-3} = CF_CONST; + let Word1{9-8} = COND; + let Word1{15-10} = COUNT; + let Word1{20} = VALID_PIXEL_MODE; + let Word1{21} = END_OF_PROGRAM; + let Word1{29-22} = CF_INST; + let Word1{31} = BARRIER; +} + +class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG { + field bits<64> Inst; + + let CF_INST = inst; + let BARRIER = 1; + let JUMPTABLE_SEL = 0; + let CF_CONST = 0; + let VALID_PIXEL_MODE = 0; + let COND = 0; + let END_OF_PROGRAM = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +def CF_ALU : ALU_CLAUSE<8, "ALU">; +def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">; + +def FETCH_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > { + field bits<8> Inst; + bits<8> num; + let Inst = num; +} + +def ALU_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "ALU clause starting at $addr:", [] > { + field bits<8> Inst; + bits<8> num; + let Inst = num; +} + +def LITERALS : AMDGPUInst <(outs), +(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > 
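+// LITERALS packs the two 32-bit literal dwords that trail an ALU group into
+// one 64-bit slot (see Inst{31-0}/Inst{63-32} below); PAD, defined next, is a
+// 64-bit filler used to keep the clause stream evenly aligned (see the
+// CfCount % 2 padding in R600ControlFlowFinalizer above).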
{ + field bits<64> Inst; + bits<32> literal1; + bits<32> literal2; + + let Inst{31-0} = literal1; + let Inst{63-32} = literal2; +} + +def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > { + field bits<64> Inst; +} + let Predicates = [isR600toCayman] in { //===----------------------------------------------------------------------===// @@ -689,58 +983,42 @@ def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>; // XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. def SETE : R600_2OP < 0x08, "SETE", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, - COND_EQ))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_EQ))] >; def SGT : R600_2OP < 0x09, "SETGT", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, - COND_GT))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GT))] >; def SGE : R600_2OP < 0xA, "SETGE", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, - COND_GE))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GE))] >; def SNE : R600_2OP < 0xB, "SETNE", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, - COND_NE))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_NE))] >; def SETE_DX10 : R600_2OP < 0xC, "SETE_DX10", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), - COND_EQ))] + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_EQ))] >; def SETGT_DX10 : R600_2OP < 0xD, "SETGT_DX10", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), - COND_GT))] + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GT))] >; def SETGE_DX10 : R600_2OP < 0xE, "SETGE_DX10", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), - COND_GE))] + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GE))] >; def SETNE_DX10 : R600_2OP < 0xF, "SETNE_DX10", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), - COND_NE))] + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_NE))] >; def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; @@ -798,38 +1076,32 @@ def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>; def SETE_INT : R600_2OP < 0x3A, "SETE_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETEQ))] + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))] >; def SETGT_INT : R600_2OP < 0x3B, "SETGT_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGT))] + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))] >; def SETGE_INT : R600_2OP < 0x3C, "SETGE_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGE))] + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))] >; def SETNE_INT : R600_2OP < 0x3D, "SETNE_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETNE))] + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))] >; def SETGT_UINT : R600_2OP < 0x3E, "SETGT_UINT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGT))] + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))] >; def SETGE_UINT : R600_2OP < 0x3F, 
"SETGE_UINT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGE))] + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))] >; def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>; @@ -839,26 +1111,17 @@ def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>; def CNDE_INT : R600_3OP < 0x1C, "CNDE_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), 0, - (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), - COND_EQ))] + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))] >; def CNDGE_INT : R600_3OP < 0x1E, "CNDGE_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), 0, - (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), - COND_GE))] + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GE))] >; def CNDGT_INT : R600_3OP < 0x1D, "CNDGT_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), 0, - (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), - COND_GT))] + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GT))] >; //===----------------------------------------------------------------------===// @@ -867,25 +1130,33 @@ def CNDGT_INT : R600_3OP < def TEX_LD : R600_TEX < 0x03, "TEX_LD", - [(set R600_Reg128:$dst, (int_AMDGPU_txf R600_Reg128:$src0, imm:$src1, imm:$src2, imm:$src3, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] + [(set v4f32:$DST_GPR, (int_AMDGPU_txf v4f32:$SRC_GPR, + imm:$OFFSET_X, imm:$OFFSET_Y, imm:$OFFSET_Z, imm:$RESOURCE_ID, + imm:$SAMPLER_ID, imm:$textureTarget))] > { -let AsmString = "TEX_LD $dst, $src0, $src1, $src2, $src3, $resourceId, $samplerId, $textureTarget"; -let InOperandList = (ins R600_Reg128:$src0, i32imm:$src1, i32imm:$src2, i32imm:$src3, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget); +let AsmString = "TEX_LD $DST_GPR, $SRC_GPR, $OFFSET_X, $OFFSET_Y, $OFFSET_Z," + "$RESOURCE_ID, $SAMPLER_ID, $textureTarget"; +let InOperandList = (ins R600_Reg128:$SRC_GPR, i32imm:$OFFSET_X, + i32imm:$OFFSET_Y, i32imm:$OFFSET_Z, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, + i32imm:$textureTarget); } def TEX_GET_TEXTURE_RESINFO : R600_TEX < 0x04, "TEX_GET_TEXTURE_RESINFO", - [(set R600_Reg128:$dst, (int_AMDGPU_txq R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] + [(set v4f32:$DST_GPR, (int_AMDGPU_txq v4f32:$SRC_GPR, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; def TEX_GET_GRADIENTS_H : R600_TEX < 0x07, "TEX_GET_GRADIENTS_H", - [(set R600_Reg128:$dst, (int_AMDGPU_ddx R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] + [(set v4f32:$DST_GPR, (int_AMDGPU_ddx v4f32:$SRC_GPR, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; def TEX_GET_GRADIENTS_V : R600_TEX < 0x08, "TEX_GET_GRADIENTS_V", - [(set R600_Reg128:$dst, (int_AMDGPU_ddy R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] + [(set v4f32:$DST_GPR, (int_AMDGPU_ddy v4f32:$SRC_GPR, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; def TEX_SET_GRADIENTS_H : R600_TEX < @@ -900,32 +1171,38 @@ def TEX_SET_GRADIENTS_V : R600_TEX < def TEX_SAMPLE : R600_TEX < 0x10, "TEX_SAMPLE", - [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] + [(set v4f32:$DST_GPR, (int_AMDGPU_tex v4f32:$SRC_GPR, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; def TEX_SAMPLE_C : R600_TEX < 0x18, "TEX_SAMPLE_C", - [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, 
imm:$samplerId, TEX_SHADOW:$textureTarget))] + [(set v4f32:$DST_GPR, (int_AMDGPU_tex v4f32:$SRC_GPR, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))] >; def TEX_SAMPLE_L : R600_TEX < 0x11, "TEX_SAMPLE_L", - [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] + [(set v4f32:$DST_GPR, (int_AMDGPU_txl v4f32:$SRC_GPR, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; def TEX_SAMPLE_C_L : R600_TEX < 0x19, "TEX_SAMPLE_C_L", - [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] + [(set v4f32:$DST_GPR, (int_AMDGPU_txl v4f32:$SRC_GPR, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))] >; def TEX_SAMPLE_LB : R600_TEX < 0x12, "TEX_SAMPLE_LB", - [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0,imm:$resourceId, imm:$samplerId, imm:$textureTarget))] + [(set v4f32:$DST_GPR, (int_AMDGPU_txb v4f32:$SRC_GPR, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; def TEX_SAMPLE_C_LB : R600_TEX < 0x1A, "TEX_SAMPLE_C_LB", - [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] + [(set v4f32:$DST_GPR, (int_AMDGPU_txb v4f32:$SRC_GPR, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))] >; def TEX_SAMPLE_G : R600_TEX < @@ -954,32 +1231,22 @@ class MULADD_Common <bits<5> inst> : R600_3OP < class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < inst, "MULADD_IEEE", - [(set (f32 R600_Reg32:$dst), - (fadd (fmul R600_Reg32:$src0, R600_Reg32:$src1), R600_Reg32:$src2))] + [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] >; class CNDE_Common <bits<5> inst> : R600_3OP < inst, "CNDE", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), FP_ZERO, - (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), - COND_EQ))] + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_EQ))] >; class CNDGT_Common <bits<5> inst> : R600_3OP < inst, "CNDGT", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), FP_ZERO, - (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), - COND_GT))] + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GT))] >; class CNDGE_Common <bits<5> inst> : R600_3OP < inst, "CNDGE", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), FP_ZERO, - (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), - COND_GE))] + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GE))] >; multiclass DOT4_Common <bits<11> inst> { @@ -987,7 +1254,7 @@ multiclass DOT4_Common <bits<11> inst> { def _pseudo : R600_REDUCTION <inst, (ins R600_Reg128:$src0, R600_Reg128:$src1), "DOT4 $dst $src0, $src1", - [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))] + [(set f32:$dst, (int_AMDGPU_dp4 v4f32:$src0, v4f32:$src1))] >; def _real : R600_2OP <inst, "DOT4", []>; @@ -997,11 +1264,10 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { multiclass CUBE_Common <bits<11> inst> { def _pseudo : InstR600 < - inst, (outs R600_Reg128:$dst), (ins R600_Reg128:$src), "CUBE $dst $src", - [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))], + [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src))], VecALU > { let isPseudo = 1; @@ -1013,23 +1279,38 @@ multiclass CUBE_Common <bits<11> inst> { class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper < inst, "EXP_IEEE", fexp2 ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class FLT_TO_INT_Common <bits<11> inst> : 
R600_1OP_Helper < inst, "FLT_TO_INT", fp_to_sint ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < inst, "INT_TO_FLT", sint_to_fp ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper < inst, "FLT_TO_UINT", fp_to_uint ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < inst, "UINT_TO_FLT", uint_to_fp ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP < inst, "LOG_CLAMPED", [] @@ -1037,50 +1318,84 @@ class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP < class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper < inst, "LOG_IEEE", flog2 ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>; class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>; class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>; class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper < inst, "MULHI_INT", mulhs ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper < inst, "MULHI", mulhu ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper < inst, "MULLO_INT", mul ->; -class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []>; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} +class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> { + let TransOnly = 1; + let Itinerary = TransALU; +} class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP < inst, "RECIP_CLAMPED", [] ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class RECIP_IEEE_Common <bits<11> inst> : R600_1OP < - inst, "RECIP_IEEE", [(set R600_Reg32:$dst, (fdiv FP_ONE, R600_Reg32:$src0))] ->; + inst, "RECIP_IEEE", [(set f32:$dst, (fdiv FP_ONE, f32:$src0))] +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper < inst, "RECIP_UINT", AMDGPUurecip ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper < inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP < inst, "RECIPSQRT_IEEE", [] ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class SIN_Common <bits<11> inst> : R600_1OP < inst, "SIN", []>{ let Trig = 1; + let TransOnly = 1; + let Itinerary = TransALU; } class COS_Common <bits<11> inst> : R600_1OP < inst, "COS", []> { let Trig = 1; + let TransOnly = 1; + let Itinerary = TransALU; } //===----------------------------------------------------------------------===// @@ -1089,19 +1404,20 @@ class COS_Common <bits<11> inst> : R600_1OP < multiclass DIV_Common <InstR600 recip_ieee> { def : Pat< - (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1), - (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) + (int_AMDGPU_div f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) >; def : Pat< - (fdiv R600_Reg32:$src0, R600_Reg32:$src1), - (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) + (fdiv f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) >; } -class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> : Pat < - 
(int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w), - (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, (f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x)) +class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> + : Pat < + (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w), + (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) >; //===----------------------------------------------------------------------===// @@ -1141,13 +1457,13 @@ let Predicates = [isR600] in { def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>; defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>; + def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>; def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>; - def : Pat<(fsqrt R600_Reg32:$src), - (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_r600 R600_Reg32:$src))>; + def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; def R600_ExportSwz : ExportSwzInst { - let Word1{20-17} = 1; // BURST_COUNT + let Word1{20-17} = 0; // BURST_COUNT let Word1{21} = eop; let Word1{22} = 1; // VALID_PIXEL_MODE let Word1{30-23} = inst; @@ -1156,25 +1472,77 @@ let Predicates = [isR600] in { defm : ExportPattern<R600_ExportSwz, 39>; def R600_ExportBuf : ExportBufInst { - let Word1{20-17} = 1; // BURST_COUNT + let Word1{20-17} = 0; // BURST_COUNT let Word1{21} = eop; let Word1{22} = 1; // VALID_PIXEL_MODE let Word1{30-23} = inst; let Word1{31} = 1; // BARRIER } defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>; + + def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$COUNT), + "TEX $COUNT @$ADDR"> { + let POP_COUNT = 0; + } + def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$COUNT), + "VTX $COUNT @$ADDR"> { + let POP_COUNT = 0; + } + def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> { + let ADDR = 0; + let COUNT = 0; + let POP_COUNT = 0; + } + def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> { + let COUNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; + } + } // Helper pattern for normalizing inputs to trigonometric instructions for R700+ // cards.
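The comment above refers to the SIN_PAT/COS_PAT classes that follow: the operand is pre-scaled by 1/(2*pi) (CONST.TWO_PI_INV) via MUL_IEEE before the hardware SIN/COS runs. A scalar sketch of what the pattern emits (HwSin stands in for the hardware op; illustrative only):

// Matches (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), $src)) below.
static float SinViaScaledInput(float Radians, float (*HwSin)(float)) {
  const float TwoPiInv = 1.0f / (2.0f * 3.14159265358979f);
  return HwSin(Radians * TwoPiInv);
}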
class COS_PAT <InstR600 trig> : Pat< - (fcos R600_Reg32:$src), - (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) + (fcos f32:$src), + (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), $src)) >; class SIN_PAT <InstR600 trig> : Pat< - (fsin R600_Reg32:$src), - (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) + (fsin f32:$src), + (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), $src)) >; //===----------------------------------------------------------------------===// @@ -1212,10 +1580,10 @@ def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; def SIN_eg : SIN_Common<0x8D>; def COS_eg : COS_Common<0x8E>; +def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>; def : SIN_PAT <SIN_eg>; def : COS_PAT <COS_eg>; -def : Pat<(fsqrt R600_Reg32:$src), - (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_eg R600_Reg32:$src))>; +def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; } // End Predicates = [isEG] //===----------------------------------------------------------------------===// @@ -1239,15 +1607,16 @@ let Predicates = [isEGorCayman] in { // (16,8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 // (24,8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", - [(set R600_Reg32:$dst, (int_AMDIL_bit_extract_u32 R600_Reg32:$src0, - R600_Reg32:$src1, - R600_Reg32:$src2))], + [(set i32:$dst, (int_AMDIL_bit_extract_u32 i32:$src0, i32:$src1, + i32:$src2))], VecALU >; + def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", []>; + defm : BFIPatterns <BFI_INT_eg>; + def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", - [(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1, - R600_Reg32:$src2))], + [(set i32:$dst, (AMDGPUbitalign i32:$src0, i32:$src1, i32:$src2))], VecALU >; @@ -1292,14 +1661,12 @@ let hasSideEffects = 1 in { // XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, // which do not need to be truncated since the fp values are 0.0f or 1.0f. // We should look into handling these cases separately. 
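In the fp_to_sint/fp_to_uint patterns just below, TRUNC makes the operand integral before FLT_TO_INT_eg/FLT_TO_UINT_eg run, so the final conversion is exact regardless of the convert's rounding mode; as the XXX comment notes, the 0.0f/1.0f values produced by SELECT_CC lowering would not strictly need it. A scalar sketch of the emitted sequence:

#include <cmath>

// FLT_TO_INT_eg(TRUNC(x)) as plain C++: truncate toward zero, then convert.
static int FpToSint(float X) {
  return static_cast<int>(std::trunc(X));
}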
- def : Pat<(fp_to_sint R600_Reg32:$src0), - (FLT_TO_INT_eg (TRUNC R600_Reg32:$src0))>; + def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; - def : Pat<(fp_to_uint R600_Reg32:$src0), - (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>; + def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; def EG_ExportSwz : ExportSwzInst { - let Word1{19-16} = 1; // BURST_COUNT + let Word1{19-16} = 0; // BURST_COUNT let Word1{20} = 1; // VALID_PIXEL_MODE let Word1{21} = eop; let Word1{29-22} = inst; @@ -1309,7 +1676,7 @@ let hasSideEffects = 1 in { defm : ExportPattern<EG_ExportSwz, 83>; def EG_ExportBuf : ExportBufInst { - let Word1{19-16} = 1; // BURST_COUNT + let Word1{19-16} = 0; // BURST_COUNT let Word1{20} = 1; // VALID_PIXEL_MODE let Word1{21} = eop; let Word1{29-22} = inst; @@ -1318,6 +1685,57 @@ let hasSideEffects = 1 in { } defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>; + def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT), + "TEX $COUNT @$ADDR"> { + let POP_COUNT = 0; + } + def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT), + "VTX $COUNT @$ADDR"> { + let POP_COUNT = 0; + } + def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> { + let ADDR = 0; + let COUNT = 0; + let POP_COUNT = 0; + } + def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> { + let COUNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; + } + //===----------------------------------------------------------------------===// // Memory read/write instructions //===----------------------------------------------------------------------===// @@ -1347,14 +1765,14 @@ class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name, def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg < (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), 0x1, "RAT_WRITE_CACHELESS_32_eg", - [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)] + [(global_store i32:$rw_gpr, i32:$index_gpr)] >; //128-bit store def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg < (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), 0xf, "RAT_WRITE_CACHELESS_128", - [(global_store (v4i32 R600_Reg128:$rw_gpr), R600_TReg32_X:$index_gpr)] + [(global_store v4i32:$rw_gpr, i32:$index_gpr)] >; class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> @@ -1408,6 +1826,8 @@ class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> // VTX_WORD3 (Padding) // // Inst{127-96} = 0; + + let VTXInst = 1; } class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern> @@ -1477,19 +1897,19 @@ class VTX_READ_128_eg <bits<8> buffer_id, 
list<dag> pattern> //===----------------------------------------------------------------------===// def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, - [(set (i32 R600_TReg32_X:$dst), (load_param_zexti8 ADDRVTX_READ:$ptr))] + [(set i32:$dst, (load_param_zexti8 ADDRVTX_READ:$ptr))] >; def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, - [(set (i32 R600_TReg32_X:$dst), (load_param_zexti16 ADDRVTX_READ:$ptr))] + [(set i32:$dst, (load_param_zexti16 ADDRVTX_READ:$ptr))] >; def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, - [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))] + [(set i32:$dst, (load_param ADDRVTX_READ:$ptr))] >; def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, - [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))] + [(set v4i32:$dst, (load_param ADDRVTX_READ:$ptr))] >; //===----------------------------------------------------------------------===// @@ -1498,17 +1918,17 @@ def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, // 8-bit reads def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, - [(set (i32 R600_TReg32_X:$dst), (zextloadi8_global ADDRVTX_READ:$ptr))] + [(set i32:$dst, (zextloadi8_global ADDRVTX_READ:$ptr))] >; // 32-bit reads def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, - [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))] + [(set i32:$dst, (global_load ADDRVTX_READ:$ptr))] >; // 128-bit reads def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, - [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))] + [(set v4i32:$dst, (global_load ADDRVTX_READ:$ptr))] >; //===----------------------------------------------------------------------===// @@ -1517,7 +1937,7 @@ def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, //===----------------------------------------------------------------------===// def CONSTANT_LOAD_eg : VTX_READ_32_eg <1, - [(set (i32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))] + [(set i32:$dst, (constant_load ADDRVTX_READ:$ptr))] >; } @@ -1540,28 +1960,34 @@ def MULLO_UINT_cm : MULLO_UINT_Common<0x91>; def MULHI_UINT_cm : MULHI_UINT_Common<0x92>; def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>; def EXP_IEEE_cm : EXP_IEEE_Common<0x81>; -def LOG_IEEE_ : LOG_IEEE_Common<0x83>; +def LOG_IEEE_cm : LOG_IEEE_Common<0x83>; def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>; def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>; def SIN_cm : SIN_Common<0x8D>; def COS_cm : COS_Common<0x8E>; } // End isVector = 1 +def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; def : SIN_PAT <SIN_cm>; def : COS_PAT <COS_cm>; defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; // RECIP_UINT emulation for Cayman +// The multiplication scales from [0,1] to the unsigned integer range def : Pat < - (AMDGPUurecip R600_Reg32:$src0), - (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg R600_Reg32:$src0)), - (MOV_IMM_I32 0x4f800000))) + (AMDGPUurecip i32:$src0), + (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), + (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1))) >; + def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { + let ADDR = 0; + let POP_COUNT = 0; + let COUNT = 0; + } -def : Pat<(fsqrt R600_Reg32:$src), - (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm R600_Reg32:$src))>; +def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; } // End isCayman @@ -1583,21 +2009,21 @@ def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src), let isPseudo = 1 in { def PRED_X : InstR600 < - 0, (outs R600_Predicate_Bit:$dst), + (outs R600_Predicate_Bit:$dst), (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), "", [], NullALU> { let 
FlagOperandIdx = 3; } let isTerminator = 1, isBranch = 1 in { -def JUMP_COND : InstR600 <0x10, +def JUMP_COND : InstR600 < (outs), (ins brtarget:$target, R600_Predicate_Bit:$p), "JUMP $target ($p)", [], AnyALU >; -def JUMP : InstR600 <0x10, +def JUMP : InstR600 < (outs), (ins brtarget:$target), "JUMP $target", @@ -1624,20 +2050,28 @@ def MASK_WRITE : AMDGPUShaderInst < } // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 -def TXD: AMDGPUShaderInst < +def TXD: InstR600 < (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] ->; + [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, imm:$textureTarget))], + NullALU > { + let TEXInst = 1; +} -def TXD_SHADOW: AMDGPUShaderInst < +def TXD_SHADOW: InstR600 < (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] ->; - + [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], + NullALU +> { + let TEXInst = 1; +} } // End isPseudo = 1 } // End usesCustomInserter = 1 @@ -1674,7 +2108,7 @@ def CONST_COPY : Instruction { def TEX_VTX_CONSTBUF : InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr", - [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, + [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, VTX_WORD1_GPR, VTX_WORD0 { let VC_INST = 0; @@ -1723,11 +2157,12 @@ def TEX_VTX_CONSTBUF : // VTX_WORD3 (Padding) // // Inst{127-96} = 0; + let VTXInst = 1; } def TEX_VTX_TEXBUF: InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", - [(set R600_Reg128:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, + [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, VTX_WORD1_GPR, VTX_WORD0 { let VC_INST = 0; @@ -1776,6 +2211,7 @@ let Inst{63-32} = Word1; // VTX_WORD3 (Padding) // // Inst{127-96} = 0; + let VTXInst = 1; } @@ -1852,9 +2288,8 @@ let isTerminator=1 in { // CND*_INT Patterns for f32 True / False values class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < - (selectcc (i32 R600_Reg32:$src0), 0, (f32 R600_Reg32:$src1), - R600_Reg32:$src2, cc), - (cnd R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2) + (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), + (cnd $src0, $src1, $src2) >; def : CND_INT_f32 <CNDE_INT, SETEQ>; @@ -1863,9 +2298,8 @@ def : CND_INT_f32 <CNDGE_INT, SETGE>; //CNDGE_INT extra pattern def : Pat < - (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1), - (i32
R600_Reg32:$src2), COND_GT), - (CNDGE_INT R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2) + (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_GT), + (CNDGE_INT $src0, $src1, $src2) >; // KIL Patterns @@ -1875,56 +2309,56 @@ def KILP : Pat < >; def KIL : Pat < - (int_AMDGPU_kill R600_Reg32:$src0), - (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0))) + (int_AMDGPU_kill f32:$src0), + (MASK_WRITE (KILLGT (f32 ZERO), $src0)) >; // SGT Reverse args def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LT), - (SGT R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LT), + (SGT $src1, $src0) >; // SGE Reverse args def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LE), - (SGE R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LE), + (SGE $src1, $src0) >; // SETGT_DX10 reverse args def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LT), - (SETGT_DX10 R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc f32:$src0, f32:$src1, -1, 0, COND_LT), + (SETGT_DX10 $src1, $src0) >; // SETGE_DX10 reverse args def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LE), - (SETGE_DX10 R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc f32:$src0, f32:$src1, -1, 0, COND_LE), + (SETGE_DX10 $src1, $src0) >; // SETGT_INT reverse args def : Pat < - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLT), - (SETGT_INT R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc i32:$src0, i32:$src1, -1, 0, SETLT), + (SETGT_INT $src1, $src0) >; // SETGE_INT reverse args def : Pat < - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLE), - (SETGE_INT R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc i32:$src0, i32:$src1, -1, 0, SETLE), + (SETGE_INT $src1, $src0) >; // SETGT_UINT reverse args def : Pat < - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULT), - (SETGT_UINT R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc i32:$src0, i32:$src1, -1, 0, SETULT), + (SETGT_UINT $src1, $src0) >; // SETGE_UINT reverse args def : Pat < - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULE), - (SETGE_UINT R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc i32:$src0, i32:$src1, -1, 0, SETULE), + (SETGE_UINT $src1, $src0) >; // The next two patterns are special cases for handling 'true if ordered' and @@ -1937,50 +2371,50 @@ def : Pat < //SETE - 'true if ordered' def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETO), - (SETE R600_Reg32:$src0, R600_Reg32:$src1) + (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETO), + (SETE $src0, $src1) >; //SETE_DX10 - 'true if ordered' def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETO), - (SETE_DX10 R600_Reg32:$src0, R600_Reg32:$src1) + (selectcc f32:$src0, f32:$src1, -1, 0, SETO), + (SETE_DX10 $src0, $src1) >; //SNE - 'true if unordered' def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO), - (SNE R600_Reg32:$src0, R600_Reg32:$src1) + (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETUO), + (SNE $src0, $src1) >; //SETNE_DX10 - 'true if unordered' def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUO), - (SETNE_DX10 R600_Reg32:$src0, R600_Reg32:$src1) + (selectcc f32:$src0, f32:$src1, -1, 0, SETUO), + (SETNE_DX10 $src0, $src1) >; -def : Extract_Element <f32, v4f32, R600_Reg128, 0, sub0>; -def : Extract_Element <f32, v4f32, R600_Reg128, 1, 
sub1>; -def : Extract_Element <f32, v4f32, R600_Reg128, 2, sub2>; -def : Extract_Element <f32, v4f32, R600_Reg128, 3, sub3>; +def : Extract_Element <f32, v4f32, 0, sub0>; +def : Extract_Element <f32, v4f32, 1, sub1>; +def : Extract_Element <f32, v4f32, 2, sub2>; +def : Extract_Element <f32, v4f32, 3, sub3>; -def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 0, sub0>; -def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 1, sub1>; -def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 2, sub2>; -def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 3, sub3>; +def : Insert_Element <f32, v4f32, 0, sub0>; +def : Insert_Element <f32, v4f32, 1, sub1>; +def : Insert_Element <f32, v4f32, 2, sub2>; +def : Insert_Element <f32, v4f32, 3, sub3>; -def : Extract_Element <i32, v4i32, R600_Reg128, 0, sub0>; -def : Extract_Element <i32, v4i32, R600_Reg128, 1, sub1>; -def : Extract_Element <i32, v4i32, R600_Reg128, 2, sub2>; -def : Extract_Element <i32, v4i32, R600_Reg128, 3, sub3>; +def : Extract_Element <i32, v4i32, 0, sub0>; +def : Extract_Element <i32, v4i32, 1, sub1>; +def : Extract_Element <i32, v4i32, 2, sub2>; +def : Extract_Element <i32, v4i32, 3, sub3>; -def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 0, sub0>; -def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sub1>; -def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sub2>; -def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>; +def : Insert_Element <i32, v4i32, 0, sub0>; +def : Insert_Element <i32, v4i32, 1, sub1>; +def : Insert_Element <i32, v4i32, 2, sub2>; +def : Insert_Element <i32, v4i32, 3, sub3>; -def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>; -def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>; +def : Vector4_Build <v4f32, f32>; +def : Vector4_Build <v4i32, i32>; // bitconvert patterns diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp index b07a585..018b403 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.cpp +++ b/lib/Target/R600/R600MachineFunctionInfo.cpp @@ -13,5 +13,6 @@ using namespace llvm; R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) - : MachineFunctionInfo() { - } + : AMDGPUMachineFunction(MF) { } + + diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h index 13a46b8..70fddbb 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ b/lib/Target/R600/R600MachineFunctionInfo.h @@ -14,18 +14,18 @@ #define R600MACHINEFUNCTIONINFO_H #include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "AMDGPUMachineFunction.h" #include <vector> namespace llvm { -class R600MachineFunctionInfo : public MachineFunctionInfo { - +class R600MachineFunctionInfo : public AMDGPUMachineFunction { public: R600MachineFunctionInfo(const MachineFunction &MF); SmallVector<unsigned, 4> LiveOuts; std::vector<unsigned> IndirectRegs; + unsigned StackSize; }; } // End llvm namespace diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index 9074364..a777142 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -37,7 +37,7 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { CurInstKind = IDOther; CurEmitted = 0; OccupedSlotsMask = 15; - InstKindLimit[IDAlu] = 120; // 120 minus 8 for security + InstKindLimit[IDAlu] = TII->getMaxAlusPerClause(); const AMDGPUSubtarget &ST = 
DAG->TM.getSubtarget<AMDGPUSubtarget>(); diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp new file mode 100644 index 0000000..05e96f1 --- /dev/null +++ b/lib/Target/R600/R600Packetizer.cpp @@ -0,0 +1,446 @@ +//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass implements instruction packetization for R600. It clears the +/// isLast bit of instructions inside a bundle and substitutes source +/// registers with the PreviousVector (PV) register when applicable. +// +//===----------------------------------------------------------------------===// + +#ifndef R600PACKETIZER_CPP +#define R600PACKETIZER_CPP + +#define DEBUG_TYPE "packets" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "AMDGPU.h" +#include "R600InstrInfo.h" + +namespace llvm { + +class R600Packetizer : public MachineFunctionPass { + +public: + static char ID; + R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const char *getPassName() const { + return "R600 Packetizer"; + } + + bool runOnMachineFunction(MachineFunction &Fn); }; char R600Packetizer::ID = 0; class R600PacketizerList : public VLIWPacketizerList { private: + const R600InstrInfo *TII; + const R600RegisterInfo &TRI; + + enum BankSwizzle { + ALU_VEC_012 = 0, + ALU_VEC_021, + ALU_VEC_120, + ALU_VEC_102, + ALU_VEC_201, + ALU_VEC_210 + }; + + unsigned getSlot(const MachineInstr *MI) const { + return TRI.getHWRegChan(MI->getOperand(0).getReg()); + } + + std::vector<unsigned> getPreviousVector(MachineBasicBlock::iterator I) const { + std::vector<unsigned> Result; + I--; + if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) + return Result; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + if (I->isBundle()) + BI++; + while (BI->isBundledWithPred() && !TII->isPredicated(BI)) { + int OperandIdx = TII->getOperandIdx(BI->getOpcode(), R600Operands::WRITE); + if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm()) + Result.push_back(BI->getOperand(0).getReg()); + BI++; + } + return Result; + } + + void substitutePV(MachineInstr *MI, const std::vector<unsigned> &PV) const { + R600Operands::Ops Ops[] = { + R600Operands::SRC0, + R600Operands::SRC1, + R600Operands::SRC2 + }; + for (unsigned i = 0; i < 3; i++) { + int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); + if (OperandIdx < 0) + continue; + unsigned Src = MI->getOperand(OperandIdx).getReg(); + for (unsigned j = 0, e = PV.size(); j < e; j++) { + if (Src == PV[j]) { + unsigned Chan = TRI.getHWRegChan(Src); + unsigned PVReg; + switch (Chan) { + case 0: + PVReg = AMDGPU::PV_X; + break; + case 1: + PVReg = AMDGPU::PV_Y; + break; + case 2: + PVReg = AMDGPU::PV_Z; + break; + case 3: + PVReg = AMDGPU::PV_W; + break; + default: + llvm_unreachable("Invalid Chan"); + } + MI->getOperand(OperandIdx).setReg(PVReg); + break; + } + } + } + }
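+  // For illustration (not part of the pass itself): if the previous
+  // instruction group wrote T0.X and T0.Z, getPreviousVector() returns
+  // {T0_X, T0_Z}, and an ALU instruction reading T0.Z gets that source
+  // rewritten to PV.Z, e.g.
+  //   MUL_IEEE T1.W, T0.Z, T0.X   -->   MUL_IEEE T1.W, PV.Z, PV.X
+  // because the hardware forwards the previous group's results through
+  // the PV register.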
+public: + // Ctor. + R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, + MachineDominatorTree &MDT) + : VLIWPacketizerList(MF, MLI, MDT, true), + TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())), + TRI(TII->getRegisterInfo()) { } + + // initPacketizerState - initialize some internal flags. + void initPacketizerState() { } + + // ignorePseudoInstruction - Ignore bundling of pseudo instructions. + bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) { + return false; + } + + // isSoloInstruction - return true if instruction MI cannot be packetized + // with any other instruction, which means that MI itself is a packet. + bool isSoloInstruction(MachineInstr *MI) { + if (TII->isVector(*MI)) + return true; + if (!TII->isALUInstr(MI->getOpcode())) + return true; + if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TRANS_ONLY) + return true; + if (TII->isTransOnly(MI)) + return true; + return false; + } + + // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ + // together. + bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { + MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); + if (getSlot(MII) <= getSlot(MIJ)) + return false; + // Do MII and MIJ share the same pred_sel? + int OpI = TII->getOperandIdx(MII->getOpcode(), R600Operands::PRED_SEL), + OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600Operands::PRED_SEL); + unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, + PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; + if (PredI != PredJ) + return false; + if (SUJ->isSucc(SUI)) { + for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { + const SDep &Dep = SUJ->Succs[i]; + if (Dep.getSUnit() != SUI) + continue; + if (Dep.getKind() == SDep::Anti) + continue; + if (Dep.getKind() == SDep::Output) + if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) + continue; + return false; + } + } + return true; + } + + // isLegalToPruneDependencies - Is it legal to prune dependence between SUI + // and SUJ. 
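+  // (Pruning is conservatively disabled here: returning false keeps every
+  // dependence edge the scheduler recorded.)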
+ bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {return false;} + + void setIsLastBit(MachineInstr *MI, unsigned Bit) const { + unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), R600Operands::LAST); + MI->getOperand(LastOp).setImm(Bit); + } + + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) { + CurrentPacketMIs.push_back(MI); + bool FitsConstLimits = TII->canBundle(CurrentPacketMIs); + DEBUG( + if (!FitsConstLimits) { + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Consts read limitations\n"; + }); + const std::vector<unsigned> &PV = getPreviousVector(MI); + bool FitsReadPortLimits = fitsReadPortLimitation(CurrentPacketMIs, PV); + DEBUG( + if (!FitsReadPortLimits) { + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Read port limitations\n"; + }); + bool isBundlable = FitsConstLimits && FitsReadPortLimits; + CurrentPacketMIs.pop_back(); + if (!isBundlable) { + endPacket(MI->getParent(), MI); + substitutePV(MI, getPreviousVector(MI)); + return VLIWPacketizerList::addToPacket(MI); + } + if (!CurrentPacketMIs.empty()) + setIsLastBit(CurrentPacketMIs.back(), 0); + substitutePV(MI, PV); + return VLIWPacketizerList::addToPacket(MI); + } +private: + std::vector<std::pair<int, unsigned> > + ExtractSrcs(const MachineInstr *MI, const std::vector<unsigned> &PV) const { + R600Operands::Ops Ops[] = { + R600Operands::SRC0, + R600Operands::SRC1, + R600Operands::SRC2 + }; + std::vector<std::pair<int, unsigned> > Result; + for (unsigned i = 0; i < 3; i++) { + int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); + if (OperandIdx < 0){ + Result.push_back(std::pair<int, unsigned>(-1,0)); + continue; + } + unsigned Src = MI->getOperand(OperandIdx).getReg(); + if (std::find(PV.begin(), PV.end(), Src) != PV.end()) { + Result.push_back(std::pair<int, unsigned>(-1,0)); + continue; + } + unsigned Reg = TRI.getEncodingValue(Src) & 0xff; + if (Reg > 127) { + Result.push_back(std::pair<int, unsigned>(-1,0)); + continue; + } + unsigned Chan = TRI.getHWRegChan(Src); + Result.push_back(std::pair<int, unsigned>(Reg, Chan)); + } + return Result; + } + + std::vector<std::pair<int, unsigned> > + Swizzle(std::vector<std::pair<int, unsigned> > Src, + BankSwizzle Swz) const { + switch (Swz) { + case ALU_VEC_012: + break; + case ALU_VEC_021: + std::swap(Src[1], Src[2]); + break; + case ALU_VEC_102: + std::swap(Src[0], Src[1]); + break; + case ALU_VEC_120: + std::swap(Src[0], Src[1]); + std::swap(Src[0], Src[2]); + break; + case ALU_VEC_201: + std::swap(Src[0], Src[2]); + std::swap(Src[0], Src[1]); + break; + case ALU_VEC_210: + std::swap(Src[0], Src[2]); + break; + } + return Src; + } + + bool isLegal(const std::vector<MachineInstr *> &IG, + const std::vector<BankSwizzle> &Swz, + const std::vector<unsigned> &PV) const { + assert (Swz.size() == IG.size()); + int Vector[4][3]; + memset(Vector, -1, sizeof(Vector)); + for (unsigned i = 0, e = IG.size(); i < e; i++) { + const std::vector<std::pair<int, unsigned> > &Srcs = + Swizzle(ExtractSrcs(IG[i], PV), Swz[i]); + for (unsigned j = 0; j < 3; j++) { + const std::pair<int, unsigned> &Src = Srcs[j]; + if (Src.first < 0) + continue; + if (Vector[Src.second][j] < 0) + 
Vector[Src.second][j] = Src.first; + if (Vector[Src.second][j] != Src.first) + return false; + } + } + return true; + } + + bool recursiveFitsFPLimitation( + std::vector<MachineInstr *> IG, + const std::vector<unsigned> &PV, + std::vector<BankSwizzle> &SwzCandidate, + std::vector<MachineInstr *> CurrentlyChecked) + const { + if (!isLegal(CurrentlyChecked, SwzCandidate, PV)) + return false; + if (IG.size() == CurrentlyChecked.size()) { + return true; + } + BankSwizzle AvailableSwizzle[] = { + ALU_VEC_012, + ALU_VEC_021, + ALU_VEC_120, + ALU_VEC_102, + ALU_VEC_201, + ALU_VEC_210 + }; + CurrentlyChecked.push_back(IG[CurrentlyChecked.size()]); + for (unsigned i = 0; i < 6; i++) { + SwzCandidate.push_back(AvailableSwizzle[i]); + if (recursiveFitsFPLimitation(IG, PV, SwzCandidate, CurrentlyChecked)) + return true; + SwzCandidate.pop_back(); + } + return false; + } + + bool fitsReadPortLimitation( + std::vector<MachineInstr *> IG, + const std::vector<unsigned> &PV) + const { + //Todo : support shared src0 - src1 operand + std::vector<BankSwizzle> SwzCandidate; + bool Result = recursiveFitsFPLimitation(IG, PV, SwzCandidate, + std::vector<MachineInstr *>()); + if (!Result) + return false; + for (unsigned i = 0, e = IG.size(); i < e; i++) { + MachineInstr *MI = IG[i]; + unsigned Op = TII->getOperandIdx(MI->getOpcode(), + R600Operands::BANK_SWIZZLE); + MI->getOperand(Op).setImm(SwzCandidate[i]); + } + return true; + } +}; + +bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { + const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo(); + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); + + // Instantiate the packetizer. + R600PacketizerList Packetizer(Fn, MLI, MDT); + + // DFA state table should not be empty. + assert(Packetizer.getResourceTracker() && "Empty DFA table!"); + + // + // Loop over all basic blocks and remove KILL pseudo-instructions + // These instructions confuse the dependence analysis. Consider: + // D0 = ... (Insn 0) + // R0 = KILL R0, D0 (Insn 1) + // R0 = ... (Insn 2) + // Here, Insn 1 will result in the dependence graph not emitting an output + // dependence between Insn 0 and Insn 2. This can lead to incorrect + // packetization + // + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + MachineBasicBlock::iterator End = MBB->end(); + MachineBasicBlock::iterator MI = MBB->begin(); + while (MI != End) { + if (MI->isKill()) { + MachineBasicBlock::iterator DeleteMI = MI; + ++MI; + MBB->erase(DeleteMI); + End = MBB->end(); + continue; + } + ++MI; + } + } + + // Loop over all of the basic blocks. + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + // Find scheduling regions and schedule / packetize each region. + unsigned RemainingCount = MBB->size(); + for(MachineBasicBlock::iterator RegionEnd = MBB->end(); + RegionEnd != MBB->begin();) { + // The next region starts above the previous region. Look backward in the + // instruction stream until we find the nearest boundary. + MachineBasicBlock::iterator I = RegionEnd; + for(;I != MBB->begin(); --I, --RemainingCount) { + if (TII->isSchedulingBoundary(llvm::prior(I), MBB, Fn)) + break; + } + I = MBB->begin(); + + // Skip empty scheduling regions. + if (I == RegionEnd) { + RegionEnd = llvm::prior(RegionEnd); + --RemainingCount; + continue; + } + // Skip regions with one instruction. 
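+      // (a lone instruction always forms its own packet, so there is
+      // nothing for the packetizer to gain here)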
+ if (I == llvm::prior(RegionEnd)) { + RegionEnd = llvm::prior(RegionEnd); + continue; + } + + Packetizer.PacketizeMIs(MBB, I, RegionEnd); + RegionEnd = I; + } + } + + return true; + +} + +} + +llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { + return new R600Packetizer(tm); +} + +#endif // R600PACKETIZER_CPP diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index ce5994c..5a2e65c 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -43,6 +43,37 @@ foreach Index = 0-127 in { Index>; } +// KCACHE_BANK0 +foreach Index = 159-128 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#Index#"-128]."#Chan, Index, Chan>; + } + // 128-bit Temporary Registers + def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#Index#"-128].XYZW", + [!cast<Register>("KC0_"#Index#"_X"), + !cast<Register>("KC0_"#Index#"_Y"), + !cast<Register>("KC0_"#Index#"_Z"), + !cast<Register>("KC0_"#Index#"_W")], + Index>; +} + +// KCACHE_BANK1 +foreach Index = 191-160 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#Index#"-160]."#Chan, Index, Chan>; + } + // 128-bit Temporary Registers + def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#Index#"-160].XYZW", + [!cast<Register>("KC1_"#Index#"_X"), + !cast<Register>("KC1_"#Index#"_Y"), + !cast<Register>("KC1_"#Index#"_Z"), + !cast<Register>("KC1_"#Index#"_W")], + Index>; +} + + // Array Base Register holding input in FS foreach Index = 448-480 in { def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; } @@ -57,8 +88,14 @@ def NEG_ONE : R600Reg<"-1.0", 249>; def ONE_INT : R600Reg<"1", 250>; def HALF : R600Reg<"0.5", 252>; def NEG_HALF : R600Reg<"-0.5", 252>; -def ALU_LITERAL_X : R600Reg<"literal.x", 253>; -def PV_X : R600Reg<"pv.x", 254>; +def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">; +def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">; +def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">; +def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">; +def PV_X : R600RegWithChan<"PV.x", 254, "X">; +def PV_Y : R600RegWithChan<"PV.y", 254, "Y">; +def PV_Z : R600RegWithChan<"PV.z", 254, "Z">; +def PV_W : R600RegWithChan<"PV.w", 254, "W">; def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; @@ -80,6 +117,38 @@ def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", } // End isAllocatable = 0 +def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_X", 128, 159))>; + +def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_Y", 128, 159))>; + +def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_Z", 128, 159))>; + +def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_W", 128, 159))>; + +def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_KC0_X, R600_KC0_Y, + R600_KC0_Z, R600_KC0_W)>; + +def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_X", 160, 191))>; + +def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_Y", 160, 191))>; + +def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_Z", 160, 191))>; + +def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_W", 160, 
191))>; + +def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_KC1_X, R600_KC1_Y, + R600_KC1_Z, R600_KC1_W)>; + def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "T%u_X", 0, 127), AR_X)>; diff --git a/lib/Target/R600/R600Schedule.td b/lib/Target/R600/R600Schedule.td index 7ede181..78a460a 100644 --- a/lib/Target/R600/R600Schedule.td +++ b/lib/Target/R600/R600Schedule.td @@ -24,7 +24,7 @@ def AnyALU : InstrItinClass; def VecALU : InstrItinClass; def TransALU : InstrItinClass; -def R600_EG_Itin : ProcessorItineraries < +def R600_VLIW5_Itin : ProcessorItineraries < [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], [], [ @@ -34,3 +34,14 @@ def R600_EG_Itin : ProcessorItineraries < InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]> ] >; + +def R600_VLIW4_Itin : ProcessorItineraries < + [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL], + [], + [ + InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>, + InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>, + InstrItinData<TransALU, [InstrStage<1, [ALU_NULL]>]>, + InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]> + ] +>; diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h new file mode 100644 index 0000000..716b093 --- /dev/null +++ b/lib/Target/R600/SIDefines.h @@ -0,0 +1,22 @@ +//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef SIDEFINES_H_ +#define SIDEFINES_H_ + +#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 +#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 +#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) +#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) +#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC + +#endif // SIDEFINES_H_
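A quick sketch of how these RSRC1 fields compose (illustrative only, not part of the patch; the 4-VGPR and 8-SGPR allocation granules are assumptions about how the register counts are divided before packing):

  unsigned MaxVGPR = 23, MaxSGPR = 15;
  unsigned Rsrc1 = S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8);
  // VGPRS field = 5 in bits 5:0, SGPRS field = 1 in bits 9:6, so Rsrc1 == 0x45.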
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 93f8c38..1a07aff 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -49,6 +49,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); @@ -70,11 +71,15 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); + setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); - setSchedulingPreference(Sched::Source); + setSchedulingPreference(Sched::RegPressure); } SDValue SITargetLowering::LowerFormalArguments( @@ -208,28 +213,15 @@ SDValue SITargetLowering::LowerFormalArguments( MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - MachineRegisterInfo & MRI = BB->getParent()->getRegInfo(); - MachineBasicBlock::iterator I = MI; switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_WQM: - LowerSI_WQM(MI, *BB, I, MRI); - break; } return BB; } -void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { - BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); - - MI->eraseFromParent(); -} - EVT SITargetLowering::getSetCCResultType(EVT VT) const { return MVT::i1; } @@ -247,6 +239,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); } return SDValue(); } @@ -345,6 +338,32 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } +#define RSRC_DATA_FORMAT 0xf00000000000 + +SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + StoreSDNode *StoreNode = cast<StoreSDNode>(Op); + SDValue Chain = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue VirtualAddress = Op.getOperand(2); + DebugLoc DL = Op.getDebugLoc(); + + if (StoreNode->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) { + return SDValue(); + } + + SDValue SrcSrc = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, + DAG.getConstant(0, MVT::i64), + DAG.getConstant(RSRC_DATA_FORMAT, MVT::i64)); + + SDValue Ops[2]; + Ops[0] = DAG.getNode(AMDGPUISD::BUFFER_STORE, DL, MVT::Other, Chain, + Value, SrcSrc, VirtualAddress); + Ops[1] = Chain; + + return DAG.getMergeValues(Ops, 2, DL); + +} + SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -437,9 +456,12 @@ int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { float F; } Imm; - if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) + if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { + if (Node->getZExtValue() >> 32) { + return -1; + } Imm.I = Node->getSExtValue(); - else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) + } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) Imm.F = Node->getValueAPF().convertToFloat(); else return -1; // It isn't an immediate @@ -497,22 +519,23 @@ bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, SDValue &Op, MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); SDNode *Node = Op.getNode(); - int OpClass; + const TargetRegisterClass *OpClass; if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) { const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode()); - OpClass = Desc.OpInfo[Op.getResNo()].RegClass; + int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass; + if (OpClassID == -1) + OpClass = getRegClassFor(Op.getSimpleValueType()); + else + OpClass = TRI->getRegClass(OpClassID); } else if (Node->getOpcode() == ISD::CopyFromReg) { RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode()); - OpClass = MRI.getRegClass(Reg->getReg())->getID(); + OpClass = MRI.getRegClass(Reg->getReg()); } else return false; - if (OpClass == -1) - return false; - - return TRI->getRegClass(RegClass)->hasSubClassEq(TRI->getRegClass(OpClass)); + return TRI->getRegClass(RegClass)->hasSubClassEq(OpClass); } /// \brief Make sure that we don't exceed the number of allowed scalars @@ -546,8 +569,9 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, Operand = SDValue(Node, 0); } -SDNode 
*SITargetLowering::PostISelFolding(MachineSDNode *Node, - SelectionDAG &DAG) const { +/// \brief Try to fold the Node's operands into the Node +SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, + SelectionDAG &DAG) const { // Original encoding (either e32 or e64) int Opcode = Node->getMachineOpcode(); @@ -556,6 +580,13 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, unsigned NumDefs = Desc->getNumDefs(); unsigned NumOps = Desc->getNumOperands(); + // Commuted opcode if available + int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1; + const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev); + + assert(!DescRev || DescRev->getNumDefs() == NumDefs); + assert(!DescRev || DescRev->getNumOperands() == NumOps); + // e64 version if available, -1 otherwise int OpcodeE64 = AMDGPU::getVOPe64(Opcode); const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64); @@ -608,41 +639,54 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, // Is this a VSrc or SSrc operand? unsigned RegClass = Desc->OpInfo[Op].RegClass; - if (!isVSrc(RegClass) && !isSSrc(RegClass)) { + if (isVSrc(RegClass) || isSSrc(RegClass)) { + // Try to fold the immediates + if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { + // Folding didn't work, make sure we don't hit the SReg limit + ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); + } + continue; + } + + if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) { - if (i == 1 && Desc->isCommutable() && - fitsRegClass(DAG, Ops[0], RegClass) && - foldImm(Ops[1], Immediate, ScalarSlotUsed)) { + unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass; + assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass)); - assert(isVSrc(Desc->OpInfo[NumDefs].RegClass) || - isSSrc(Desc->OpInfo[NumDefs].RegClass)); + // Test if it makes sense to swap operands + if (foldImm(Ops[1], Immediate, ScalarSlotUsed) || + (!fitsRegClass(DAG, Ops[1], RegClass) && + fitsRegClass(DAG, Ops[1], OtherRegClass))) { // Swap commutable operands SDValue Tmp = Ops[1]; Ops[1] = Ops[0]; Ops[0] = Tmp; - } else if (DescE64 && !Immediate) { - // Test if it makes sense to switch to e64 encoding - - RegClass = DescE64->OpInfo[Op].RegClass; - int32_t TmpImm = -1; - if ((isVSrc(RegClass) || isSSrc(RegClass)) && - foldImm(Ops[i], TmpImm, ScalarSlotUsed)) { - - Immediate = -1; - Promote2e64 = true; - Desc = DescE64; - DescE64 = 0; - } + Desc = DescRev; + DescRev = 0; + continue; } - continue; } - // Try to fold the immediates - if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { - // Folding didn't work, make sure we don't hit the SReg limit - ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); + if (DescE64 && !Immediate) { + + // Test if it makes sense to switch to e64 encoding + unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; + if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass)) + continue; + + int32_t TmpImm = -1; + if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) || + (!fitsRegClass(DAG, Ops[i], RegClass) && + fitsRegClass(DAG, Ops[1], OtherRegClass))) { + + // Switch to e64 encoding + Immediate = -1; + Promote2e64 = true; + Desc = DescE64; + DescE64 = 0; + } } } @@ -656,10 +700,118 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i) Ops.push_back(Node->getOperand(i)); - // Either create a complete new or update the current instruction - if (Promote2e64) - return DAG.getMachineNode(OpcodeE64, Node->getDebugLoc(), - Node->getVTList(), Ops.data(), Ops.size()); - else - return DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); + // Create a complete new instruction + return DAG.getMachineNode(Desc->Opcode, Node->getDebugLoc(), + Node->getVTList(), Ops); +}
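+// Example of the effect (illustrative only, not part of the patch): a VOP2
+// encoding can only take an immediate in src0, so "V_SUB_F32 dst, v0, 1.0"
+// cannot fold its literal directly. Because V_SUB_F32 is tagged with a
+// VOP2_REV twin, the operands are swapped and the node is rebuilt as
+// "V_SUBREV_F32 dst, 1.0, v0", which computes src1 - src0 and therefore
+// still yields v0 - 1.0.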
+ +/// \brief Helper function for adjustWritemask +unsigned SubIdx2Lane(unsigned Idx) { + switch (Idx) { + default: return 0; + case AMDGPU::sub0: return 0; + case AMDGPU::sub1: return 1; + case AMDGPU::sub2: return 2; + case AMDGPU::sub3: return 3; + } +} + +/// \brief Adjust the writemask of MIMG instructions +void SITargetLowering::adjustWritemask(MachineSDNode *&Node, + SelectionDAG &DAG) const { + SDNode *Users[4] = { }; + unsigned Writemask = 0, Lane = 0; + + // Try to figure out the used register components + for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); + I != E; ++I) { + + // Abort if we can't understand the usage + if (!I->isMachineOpcode() || + I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) + return; + + Lane = SubIdx2Lane(I->getConstantOperandVal(1)); + + // Abort if we have more than one user per component + if (Users[Lane]) + return; + + Users[Lane] = *I; + Writemask |= 1 << Lane; + } + + // Abort if all components are used + if (Writemask == 0xf) + return; + + // Adjust the writemask in the node + std::vector<SDValue> Ops; + Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32)); + for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) + Ops.push_back(Node->getOperand(i)); + Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); + + // If we only got one lane, replace it with a copy + if (Writemask == (1U << Lane)) { + SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32); + SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + DebugLoc(), MVT::f32, + SDValue(Node, 0), RC); + DAG.ReplaceAllUsesWith(Users[Lane], Copy); + return; + } + + // Update the users of the node with the new indices + for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { + + SDNode *User = Users[i]; + if (!User) + continue; + + SDValue Op = DAG.getTargetConstant(Idx, MVT::i32); + DAG.UpdateNodeOperands(User, User->getOperand(0), Op); + + switch (Idx) { + default: break; + case AMDGPU::sub0: Idx = AMDGPU::sub1; break; + case AMDGPU::sub1: Idx = AMDGPU::sub2; break; + case AMDGPU::sub2: Idx = AMDGPU::sub3; break; + } + } +} + +/// \brief Fold the instructions after selecting them +SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, + SelectionDAG &DAG) const { + + if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1) + adjustWritemask(Node, DAG); + + return foldOperands(Node, DAG); +} + +/// \brief Assign the register class depending on the number of +/// bits set in the writemask +void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const { + if (AMDGPU::isMIMG(MI->getOpcode()) == -1) + return; + + unsigned VReg = MI->getOperand(0).getReg(); + unsigned Writemask = MI->getOperand(1).getImm(); + unsigned BitsSet = 0; + for (unsigned i = 0; i < 4; ++i) + BitsSet += Writemask & (1 << i) ? 
1 : 0; + + const TargetRegisterClass *RC; + switch (BitsSet) { + default: return; + case 1: RC = &AMDGPU::VReg_32RegClass; break; + case 2: RC = &AMDGPU::VReg_64RegClass; break; + case 3: RC = &AMDGPU::VReg_96RegClass; break; + } + + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MRI.setRegClass(VReg, RC); } diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index d656225..de637be 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -24,9 +24,7 @@ class SITargetLowering : public AMDGPUTargetLowering { const SIInstrInfo * TII; const TargetRegisterInfo * TRI; - void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; - + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; @@ -36,6 +34,9 @@ class SITargetLowering : public AMDGPUTargetLowering { void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, unsigned RegClass, bool &ScalarSlotUsed) const; + SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const; + void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; + public: SITargetLowering(TargetMachine &tm); @@ -52,6 +53,8 @@ public: virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const; + virtual void AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const; int32_t analyzeImmediate(const SDNode *N) const; }; diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index 3891ddb..f737ddd 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -284,33 +284,33 @@ let Uses = [EXEC] in { class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : Enc64<outs, ins, asm, pattern> { - bits<8> VDATA; - bits<12> OFFSET; - bits<1> OFFEN; - bits<1> IDXEN; - bits<1> GLC; - bits<1> ADDR64; - bits<1> LDS; - bits<8> VADDR; - bits<7> SRSRC; - bits<1> SLC; - bits<1> TFE; - bits<8> SOFFSET; - - let Inst{11-0} = OFFSET; - let Inst{12} = OFFEN; - let Inst{13} = IDXEN; - let Inst{14} = GLC; - let Inst{15} = ADDR64; - let Inst{16} = LDS; + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; + let Inst{16} = lds; let Inst{24-18} = op; let Inst{31-26} = 0x38; //encoding - let Inst{39-32} = VADDR; - let Inst{47-40} = VDATA; - let Inst{52-48} = SRSRC{6-2}; - let Inst{54} = SLC; - let Inst{55} = TFE; - let Inst{63-56} = SOFFSET; + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; let VM_CNT = 1; let EXP_CNT = 1; diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index de2373b..9a04c60 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -58,6 +58,10 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 }; + const int16_t Sub0_2[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 + }; + 
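+  // Each Sub0_* table lists the 32-bit sub-registers a wide copy is split
+  // into, in order; the trailing 0 terminates the list.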
const int16_t Sub0_1[] = { AMDGPU::sub0, AMDGPU::sub1, 0 }; @@ -65,6 +69,26 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Opcode; const int16_t *SubIndices; + if (AMDGPU::M0 == DestReg) { + // Check if M0 isn't already set to this value + for (MachineBasicBlock::reverse_iterator E = MBB.rend(), + I = MachineBasicBlock::reverse_iterator(MI); I != E; ++I) { + + if (!I->definesRegister(AMDGPU::M0)) + continue; + + unsigned Opc = I->getOpcode(); + if (Opc != TargetOpcode::COPY && Opc != AMDGPU::S_MOV_B32) + break; + + if (!I->readsRegister(SrcReg)) + break; + + // The copy isn't necessary + return; + } + } + if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) @@ -105,6 +129,11 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opcode = AMDGPU::V_MOV_B32_e32; SubIndices = Sub0_1; + } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_2; + } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || AMDGPU::SReg_128RegClass.contains(SrcReg)); @@ -138,6 +167,21 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } } +unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const { + + int NewOpc; + + // Try to map original to commuted opcode + if ((NewOpc = AMDGPU::getCommuteRev(Opcode)) != -1) + return NewOpc; + + // Try to map commuted to original opcode + if ((NewOpc = AMDGPU::getCommuteOrig(Opcode)) != -1) + return NewOpc; + + return Opcode; +} + MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { @@ -145,7 +189,12 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, !MI->getOperand(2).isReg()) return 0; - return TargetInstrInfo::commuteInstruction(MI, NewMI); + MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + + if (MI) + MI->setDesc(get(commuteOpcode(MI->getOpcode()))); + + return MI; } MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg, diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 5789af5..87eff4d 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -35,6 +35,8 @@ public: unsigned DestReg, unsigned SrcReg, bool KillSrc) const; + unsigned commuteOpcode(unsigned Opcode) const; + virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI=false) const; @@ -76,6 +78,9 @@ public: namespace AMDGPU { int getVOPe64(uint16_t Opcode); + int getCommuteRev(uint16_t Opcode); + int getCommuteOrig(uint16_t Opcode); + int isMIMG(uint16_t Opcode); } // End namespace AMDGPU diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 2f10c38..aafc331 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -26,6 +26,10 @@ def HI32 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32); }]>; +def SIbuffer_store : SDNode<"AMDGPUISD::BUFFER_STORE", + SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayStore]>; + def IMM8bitDWORD : ImmLeaf < i32, [{ return (Imm & ~0x3FC) == 0; @@ -138,6 +142,11 @@ class VOP <string opName> { string OpName = opName; } +class VOP2_REV <string revOp, bit isOrig> { + string RevOp = revOp; + bit IsOrig = isOrig; +} + multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src, string opName, list<dag> pattern> { @@ -166,11 +175,11 @@ multiclass 
VOP1_64 <bits<8> op, string opName, list<dag> pattern> : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>; multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc, - string opName, list<dag> pattern> { + string opName, list<dag> pattern, string revOp> { def _e32 : VOP2 < op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), opName#"_e32 $dst, $src0, $src1", pattern - >, VOP <opName>; + >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; def _e64 : VOP3 < {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, @@ -179,23 +188,26 @@ multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc, i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg), opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] - >, VOP <opName> { + >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { let SRC2 = SIOperand.ZERO; } } -multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern> - : VOP2_Helper <op, VReg_32, VSrc_32, opName, pattern>; +multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern, + string revOp = opName> + : VOP2_Helper <op, VReg_32, VSrc_32, opName, pattern, revOp>; -multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern> - : VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern>; +multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern, + string revOp = opName> + : VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern, revOp>; -multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern> { +multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern, + string revOp = opName> { def _e32 : VOP2 < op, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1), opName#"_e32 $dst, $src0, $src1", pattern - >, VOP <opName>; + >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; def _e64 : VOP3b < {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, @@ -204,7 +216,7 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern> { i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg), opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] - >, VOP <opName> { + >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { let SRC2 = SIOperand.ZERO; /* the VOP2 variant puts the carry out into VCC, the VOP3 variant can write it into any SGPR. 
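The VOP2_REV mixin threaded through VOP2_32/VOP2_64/VOP2b_32 above tags each *_e32/*_e64 def with the name of its reversed twin and whether the def is the original; the getCommuteRev/getCommuteOrig instruction mappings defined further down in this file turn that relation into lookup tables. A minimal sketch of the intended lookup chain (hypothetical caller code, mirroring SIInstrInfo::commuteOpcode from this patch):

  // Map an opcode to its commuted twin, in whichever direction applies.
  int NewOpc = AMDGPU::getCommuteRev(Opcode);     // original -> reversed
  if (NewOpc == -1)
    NewOpc = AMDGPU::getCommuteOrig(Opcode);      // reversed -> original
  if (NewOpc == -1)
    NewOpc = Opcode;                              // no renamed twin exists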
We currently don't use the carry out, @@ -247,14 +259,14 @@ multiclass VOPC_64 <bits<8> op, string opName, class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 < op, (outs VReg_32:$dst), (ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2, - i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg), + InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern >, VOP <opName>; class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 < op, (outs VReg_64:$dst), (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2, - i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg), + InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern >, VOP <opName>; @@ -277,17 +289,39 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF < op, - (outs regClass:$dst), + (outs regClass:$vdata), (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), - asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, " + asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, " #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset", []> { let mayLoad = 1; let mayStore = 0; } +class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass, + ValueType VT> : + MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr), + name#" $vdata, $srsrc + $vaddr", + [(SIbuffer_store (VT vdataClass:$vdata), (i128 SReg_128:$srsrc), + (i64 VReg_64:$vaddr))]> { + + let mayLoad = 0; + let mayStore = 1; + + // Encoding + let offset = 0; + let offen = 0; + let idxen = 0; + let glc = 0; + let addr64 = 1; + let lds = 0; + let slc = 0; + let tfe = 0; + let soffset = 128; // ZERO +} + class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < op, (outs regClass:$dst), @@ -305,13 +339,14 @@ class MIMG_Load_Helper <bits<7> op, string asm> : MIMG < op, (outs VReg_128:$vdata), (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_32:$vaddr, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp), asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", []> { let mayLoad = 1; let mayStore = 0; + let hasPostISelHook = 1; } //===----------------------------------------------------------------------===// @@ -327,4 +362,31 @@ def getVOPe64 : InstrMapping { let ValueCols = [["8"]]; } +// Maps an original opcode to its commuted version +def getCommuteRev : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +// Maps an commuted opcode to its original version +def getCommuteOrig : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Test if the supplied opcode is an MIMG instruction +def isMIMG : InstrMapping { + let FilterClass = "MIMG_Load_Helper"; + let RowFields = ["Inst"]; + let ColFields = ["Size"]; + let KeyCol = ["8"]; + let ValueCols = [["8"]]; +} + include "SIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 05b04a9..3ff4548 100644 
--- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -108,7 +108,7 @@ VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 def S_CMPK_EQ_I32 : SOPK < 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1), "S_CMPK_EQ_I32", - [(set SCCReg:$dst, (setcc SReg_32:$src0, imm:$src1, SETEQ))] + [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] >; */ @@ -408,8 +408,14 @@ def BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", def BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>; //def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>; //def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>; -//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>; -//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>; + +def BUFFER_STORE_DWORD : MUBUF_Store_Helper < + 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32 +>; + +def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < + 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, i64 +>; //def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>; //def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; //def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>; @@ -594,12 +600,12 @@ defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>; //defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>; //defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>; defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", - [(set VReg_32:$dst, (sint_to_fp VSrc_32:$src0))] + [(set f32:$dst, (sint_to_fp i32:$src0))] >; -//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>; -//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>; +defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>; +defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>; defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", - [(set (i32 VReg_32:$dst), (fp_to_sint VSrc_32:$src0))] + [(set i32:$dst, (fp_to_sint f32:$src0))] >; defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; ////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>; @@ -616,35 +622,35 @@ defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; //defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; //defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", - [(set VReg_32:$dst, (AMDGPUfract VSrc_32:$src0))] + [(set f32:$dst, (AMDGPUfract f32:$src0))] >; defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>; defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", - [(set VReg_32:$dst, (fceil VSrc_32:$src0))] + [(set f32:$dst, (fceil f32:$src0))] >; defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", - [(set VReg_32:$dst, (frint VSrc_32:$src0))] + [(set f32:$dst, (frint f32:$src0))] >; defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", - [(set VReg_32:$dst, (ffloor VSrc_32:$src0))] + [(set f32:$dst, (ffloor f32:$src0))] >; defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", - [(set VReg_32:$dst, (fexp2 VSrc_32:$src0))] + [(set f32:$dst, (fexp2 f32:$src0))] >; defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", - [(set VReg_32:$dst, (flog2 VSrc_32:$src0))] + [(set f32:$dst, (flog2 f32:$src0))] >; defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; defm V_RCP_LEGACY_F32 : 
VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", - [(set VReg_32:$dst, (fdiv FP_ONE, VSrc_32:$src0))] + [(set f32:$dst, (fdiv FP_ONE, f32:$src0))] >; defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; defm V_RSQ_LEGACY_F32 : VOP1_32 < 0x0000002d, "V_RSQ_LEGACY_F32", - [(set VReg_32:$dst, (int_AMDGPU_rsq VSrc_32:$src0))] + [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))] >; defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>; defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>; @@ -787,14 +793,13 @@ def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", - [(set (i32 VReg_32:$dst), (select (i1 SSrc_64:$src2), - VSrc_32:$src1, VSrc_32:$src0))] + [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))] >; //f32 pattern for V_CNDMASK_B32_e64 def : Pat < - (f32 (select (i1 SSrc_64:$src2), VSrc_32:$src1, VSrc_32:$src0)), - (V_CNDMASK_B32_e64 VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2) + (f32 (select i1:$src2, f32:$src1, f32:$src0)), + (V_CNDMASK_B32_e64 $src0, $src1, $src2) >; defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>; @@ -802,26 +807,26 @@ defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>; let isCommutable = 1 in { defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", - [(set VReg_32:$dst, (fadd VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (fadd f32:$src0, f32:$src1))] >; -} // End isCommutable = 1 defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", - [(set VReg_32:$dst, (fsub VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (fsub f32:$src0, f32:$src1))] >; +defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", [], "V_SUB_F32">; +} // End isCommutable = 1 -defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>; defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>; let isCommutable = 1 in { defm V_MUL_LEGACY_F32 : VOP2_32 < 0x00000007, "V_MUL_LEGACY_F32", - [(set VReg_32:$dst, (int_AMDGPU_mul VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (int_AMDGPU_mul f32:$src0, f32:$src1))] >; defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", - [(set VReg_32:$dst, (fmul VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (fmul f32:$src0, f32:$src1))] >; } // End isCommutable = 1 @@ -834,11 +839,11 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", let isCommutable = 1 in { defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32", - [(set VReg_32:$dst, (AMDGPUfmin VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (AMDGPUfmin f32:$src0, f32:$src1))] >; defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", - [(set VReg_32:$dst, (AMDGPUfmax VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (AMDGPUfmax f32:$src0, f32:$src1))] >; defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; @@ -848,27 +853,29 @@ defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>; defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>; defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>; -} // End isCommutable = 1 +defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", + [(set i32:$dst, (srl i32:$src0, i32:$src1))] +>; +defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">; -defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>; -defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>; 
-defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
-defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>;
-defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
-  [(set VReg_32:$dst, (shl VSrc_32:$src0, (i32 VReg_32:$src1)))]
+defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
+  [(set i32:$dst, (sra i32:$src0, i32:$src1))]
>;
-defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>;
+defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;

-let isCommutable = 1 in {
+defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
+  [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+>;
+defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;

defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
-  [(set VReg_32:$dst, (and VSrc_32:$src0, VReg_32:$src1))]
+  [(set i32:$dst, (and i32:$src0, i32:$src1))]
>;
defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
-  [(set VReg_32:$dst, (or VSrc_32:$src0, VReg_32:$src1))]
+  [(set i32:$dst, (or i32:$src0, i32:$src1))]
>;
defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
-  [(set VReg_32:$dst, (xor VSrc_32:$src0, VReg_32:$src1))]
+  [(set i32:$dst, (xor i32:$src0, i32:$src1))]
>;

} // End isCommutable = 1
@@ -880,31 +887,30 @@ defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
-let Defs = [VCC] in { // Carry-out goes to VCC
-let isCommutable = 1 in {
+let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC

defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
-  [(set VReg_32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
+  [(set i32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
>;
-} // End isCommutable = 1

defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
-  [(set VReg_32:$dst, (sub (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
+  [(set i32:$dst, (sub i32:$src0, i32:$src1))]
>;
+defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], "V_SUB_I32">;

-defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", []>;
let Uses = [VCC] in { // Carry-out comes from VCC
defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", []>;
defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", []>;
-defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", []>;
+defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], "V_SUBB_U32">;
} // End Uses = [VCC]
-} // End Defs = [VCC]
+} // End isCommutable = 1, Defs = [VCC]
+
defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>;
////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>;
////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>;
////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>;
defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32",
-  [(set VReg_32:$dst, (int_SI_packf16 VSrc_32:$src0, VReg_32:$src1))]
+  [(set i32:$dst, (int_SI_packf16 f32:$src0, f32:$src1))]
>;
////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
@@ -941,6 +947,7 @@ def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
+defm : BFIPatterns <V_BFI_B32>;
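BFIPatterns itself is defined elsewhere in the backend, so only its instantiation for V_BFI_B32 is visible here. Assuming the conventional bitfield-insert semantics, a scalar sketch of what the selected instruction computes:

#include <cstdint>

// V_BFI_B32: src0 selects, bit by bit, between src1 and src2.
uint32_t bfi_b32(uint32_t mask, uint32_t a, uint32_t b) {
  return (a & mask) | (b & ~mask);
}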
"V_FMA_F32", []>; def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>; //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; @@ -971,14 +978,31 @@ def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>; def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>; def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>; def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; + +let isCommutable = 1 in { + def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; +def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; + +} // isCommutable = 1 + def : Pat < - (mul VSrc_32:$src0, VReg_32:$src1), - (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0) + (mul i32:$src0, i32:$src1), + (V_MUL_LO_I32 $src0, $src1, (i32 0)) >; -def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; + +def : Pat < + (mulhu i32:$src0, i32:$src1), + (V_MUL_HI_U32 $src0, $src1, (i32 0)) +>; + +def : Pat < + (mulhs i32:$src0, i32:$src1), + (V_MUL_HI_I32 $src0, $src1, (i32 0)) +>; + def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; @@ -1001,34 +1025,27 @@ def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>; def S_CSELECT_B32 : SOP2 < 0x0000000a, (outs SReg_32:$dst), (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", - [(set (i32 SReg_32:$dst), (select (i1 SCCReg:$scc), - SReg_32:$src0, SReg_32:$src1))] + [] >; def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; -// f32 pattern for S_CSELECT_B32 -def : Pat < - (f32 (select (i1 SCCReg:$scc), SReg_32:$src0, SReg_32:$src1)), - (S_CSELECT_B32 SReg_32:$src0, SReg_32:$src1, SCCReg:$scc) ->; - def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>; def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", - [(set SReg_64:$dst, (i64 (and SSrc_64:$src0, SSrc_64:$src1)))] + [(set i64:$dst, (and i64:$src0, i64:$src1))] >; def : Pat < - (i1 (and SSrc_64:$src0, SSrc_64:$src1)), - (S_AND_B64 SSrc_64:$src0, SSrc_64:$src1) + (i1 (and i1:$src0, i1:$src1)), + (S_AND_B64 $src0, $src1) >; def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>; def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>; def : Pat < - (i1 (or SSrc_64:$src0, SSrc_64:$src1)), - (S_OR_B64 SSrc_64:$src0, SSrc_64:$src1) + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B64 $src0, $src1) >; def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>; def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>; @@ -1067,17 +1084,6 @@ def LOAD_CONST : AMDGPUShaderInst < [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))] >; -let usesCustomInserter = 1 in { - -def SI_WQM : InstSI < - (outs), - (ins), - "SI_WQM", - [(int_SI_wqm)] ->; - -} // end usesCustomInserter - // SI Psuedo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. 
@@ -1090,14 +1096,14 @@ def SI_IF : InstSI <
  (outs SReg_64:$dst),
  (ins SReg_64:$vcc, brtarget:$target),
  "SI_IF $dst, $vcc, $target",
-  [(set SReg_64:$dst, (int_SI_if SReg_64:$vcc, bb:$target))]
+  [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))]
>;

def SI_ELSE : InstSI <
  (outs SReg_64:$dst),
  (ins SReg_64:$src, brtarget:$target),
  "SI_ELSE $dst, $src, $target",
-  [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> {
+  [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]> {

  let Constraints = "$src = $dst";
}
@@ -1106,7 +1112,7 @@ def SI_LOOP : InstSI <
  (outs),
  (ins SReg_64:$saved, brtarget:$target),
  "SI_LOOP $saved, $target",
-  [(int_SI_loop SReg_64:$saved, bb:$target)]
+  [(int_SI_loop i64:$saved, bb:$target)]
>;

} // end isBranch = 1, isTerminator = 1
@@ -1115,35 +1121,35 @@ def SI_BREAK : InstSI <
  (outs SReg_64:$dst),
  (ins SReg_64:$src),
  "SI_BREAK $dst, $src",
-  [(set SReg_64:$dst, (int_SI_break SReg_64:$src))]
+  [(set i64:$dst, (int_SI_break i64:$src))]
>;

def SI_IF_BREAK : InstSI <
  (outs SReg_64:$dst),
  (ins SReg_64:$vcc, SReg_64:$src),
  "SI_IF_BREAK $dst, $vcc, $src",
-  [(set SReg_64:$dst, (int_SI_if_break SReg_64:$vcc, SReg_64:$src))]
+  [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))]
>;

def SI_ELSE_BREAK : InstSI <
  (outs SReg_64:$dst),
  (ins SReg_64:$src0, SReg_64:$src1),
  "SI_ELSE_BREAK $dst, $src0, $src1",
-  [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))]
+  [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))]
>;

def SI_END_CF : InstSI <
  (outs),
  (ins SReg_64:$saved),
  "SI_END_CF $saved",
-  [(int_SI_end_cf SReg_64:$saved)]
+  [(int_SI_end_cf i64:$saved)]
>;

def SI_KILL : InstSI <
  (outs),
  (ins VReg_32:$src),
  "SI_KIL $src",
-  [(int_AMDGPU_kill VReg_32:$src)]
+  [(int_AMDGPU_kill f32:$src)]
>;

} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
@@ -1177,8 +1183,8 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;

} // end IsCodeGenOnly, isPseudo

def : Pat<
-  (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2),
-  (V_CNDMASK_B32_e64 VReg_32:$src2, VReg_32:$src1, (V_CMP_GT_F32_e64 0, VReg_32:$src0))
+  (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2),
+  (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0))
>;

def : Pat <
@@ -1188,99 +1194,80 @@ def : Pat <

/* int_SI_vs_load_input */
def : Pat<
-  (int_SI_vs_load_input SReg_128:$tlst, IMM12bit:$attr_offset,
-   VReg_32:$buf_idx_vgpr),
+  (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset,
+   i32:$buf_idx_vgpr),
  (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
-   VReg_32:$buf_idx_vgpr, SReg_128:$tlst,
-   0, 0, 0)
+   $buf_idx_vgpr, $tlst, 0, 0, 0)
>;

/* int_SI_export */
def : Pat <
  (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
-   VReg_32:$src0,VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
+   f32:$src0, f32:$src1, f32:$src2, f32:$src3),
  (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
-   VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3)
+   $src0, $src1, $src2, $src3)
>;

+/********** ======================= **********/
+/********** Image sampling patterns **********/
+/********** ======================= **********/

/* int_SI_sample for simple 1D texture lookup */
def : Pat <
-  (int_SI_sample imm:$writemask, (v1i32 VReg_32:$addr),
-   SReg_256:$rsrc, SReg_128:$sampler, imm),
-  (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0,
-   (i32 (COPY_TO_REGCLASS VReg_32:$addr, VReg_32)),
-   SReg_256:$rsrc, SReg_128:$sampler)
+  (int_SI_sample v1i32:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
+  (IMAGE_SAMPLE 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
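The rewritten output pattern hard-codes 0xf where the removed lines forwarded imm:$writemask, i.e. it now always requests all four channels. Assuming the first IMAGE_SAMPLE operand is the channel write-mask (dmask), as the old $writemask operand suggests:

// Hypothetical helper: channel c of the sample result is written iff bit c of dmask is set.
bool channel_written(unsigned dmask, unsigned chan) {
  return (dmask >> chan) & 1u;
}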
-class SamplePattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
-                    ValueType addr_type> : Pat <
-    (name imm:$writemask, (addr_type addr_class:$addr),
-     SReg_256:$rsrc, SReg_128:$sampler, imm),
-    (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0,
-     (EXTRACT_SUBREG addr_class:$addr, sub0),
-     SReg_256:$rsrc, SReg_128:$sampler)
+class SamplePattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
+    (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;

-class SampleRectPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
-                        ValueType addr_type> : Pat <
-    (name imm:$writemask, (addr_type addr_class:$addr),
-     SReg_256:$rsrc, SReg_128:$sampler, TEX_RECT),
-    (opcode imm:$writemask, 1, 0, 0, 0, 0, 0, 0,
-     (EXTRACT_SUBREG addr_class:$addr, sub0),
-     SReg_256:$rsrc, SReg_128:$sampler)
+class SampleRectPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_RECT),
+    (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;

-class SampleArrayPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
-                         ValueType addr_type> : Pat <
-    (name imm:$writemask, (addr_type addr_class:$addr),
-     SReg_256:$rsrc, SReg_128:$sampler, TEX_ARRAY),
-    (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0,
-     (EXTRACT_SUBREG addr_class:$addr, sub0),
-     SReg_256:$rsrc, SReg_128:$sampler)
+class SampleArrayPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_ARRAY),
+    (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;

class SampleShadowPattern<Intrinsic name, MIMG opcode,
-                          RegisterClass addr_class, ValueType addr_type> : Pat <
-    (name imm:$writemask, (addr_type addr_class:$addr),
-     SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW),
-    (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0,
-     (EXTRACT_SUBREG addr_class:$addr, sub0),
-     SReg_256:$rsrc, SReg_128:$sampler)
+                          ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW),
+    (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;

class SampleShadowArrayPattern<Intrinsic name, MIMG opcode,
-                               RegisterClass addr_class, ValueType addr_type> : Pat <
-    (name imm:$writemask, (addr_type addr_class:$addr),
-     SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW_ARRAY),
-    (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0,
-     (EXTRACT_SUBREG addr_class:$addr, sub0),
-     SReg_256:$rsrc, SReg_128:$sampler)
+                               ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW_ARRAY),
+    (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;

/* int_SI_sample* for texture lookups consuming more address parameters */
-multiclass SamplePatterns<RegisterClass addr_class, ValueType addr_type> {
-  def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
-  def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
-  def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
-  def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>;
-
-  def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>;
-  def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>;
-  def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>;
-
-  def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>;
-  def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>;
-  def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>;
+multiclass SamplePatterns<ValueType addr_type> {
+  def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
+  def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
+  def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
+  def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
+  def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
+
+  def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
+  def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
+  def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
+  def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
+
+  def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
+  def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
+  def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
+  def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
}

-defm : SamplePatterns<VReg_64, v2i32>;
-defm : SamplePatterns<VReg_128, v4i32>;
-defm : SamplePatterns<VReg_256, v8i32>;
-defm : SamplePatterns<VReg_512, v16i32>;
+defm : SamplePatterns<v2i32>;
+defm : SamplePatterns<v4i32>;
+defm : SamplePatterns<v8i32>;
+defm : SamplePatterns<v16i32>;

/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting  **********/
@@ -1288,77 +1275,77 @@ defm : SamplePatterns<VReg_512, v16i32>;

foreach Index = 0-2 in {
  def Extract_Element_v2i32_#Index : Extract_Element <
-    i32, v2i32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v2i32_#Index : Insert_Element <
-    i32, v2i32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v2f32_#Index : Extract_Element <
-    f32, v2f32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v2f32_#Index : Insert_Element <
-    f32, v2f32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-3 in {
  def Extract_Element_v4i32_#Index : Extract_Element <
-    i32, v4i32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4i32_#Index : Insert_Element <
-    i32, v4i32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v4f32_#Index : Extract_Element <
-    f32, v4f32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4f32_#Index : Insert_Element <
-    f32, v4f32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-7 in {
  def Extract_Element_v8i32_#Index : Extract_Element <
-    i32, v8i32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8i32_#Index : Insert_Element <
-    i32, v8i32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v8f32_#Index : Extract_Element <
-    f32, v8f32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8f32_#Index : Insert_Element <
-    f32, v8f32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-15 in {
  def Extract_Element_v16i32_#Index : Extract_Element <
-    i32, v16i32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16i32_#Index : Insert_Element <
-    i32, v16i32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v16f32_#Index : Extract_Element <
-    f32, v16f32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16f32_#Index : Insert_Element <
-    f32, v16f32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

-def : Vector1_Build <v1i32, VReg_32, i32, VReg_32>;
-def : Vector2_Build <v2i32, VReg_64, i32, VReg_32>;
-def : Vector2_Build <v2f32, VReg_64, f32, VReg_32>;
-def : Vector4_Build <v4i32, VReg_128, i32, VReg_32>;
-def : Vector4_Build <v4f32, VReg_128, f32, VReg_32>;
-def : Vector8_Build <v8i32, VReg_256, i32, VReg_32>;
-def : Vector8_Build <v8f32, VReg_256, f32, VReg_32>;
-def : Vector16_Build <v16i32, VReg_512, i32, VReg_32>;
-def : Vector16_Build <v16f32, VReg_512, f32, VReg_32>;
+def : Vector1_Build <v1i32, i32, VReg_32>;
+def : Vector2_Build <v2i32, i32>;
+def : Vector2_Build <v2f32, f32>;
+def : Vector4_Build <v4i32, i32>;
+def : Vector4_Build <v4f32, f32>;
+def : Vector8_Build <v8i32, i32>;
+def : Vector8_Build <v8f32, f32>;
+def : Vector16_Build <v16i32, i32>;
+def : Vector16_Build <v16f32, f32>;

def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <i32, f32, VReg_32>;
@@ -1371,20 +1358,20 @@ def : BitConvert <f32, i32, VReg_32>;
/********** =================== **********/

def : Pat <
-  (int_AMDIL_clamp VReg_32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
-  (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */),
+  (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
+  (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */),
   0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */)
>;

def : Pat <
-  (fabs VReg_32:$src),
-  (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */),
+  (fabs f32:$src),
+  (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */),
   1 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */)
>;

def : Pat <
-  (fneg VReg_32:$src),
-  (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */),
+  (fneg f32:$src),
+  (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */),
   0 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 1 /* NEG */)
>;
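The fabs/fneg patterns above fold the operation into V_ADD_F32_e64's per-operand ABS and NEG input modifiers, adding zero so only the modifier has an effect. In IEEE-754 terms the modifiers reduce to sign-bit arithmetic; a scalar sketch, not code from this patch:

#include <cstdint>
#include <cstring>

static uint32_t bits(float f)     { uint32_t b; std::memcpy(&b, &f, 4); return b; }
static float    value(uint32_t b) { float f; std::memcpy(&f, &b, 4); return f; }

float fabs_f32(float x) { return value(bits(x) & 0x7fffffffu); }  // ABS clears the sign bit
float fneg_f32(float x) { return value(bits(x) ^ 0x80000000u); }  // NEG flips the sign bit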
@@ -1425,16 +1412,16 @@ def : Pat <
/********** ===================== **********/

def : Pat <
-  (int_SI_fs_constant imm:$attr_chan, imm:$attr, M0Reg:$params),
-  (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, M0Reg:$params)
+  (int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params),
+  (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, $params)
>;

def : Pat <
-  (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, VReg_64:$ij),
-  (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG VReg_64:$ij, sub0),
-   imm:$attr_chan, imm:$attr, M0Reg:$params),
-   (EXTRACT_SUBREG VReg_64:$ij, sub1),
-   imm:$attr_chan, imm:$attr, M0Reg:$params)
+  (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, v2i32:$ij),
+  (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0),
+   imm:$attr_chan, imm:$attr, i32:$params),
+   (EXTRACT_SUBREG $ij, sub1),
+   imm:$attr_chan, imm:$attr, $params)
>;

/********** ================== **********/
@@ -1442,102 +1429,111 @@ def : Pat <
/********** ================== **********/

/* llvm.AMDGPU.pow */
-/* XXX: We are using IEEE MUL, not the 0 * anything = 0 MUL, is this correct? */
-def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_F32_e32, VReg_32>;
+def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;

def : Pat <
-  (int_AMDGPU_div VSrc_32:$src0, VSrc_32:$src1),
-  (V_MUL_LEGACY_F32_e32 VSrc_32:$src0, (V_RCP_LEGACY_F32_e32 VSrc_32:$src1))
+  (int_AMDGPU_div f32:$src0, f32:$src1),
+  (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1))
>;

def : Pat<
-  (fdiv VSrc_32:$src0, VSrc_32:$src1),
-  (V_MUL_F32_e32 VSrc_32:$src0, (V_RCP_F32_e32 VSrc_32:$src1))
+  (fdiv f32:$src0, f32:$src1),
+  (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1))
>;

def : Pat <
-  (fcos VSrc_32:$src0),
-  (V_COS_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
+  (fcos f32:$src0),
+  (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
>;

def : Pat <
-  (fsin VSrc_32:$src0),
-  (V_SIN_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
+  (fsin f32:$src0),
+  (V_SIN_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
>;

def : Pat <
-  (int_AMDGPU_cube VReg_128:$src),
+  (int_AMDGPU_cube v4f32:$src),
  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
-   (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
-                 (EXTRACT_SUBREG VReg_128:$src, sub1),
-                 (EXTRACT_SUBREG VReg_128:$src, sub2),
-                 0, 0, 0, 0), sub0),
-  (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
-                (EXTRACT_SUBREG VReg_128:$src, sub1),
-                (EXTRACT_SUBREG VReg_128:$src, sub2),
-                0, 0, 0, 0), sub1),
-  (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
-                (EXTRACT_SUBREG VReg_128:$src, sub1),
-                (EXTRACT_SUBREG VReg_128:$src, sub2),
-                0, 0, 0, 0), sub2),
-  (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
-                (EXTRACT_SUBREG VReg_128:$src, sub1),
-                (EXTRACT_SUBREG VReg_128:$src, sub2),
-                0, 0, 0, 0), sub3)
+   (V_CUBETC_F32 (EXTRACT_SUBREG $src, sub0),
+                 (EXTRACT_SUBREG $src, sub1),
+                 (EXTRACT_SUBREG $src, sub2)),
+   sub0),
+  (V_CUBESC_F32 (EXTRACT_SUBREG $src, sub0),
+                (EXTRACT_SUBREG $src, sub1),
+                (EXTRACT_SUBREG $src, sub2)),
+  sub1),
+  (V_CUBEMA_F32 (EXTRACT_SUBREG $src, sub0),
+                (EXTRACT_SUBREG $src, sub1),
+                (EXTRACT_SUBREG $src, sub2)),
+  sub2),
+  (V_CUBEID_F32 (EXTRACT_SUBREG $src, sub0),
+                (EXTRACT_SUBREG $src, sub1),
+                (EXTRACT_SUBREG $src, sub2)),
+  sub3)
>;

def : Pat <
-  (i32 (sext (i1 SReg_64:$src0))),
-  (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0)
+  (i32 (sext i1:$src0)),
+  (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
>;
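In scalar terms, the division patterns above expand to a reciprocal-multiply, and the trig patterns pre-scale the argument by 1/2π, which suggests V_SIN_F32/V_COS_F32 take their input in full turns. A rough model under that assumption (hw_sin stands in for the hardware opcode):

#include <cmath>

// hw_sin models V_SIN_F32, whose argument is apparently in units of turns.
static float hw_sin(float turns) { return std::sin(turns * 6.28318530717958647692f); }

float fdiv_lowered(float a, float b) { return a * (1.0f / b); }  // V_MUL_F32 + V_RCP_F32
float fsin_lowered(float x) { return hw_sin(x * (1.0f / 6.28318530717958647692f)); }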
// 1. Offset as 8-bit DWORD immediate
def : Pat <
-  (int_SI_load_const SReg_128:$sbase, IMM8bitDWORD:$offset),
-  (S_BUFFER_LOAD_DWORD_IMM SReg_128:$sbase, IMM8bitDWORD:$offset)
+  (int_SI_load_const v16i8:$sbase, IMM8bitDWORD:$offset),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, IMM8bitDWORD:$offset)
>;

// 2. Offset loaded in a 32-bit SGPR
def : Pat <
-  (int_SI_load_const SReg_128:$sbase, imm:$offset),
-  (S_BUFFER_LOAD_DWORD_SGPR SReg_128:$sbase, (S_MOV_B32 imm:$offset))
+  (int_SI_load_const v16i8:$sbase, imm:$offset),
+  (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
>;

// 3. Offset in a 32-bit VGPR
def : Pat <
-  (int_SI_load_const SReg_128:$sbase, VReg_32:$voff),
-  (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, VReg_32:$voff, SReg_128:$sbase, 0, 0, 0)
+  (int_SI_load_const v16i8:$sbase, i32:$voff),
+  (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, $voff, $sbase, 0, 0, 0)
+>;
+
+// The multiplication scales from [0,1] to the unsigned integer range
+def : Pat <
+  (AMDGPUurecip i32:$src0),
+  (V_CVT_U32_F32_e32
+    (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1,
+      (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;

/********** ================== **********/
/********** VOP3 Patterns **********/
/********** ================== **********/

-def : Pat <(f32 (fadd (fmul VSrc_32:$src0, VSrc_32:$src1), VSrc_32:$src2)),
-           (V_MAD_F32 VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2,
-            0, 0, 0, 0)>;
+def : Pat <
+  (f32 (fadd (fmul f32:$src0, f32:$src1), f32:$src2)),
+  (V_MAD_F32 $src0, $src1, $src2)
+>;

/********** ================== **********/
/********** SMRD Patterns **********/
/********** ================== **********/

multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
+
  // 1. Offset as 8-bit DWORD immediate
  def : Pat <
-    (constant_load (SIadd64bit32bit SReg_64:$sbase, IMM8bitDWORD:$offset)),
-    (vt (Instr_IMM SReg_64:$sbase, IMM8bitDWORD:$offset))
+    (constant_load (SIadd64bit32bit i64:$sbase, IMM8bitDWORD:$offset)),
+    (vt (Instr_IMM $sbase, IMM8bitDWORD:$offset))
  >;

  // 2. Offset loaded in a 32-bit SGPR
  def : Pat <
-    (constant_load (SIadd64bit32bit SReg_64:$sbase, imm:$offset)),
-    (vt (Instr_SGPR SReg_64:$sbase, (S_MOV_B32 imm:$offset)))
+    (constant_load (SIadd64bit32bit i64:$sbase, imm:$offset)),
+    (vt (Instr_SGPR $sbase, (S_MOV_B32 imm:$offset)))
  >;

  // 3. No offset at all
  def : Pat <
-    (constant_load SReg_64:$sbase),
-    (vt (Instr_IMM SReg_64:$sbase, 0))
+    (constant_load i64:$sbase),
+    (vt (Instr_IMM $sbase, 0))
  >;
}
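The AMDGPUurecip expansion above computes 1/x in single precision and rescales it by what CONST.FP_UINT_MAX_PLUS_1 presumably encodes, 2^32, to land back in the unsigned integer range. A scalar sketch, not code from this patch:

#include <cstdint>

// Approximate 2^32 / d; a building block for expanding unsigned division.
uint32_t urecip(uint32_t d) {
  float r = 1.0f / (float)d;             // V_RCP_IFLAG_F32(V_CVT_F32_U32)
  return (uint32_t)(r * 4294967296.0f);  // scale [0,1] up to [0,2^32)
}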
@@ -1550,44 +1546,50 @@ defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
/********** Indirect addressing **********/
/********** ====================== **********/

-multiclass SI_INDIRECT_Pattern <RegisterClass rc, ValueType vt,
-                                SI_INDIRECT_DST IndDst> {
+multiclass SI_INDIRECT_Pattern <ValueType vt, SI_INDIRECT_DST IndDst> {
+
+  // 1. Extract with offset
  def : Pat<
-    (vector_extract (vt rc:$vec),
-      (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
-    ),
-    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off))
+    (vector_extract vt:$vec, (i64 (zext (add i32:$idx, imm:$off)))),
+    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
  >;

  // 2. Extract without offset
  def : Pat<
-    (vector_extract (vt rc:$vec),
-      (i64 (zext (i32 VReg_32:$idx)))
-    ),
-    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0))
+    (vector_extract vt:$vec, (i64 (zext i32:$idx))),
+    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
  >;

  // 3. Insert with offset
  def : Pat<
-    (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
-      (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
-    ),
-    (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off, VReg_32:$val))
+    (vector_insert vt:$vec, f32:$val, (i64 (zext (add i32:$idx, imm:$off)))),
+    (IndDst (IMPLICIT_DEF), $vec, $idx, imm:$off, $val)
  >;

  // 4. Insert without offset
  def : Pat<
-    (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
-      (i64 (zext (i32 VReg_32:$idx)))
-    ),
-    (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0, VReg_32:$val))
+    (vector_insert vt:$vec, f32:$val, (i64 (zext i32:$idx))),
+    (IndDst (IMPLICIT_DEF), $vec, $idx, 0, $val)
  >;
}

-defm : SI_INDIRECT_Pattern <VReg_64, v2f32, SI_INDIRECT_DST_V2>;
-defm : SI_INDIRECT_Pattern <VReg_128, v4f32, SI_INDIRECT_DST_V4>;
-defm : SI_INDIRECT_Pattern <VReg_256, v8f32, SI_INDIRECT_DST_V8>;
-defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>;
+defm : SI_INDIRECT_Pattern <v2f32, SI_INDIRECT_DST_V2>;
+defm : SI_INDIRECT_Pattern <v4f32, SI_INDIRECT_DST_V4>;
+defm : SI_INDIRECT_Pattern <v8f32, SI_INDIRECT_DST_V8>;
+defm : SI_INDIRECT_Pattern <v16f32, SI_INDIRECT_DST_V16>;
+
+/********** =============== **********/
+/********** Conditions **********/
+/********** =============== **********/
+
+def : Pat<
+  (i1 (setcc f32:$src0, f32:$src1, SETO)),
+  (V_CMP_O_F32_e64 $src0, $src1)
+>;
+
+def : Pat<
+  (i1 (setcc f32:$src0, f32:$src1, SETUO)),
+  (V_CMP_U_F32_e64 $src0, $src1)
+>;

} // End isSI predicate
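SETO and SETUO are the ordered/unordered comparisons: ordered means neither operand is NaN, unordered means at least one is. Scalar equivalents of the two new patterns, for reference only:

bool cmp_o_f32(float a, float b) { return a == a && b == b; }  // V_CMP_O_F32: both operands ordered
bool cmp_u_f32(float a, float b) { return a != a || b != b; }  // V_CMP_U_F32: at least one NaN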
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 33bb815..16d9d81 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -16,11 +16,10 @@ let TargetPrefix = "SI", isTarget = 1 in {

  def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
  def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
-  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrReadMem]>;
-  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ;
-  def int_SI_wqm : Intrinsic <[], [], []>;
+  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;

-  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrReadMem]>;
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;

  def int_SI_sample : Sample;
  def int_SI_sampleb : Sample;
@@ -28,8 +27,8 @@ let TargetPrefix = "SI", isTarget = 1 in {

  /* Interpolation Intrinsics */

-  def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>;
-  def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrReadMem]>;
+  def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;

  /* Control flow Intrinsics */

diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 9a027e7..2b60eb9 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -197,7 +197,8 @@ void SILowerControlFlowPass::Else(MachineInstr &MI) {
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
+  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
+          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
@@ -409,6 +410,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {

bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
  bool HaveKill = false;
+  bool NeedWQM = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -478,9 +480,22 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;
+
+        case AMDGPU::V_INTERP_P1_F32:
+        case AMDGPU::V_INTERP_P2_F32:
+        case AMDGPU::V_INTERP_MOV_F32:
+          NeedWQM = true;
+          break;
+
      }
    }
  }

+  if (NeedWQM) {
+    MachineBasicBlock &MBB = MF.front();
+    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
+  }
+
  return true;
}
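S_WQM_B64 puts EXEC into "whole quad mode": if any lane of a 2x2 pixel quad is live, all four lanes of that quad become live, which the V_INTERP opcodes rely on for taking derivatives. My reading of the mask semantics as a scalar sketch, not code from this patch:

#include <cstdint>

uint64_t wqm_b64(uint64_t exec) {
  uint64_t out = 0;
  for (int q = 0; q < 16; ++q) {       // 64 lanes = 16 quads of 4 lanes
    uint64_t quad = 0xFull << (4 * q);
    if (exec & quad)                   // any lane of this quad active?
      out |= quad;                     // then activate the whole quad
  }
  return out;
}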
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index 1a4e4cb..ee0e307 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -10,25 +10,9 @@

#include "SIMachineFunctionInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"

using namespace llvm;

-const char *SIMachineFunctionInfo::ShaderTypeAttribute = "ShaderType";
-
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
-  : MachineFunctionInfo(),
-    ShaderType(0),
-    PSInputAddr(0) {
-
-  AttributeSet Set = MF.getFunction()->getAttributes();
-  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
-                                 ShaderTypeAttribute);
-
-  if (A.isStringAttribute()) {
-    StringRef Str = A.getValueAsString();
-    if (Str.getAsInteger(0, ShaderType))
-      llvm_unreachable("Can't parse shader type!");
-  }
-}
+  : AMDGPUMachineFunction(MF),
+    PSInputAddr(0) { }
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 91a809b..6da9f7f 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -15,18 +15,15 @@
#ifndef SIMACHINEFUNCTIONINFO_H_
#define SIMACHINEFUNCTIONINFO_H_

-#include "llvm/CodeGen/MachineFunction.h"
+#include "AMDGPUMachineFunction.h"

namespace llvm {

/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
-class SIMachineFunctionInfo : public MachineFunctionInfo {
+class SIMachineFunctionInfo : public AMDGPUMachineFunction {
public:
-  static const char *ShaderTypeAttribute;
-
  SIMachineFunctionInfo(const MachineFunction &MF);
-  unsigned ShaderType;
  unsigned PSInputAddr;
};
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index 88275c5..99278ae 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -30,6 +30,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  return Reserved;
}

+unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                             MachineFunction &MF) const {
+  return RC->getNumRegs();
+}
+
const TargetRegisterClass *
SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
  switch (rc->getID()) {
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index 40171e4..caec228 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -31,6 +31,9 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {

  virtual BitVector getReservedRegs(const MachineFunction &MF) const;

+  virtual unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                                       MachineFunction &MF) const;
+
  /// \param RC is an AMDIL reg class.
  ///
  /// \returns the SI register class that is equivalent to \p RC.
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 4f14931..244d4c0 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -94,6 +94,12 @@ def VGPR_64 : RegisterTuples<[sub0, sub1],
                             [(add (trunc VGPR_32, 255)),
                              (add (shl VGPR_32, 1))]>;

+// VGPR 96-bit registers
+def VGPR_96 : RegisterTuples<[sub0, sub1, sub2],
+                             [(add (trunc VGPR_32, 254)),
+                              (add (shl VGPR_32, 1)),
+                              (add (shl VGPR_32, 2))]>;
+
// VGPR 128-bit registers
def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
                              [(add (trunc VGPR_32, 253)),
@@ -151,7 +157,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [i64, i1], 64,
  (add SGPR_64, VCCReg, EXECReg)
>;

-def SReg_128 : RegisterClass<"AMDGPU", [v16i8], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v16i8, i128], 128, (add SGPR_128)>;

def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>;
@@ -162,6 +168,10 @@ def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;

+def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
+  let Size = 96;
+}
+
def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;

def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 256, (add VGPR_256)>;