Diffstat (limited to 'lib/Target/R600/R600ISelLowering.cpp')
-rw-r--r-- | lib/Target/R600/R600ISelLowering.cpp | 691
1 file changed, 486 insertions(+), 205 deletions(-)
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index f0eece3..b5c2a93 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -16,6 +16,7 @@ #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -71,10 +72,27 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); + // Legalize loads and stores to the private address space. + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom); + setOperationAction(ISD::STORE, MVT::i8, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::SELECT_CC); setSchedulingPreference(Sched::VLIW); } @@ -115,15 +133,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } - case AMDGPU::R600_LOAD_CONST: { - int64_t RegIndex = MI->getOperand(1).getImm(); - unsigned ConstantReg = AMDGPU::R600_CReg32RegClass.getRegister(RegIndex); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::COPY)) - .addOperand(MI->getOperand(0)) - .addReg(ConstantReg); - break; - } - case AMDGPU::MASK_WRITE: { unsigned maskedRegister = MI->getOperand(0).getReg(); assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); @@ -154,18 +163,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } - case AMDGPU::RESERVE_REG: { - R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>(); - int64_t ReservedIndex = MI->getOperand(0).getImm(); - unsigned ReservedReg = - AMDGPU::R600_TReg32RegClass.getRegister(ReservedIndex); - MFI->ReservedRegs.push_back(ReservedReg); - unsigned SuperReg = - AMDGPU::R600_Reg128RegClass.getRegister(ReservedIndex / 4); - MFI->ReservedRegs.push_back(SuperReg); - break; - } - case AMDGPU::TXD: { unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); @@ -250,33 +247,26 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } - case AMDGPU::input_perspective: { - R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); - - // XXX Be more fine about register reservation - for (unsigned i = 0; i < 4; i ++) { - unsigned ReservedReg = AMDGPU::R600_TReg32RegClass.getRegister(i); - MFI->ReservedRegs.push_back(ReservedReg); - } - - switch (MI->getOperand(1).getImm()) { - case 0:// Perspective - MFI->HasPerspectiveInterpolation = true; - break; - case 1:// Linear - MFI->HasLinearInterpolation = true; - break; - default: - assert(0 && "Unknow ij index"); - } - - return 
BB; - } - case AMDGPU::EG_ExportSwz: case AMDGPU::R600_ExportSwz: { + // Instruction is left unmodified if its not the last one of its type + bool isLastInstructionOfItsType = true; + unsigned InstExportType = MI->getOperand(1).getImm(); + for (MachineBasicBlock::iterator NextExportInst = llvm::next(I), + EndBlock = BB->end(); NextExportInst != EndBlock; + NextExportInst = llvm::next(NextExportInst)) { + if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || + NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { + unsigned CurrentInstExportType = NextExportInst->getOperand(1) + .getImm(); + if (CurrentInstExportType == InstExportType) { + isLastInstructionOfItsType = false; + break; + } + } + } bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0; - if (!EOP) + if (!EOP && !isLastInstructionOfItsType) return BB; unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) @@ -288,9 +278,18 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( .addOperand(MI->getOperand(5)) .addOperand(MI->getOperand(6)) .addImm(CfInst) - .addImm(1); + .addImm(EOP); break; } + case AMDGPU::RETURN: { + // RETURN instructions must have the live-out registers as implicit uses, + // otherwise they appear dead. + R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + MachineInstrBuilder MIB(*MF, MI); + for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) + MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); + return BB; + } } MI->eraseFromParent(); @@ -304,57 +303,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( using namespace llvm::Intrinsic; using namespace llvm::AMDGPUIntrinsic; -static SDValue -InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap, - unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type, - SDValue Scalar, SDValue Chain) { - if (!ExportMap[Slot]) { - SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, - DL, MVT::v4f32, - DAG.getUNDEF(MVT::v4f32), - Scalar, - DAG.getConstant(Channel, MVT::i32)); - - unsigned Mask = 1 << Channel; - - const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32), - DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32), - DAG.getConstant(Mask, MVT::i32)}; - - SDValue Res = DAG.getNode( - AMDGPUISD::EXPORT, - DL, - MVT::Other, - Ops, 6); - ExportMap[Slot] = Res.getNode(); - return Res; - } - - SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ; - SDValue PreviousVector = ExportInstruction->getOperand(1); - SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, - DL, MVT::v4f32, - PreviousVector, - Scalar, - DAG.getConstant(Channel, MVT::i32)); - - unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5)) - ->getZExtValue(); - Mask |= (1 << Channel); - - const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector, - DAG.getConstant(Inst, MVT::i32), - DAG.getConstant(Type, MVT::i32), - DAG.getConstant(Slot, MVT::i32), - DAG.getConstant(Mask, MVT::i32)}; - - DAG.UpdateNodeOperands(ExportInstruction, - Ops, 6); - - return Chain; - -} - SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -364,7 +312,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::STORE: return LowerSTORE(Op, 
DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::FPOW: return LowerFPOW(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = @@ -372,58 +322,27 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const switch (IntrinsicID) { case AMDGPUIntrinsic::AMDGPU_store_output: { MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - if (!MRI.isLiveOut(Reg)) { - MRI.addLiveOut(Reg); - } + MFI->LiveOuts.push_back(Reg); return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2)); } - case AMDGPUIntrinsic::R600_store_pixel_color: { - MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); - - SDNode **OutputsMap = MFI->Outputs; - return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap, - RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2), - Chain); - + case AMDGPUIntrinsic::R600_store_swizzle: { + const SDValue Args[8] = { + Chain, + Op.getOperand(2), // Export Value + Op.getOperand(3), // ArrayBase + Op.getOperand(4), // Type + DAG.getConstant(0, MVT::i32), // SWZ_X + DAG.getConstant(1, MVT::i32), // SWZ_Y + DAG.getConstant(2, MVT::i32), // SWZ_Z + DAG.getConstant(3, MVT::i32) // SWZ_W + }; + return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(), + Args, 8); } - case AMDGPUIntrinsic::R600_store_stream_output : { - MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); - int64_t BufIndex = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); - - SDNode **OutputsMap = MFI->StreamOutputs[BufIndex]; - unsigned Inst; - switch (cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue() ) { - // STREAM3 - case 3: - Inst = 4; - break; - // STREAM2 - case 2: - Inst = 3; - break; - // STREAM1 - case 1: - Inst = 2; - break; - // STREAM0 - case 0: - Inst = 1; - break; - default: - llvm_unreachable("Wrong buffer id for stream outputs !"); - } - return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap, - RegIndex / 4, RegIndex % 4, Inst, 0, Op.getOperand(2), - Chain); - } // default for switch(IntrinsicID) default: break; } @@ -442,38 +361,35 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT); } - case AMDGPUIntrinsic::R600_load_input_perspective: { - int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (slot < 0) - return DAG.getUNDEF(MVT::f32); - SDValue FullVector = DAG.getNode( - AMDGPUISD::INTERP, - DL, MVT::v4f32, - DAG.getConstant(0, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32)); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, - DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32)); - } - case AMDGPUIntrinsic::R600_load_input_linear: { - int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (slot < 0) - return DAG.getUNDEF(MVT::f32); - SDValue FullVector = DAG.getNode( - 
AMDGPUISD::INTERP, - DL, MVT::v4f32, - DAG.getConstant(1, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32)); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, - DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32)); - } - case AMDGPUIntrinsic::R600_load_input_constant: { + + case AMDGPUIntrinsic::R600_interp_input: { int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (slot < 0) - return DAG.getUNDEF(MVT::f32); - SDValue FullVector = DAG.getNode( - AMDGPUISD::INTERP_P0, - DL, MVT::v4f32, - DAG.getConstant(slot / 4 , MVT::i32)); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, - DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32)); + int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); + MachineSDNode *interp; + if (ijb < 0) { + interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, + MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32)); + return DAG.getTargetExtractSubreg( + TII->getRegisterInfo().getSubRegFromChannel(slot % 4), + DL, MVT::f32, SDValue(interp, 0)); + } + + if (slot % 4 < 2) + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32)); + else + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32)); + + return SDValue(interp, slot % 2); } case r600_read_ngroups_x: @@ -527,6 +443,20 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: return; case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + return; + case ISD::LOAD: { + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } + case ISD::STORE: + SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode(); + Results.push_back(SDValue(Node, 0)); + return; } } @@ -594,6 +524,20 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, false, false, false, 0); } +SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering()); + + FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op); + assert(FIN); + + unsigned FrameIndex = FIN->getIndex(); + unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32); +} + SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -680,9 +624,12 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const } // Try to lower to a SET* instruction: - // We need all the operands of SELECT_CC to have the same value type, so if - // necessary we need to change True 
and False to be the same type as LHS and - // RHS, and then convert the result of the select_cc back to the correct type. + // + // CompareVT == MVT::f32 and VT == MVT::i32 is supported by the hardware, + // but for the other case where CompareVT != VT, all operands of + // SELECT_CC need to have the same value type, so we need to change True and + // False to be the same type as LHS and RHS, and then convert the result of + // the select_cc back to the correct type. // Move hardware True/False values to the correct operand. if (isHWTrueValue(False) && isHWFalseValue(True)) { @@ -692,32 +639,17 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const } if (isHWTrueValue(True) && isHWFalseValue(False)) { - if (CompareVT != VT) { - if (VT == MVT::f32 && CompareVT == MVT::i32) { - SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, - LHS, RHS, - DAG.getConstant(-1, MVT::i32), - DAG.getConstant(0, MVT::i32), - CC); - // Convert integer values of true (-1) and false (0) to fp values of - // true (1.0f) and false (0.0f). - SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean, - DAG.getConstant(1, MVT::i32)); - return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB); - } else if (VT == MVT::i32 && CompareVT == MVT::f32) { - SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, - LHS, RHS, - DAG.getConstantFP(1.0f, MVT::f32), - DAG.getConstantFP(0.0f, MVT::f32), - CC); - // Convert fp values of true (1.0f) and false (0.0f) to integer values - // of true (-1) and false (0). - SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt); - return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg); - } else { - // I don't think there will be any other type pairings. - assert(!"Unhandled operand type parings in SELECT_CC"); - } + if (CompareVT != VT && VT == MVT::f32 && CompareVT == MVT::i32) { + SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, + LHS, RHS, + DAG.getConstant(-1, MVT::i32), + DAG.getConstant(0, MVT::i32), + CC); + // Convert integer values of true (-1) and false (0) to fp values of + // true (1.0f) and false (0.0f). + SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean, + DAG.getConstant(1, MVT::i32)); + return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB); } else { // This SELECT_CC is already legal. return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); @@ -808,6 +740,61 @@ SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return Cond; } +/// LLVM generates byte-addresed pointers. For indirect addressing, we need to +/// convert these pointers to a register index. Each register holds +/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the +/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used +/// for indirect addressing. 
+SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, + unsigned StackWidth, + SelectionDAG &DAG) const { + unsigned SRLPad; + switch(StackWidth) { + case 1: + SRLPad = 2; + break; + case 2: + SRLPad = 3; + break; + case 4: + SRLPad = 4; + break; + default: llvm_unreachable("Invalid stack width"); + } + + return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr, + DAG.getConstant(SRLPad, MVT::i32)); +} + +void R600TargetLowering::getStackAddress(unsigned StackWidth, + unsigned ElemIdx, + unsigned &Channel, + unsigned &PtrIncr) const { + switch (StackWidth) { + default: + case 1: + Channel = 0; + if (ElemIdx > 0) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 2: + Channel = ElemIdx % 2; + if (ElemIdx == 2) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 4: + Channel = ElemIdx; + PtrIncr = 0; + break; + } +} + SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); StoreSDNode *StoreNode = cast<StoreSDNode>(Op); @@ -829,9 +816,188 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } return Chain; } - return SDValue(); + + EVT ValueVT = Value.getValueType(); + + if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( + getTargetMachine().getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (ValueVT.isVector()) { + unsigned NumElemVT = ValueVT.getVectorNumElements(); + EVT ElemVT = ValueVT.getVectorElementType(); + SDValue Stores[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, MVT::i32)); + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, + Value, DAG.getConstant(i, MVT::i32)); + + Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Elem, Ptr, + DAG.getTargetConstant(Channel, MVT::i32)); + } + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT); + } else { + if (ValueVT == MVT::i8) { + Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); + } + Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, + DAG.getTargetConstant(0, MVT::i32)); // Channel + } + + return Chain; +} + +// return (512 + (kc_bank << 12) +static int +ConstantAddressBlock(unsigned AddressSpace) { + switch (AddressSpace) { + case AMDGPUAS::CONSTANT_BUFFER_0: + return 512; + case AMDGPUAS::CONSTANT_BUFFER_1: + return 512 + 4096; + case AMDGPUAS::CONSTANT_BUFFER_2: + return 512 + 4096 * 2; + case AMDGPUAS::CONSTANT_BUFFER_3: + return 512 + 4096 * 3; + case AMDGPUAS::CONSTANT_BUFFER_4: + return 512 + 4096 * 4; + case AMDGPUAS::CONSTANT_BUFFER_5: + return 512 + 4096 * 5; + case AMDGPUAS::CONSTANT_BUFFER_6: + return 512 + 4096 * 6; + case AMDGPUAS::CONSTANT_BUFFER_7: + return 512 + 4096 * 7; + case AMDGPUAS::CONSTANT_BUFFER_8: + return 512 + 4096 * 8; + case AMDGPUAS::CONSTANT_BUFFER_9: + return 512 + 4096 * 9; + case AMDGPUAS::CONSTANT_BUFFER_10: + return 512 + 4096 * 10; + case AMDGPUAS::CONSTANT_BUFFER_11: + return 512 + 4096 * 11; + case AMDGPUAS::CONSTANT_BUFFER_12: 
+ return 512 + 4096 * 12; + case AMDGPUAS::CONSTANT_BUFFER_13: + return 512 + 4096 * 13; + case AMDGPUAS::CONSTANT_BUFFER_14: + return 512 + 4096 * 14; + case AMDGPUAS::CONSTANT_BUFFER_15: + return 512 + 4096 * 15; + default: + return -1; + } } +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const +{ + EVT VT = Op.getValueType(); + DebugLoc DL = Op.getDebugLoc(); + LoadSDNode *LoadNode = cast<LoadSDNode>(Op); + SDValue Chain = Op.getOperand(0); + SDValue Ptr = Op.getOperand(1); + SDValue LoweredLoad; + + int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); + if (ConstantBlock > -1) { + SDValue Result; + if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) || + dyn_cast<Constant>(LoadNode->getSrcValue())) { + SDValue Slots[4]; + for (unsigned i = 0; i < 4; i++) { + // We want Const position encoded with the following formula : + // (((512 + (kc_bank << 12) + const_index) << 2) + chan) + // const_index is Ptr computed by llvm using an alignment of 16. + // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and + // then div by 4 at the ISel step + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32)); + Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); + } + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4); + } else { + // non constant ptr cant be folded, keeps it as a v4f32 load + Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, + DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)) + ); + } + + if (!VT.isVector()) { + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, + DAG.getConstant(0, MVT::i32)); + } + + SDValue MergedValues[2] = { + Result, + Chain + }; + return DAG.getMergeValues(MergedValues, 2, DL); + } + + if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( + getTargetMachine().getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (VT.isVector()) { + unsigned NumElemVT = VT.getVectorNumElements(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Loads[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, MVT::i32)); + Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, + Chain, Ptr, + DAG.getTargetConstant(Channel, MVT::i32), + Op.getOperand(2)); + } + for (unsigned i = NumElemVT; i < 4; ++i) { + Loads[i] = DAG.getUNDEF(ElemVT); + } + EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); + LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4); + } else { + LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, + Chain, Ptr, + DAG.getTargetConstant(0, MVT::i32), // Channel + Op.getOperand(2)); + } + + SDValue Ops[2]; + Ops[0] = LoweredLoad; + Ops[1] = Chain; + + return DAG.getMergeValues(Ops, 2, DL); +} SDValue R600TargetLowering::LowerFPOW(SDValue Op, SelectionDAG &DAG) const { @@ -873,7 +1039,7 @@ SDValue R600TargetLowering::LowerFormalArguments( AMDGPUAS::PARAM_I_ADDRESS); SDValue Arg = 
DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(), DAG.getConstant(ParamOffsetBytes, MVT::i32), - MachinePointerInfo(new Argument(PtrTy)), + MachinePointerInfo(UndefValue::get(PtrTy)), ArgVT, false, false, ArgBytes); InVals.push_back(Arg); ParamOffsetBytes += ArgBytes; @@ -904,6 +1070,121 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } break; } + + // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> + // (i32 select_cc f32, f32, -1, 0 cc) + // + // Mesa's GLSL frontend generates the above pattern a lot and we can lower + // this to one of the SET*_DX10 instructions. + case ISD::FP_TO_SINT: { + SDValue FNeg = N->getOperand(0); + if (FNeg.getOpcode() != ISD::FNEG) { + return SDValue(); + } + SDValue SelectCC = FNeg.getOperand(0); + if (SelectCC.getOpcode() != ISD::SELECT_CC || + SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS + SelectCC.getOperand(2).getValueType() != MVT::f32 || // True + !isHWTrueValue(SelectCC.getOperand(2)) || + !isHWFalseValue(SelectCC.getOperand(3))) { + return SDValue(); + } + + return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0), + SelectCC.getOperand(0), // LHS + SelectCC.getOperand(1), // RHS + DAG.getConstant(-1, MVT::i32), // True + DAG.getConstant(0, MVT::i32), // Flase + SelectCC.getOperand(4)); // CC + + break; + } + // Extract_vec (Build_vector) generated by custom lowering + // also needs to be customly combined + case ISD::EXTRACT_VECTOR_ELT: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return Arg->getOperand(Element); + } + } + if (Arg.getOpcode() == ISD::BITCAST && + Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(), + Arg->getOperand(0).getOperand(Element)); + } + } + } + + case ISD::SELECT_CC: { + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> + // selectcc x, y, a, b, inv(cc) + SDValue LHS = N->getOperand(0); + if (LHS.getOpcode() != ISD::SELECT_CC) { + return SDValue(); + } + + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + + if (LHS.getOperand(2).getNode() != True.getNode() || + LHS.getOperand(3).getNode() != False.getNode() || + RHS.getNode() != False.getNode() || + cast<CondCodeSDNode>(N->getOperand(4))->get() != ISD::SETEQ) { + return SDValue(); + } + + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(LHS->getOperand(4))->get(); + CCOpcode = ISD::getSetCCInverse( + CCOpcode, LHS.getOperand(0).getValueType().isInteger()); + return DAG.getSelectCC(N->getDebugLoc(), + LHS.getOperand(0), + LHS.getOperand(1), + LHS.getOperand(2), + LHS.getOperand(3), + CCOpcode); + } + case AMDGPUISD::EXPORT: { + SDValue Arg = N->getOperand(1); + if (Arg.getOpcode() != ISD::BUILD_VECTOR) + break; + SDValue NewBldVec[4] = { + DAG.getUNDEF(MVT::f32), + DAG.getUNDEF(MVT::f32), + DAG.getUNDEF(MVT::f32), + DAG.getUNDEF(MVT::f32) + }; + SDValue NewArgs[8] = { + N->getOperand(0), // Chain + SDValue(), + N->getOperand(2), // ArrayBase + N->getOperand(3), // Type + N->getOperand(4), // SWZ_X + N->getOperand(5), // SWZ_Y + N->getOperand(6), // SWZ_Z + N->getOperand(7) // SWZ_W + }; + for (unsigned i = 0; i < Arg.getNumOperands(); i++) { + if (ConstantFPSDNode *C = 
+          dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
+        if (C->isZero()) {
+          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
+        } else if (C->isExactlyValue(1.0)) {
+          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_1
+        } else {
+          NewBldVec[i] = Arg.getOperand(i);
+        }
+      } else {
+        NewBldVec[i] = Arg.getOperand(i);
+      }
+    }
+    DebugLoc DL = N->getDebugLoc();
+    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
+    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
+  }
   }
   return SDValue();
 }