-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 129
-rw-r--r--  test/CodeGen/PowerPC/unal-altivec.ll   |  45
2 files changed, 174 insertions, 0 deletions
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 95a05ef..b0a684e 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -536,6 +536,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
 
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::BR_CC);
   setTargetDAGCombine(ISD::BSWAP);
@@ -5070,6 +5071,16 @@ static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
   return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res);
 }
 
+/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op,
+                                SelectionDAG &DAG, DebugLoc dl,
+                                EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = Op.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, MVT::i32), Op);
+}
+
 /// BuildIntrinsicOp - Return a binary operator intrinsic node with the
 /// specified intrinsic ID.
 static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
@@ -6946,6 +6957,124 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                           cast<StoreSDNode>(N)->getMemOperand());
     }
     break;
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    EVT VT = LD->getValueType(0);
+    Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+    unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+    if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
+        TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
+        DCI.getDAGCombineLevel() == AfterLegalizeTypes &&
+        LD->getAlignment() < ABIAlignment) {
+      // This is a type-legal unaligned Altivec load.
+      SDValue Chain = LD->getChain();
+      SDValue Ptr = LD->getBasePtr();
+
+      // This implements the loading of unaligned vectors as described in
+      // the venerable Apple Velocity Engine overview. Specifically:
+      // https://developer.apple.com/hardwaredrivers/ve/alignment.html
+      // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
+      //
+      // The general idea is to expand a sequence of one or more unaligned
+      // loads into an alignment-based permutation-control instruction (lvsl),
+      // a series of regular vector loads (which always truncate their
+      // input address to an aligned address), and a series of permutations.
+      // The results of these permutations are the requested loaded values.
+      // The trick is that the last "extra" load is not taken from the address
+      // you might suspect (sizeof(vector) bytes after the last requested
+      // load), but rather sizeof(vector) - 1 bytes after the last
+      // requested vector. The point of this is to avoid a page fault if the
+      // base address happened to be aligned. This works because if the base
+      // address is aligned, then adding less than a full vector length will
+      // cause the last vector in the sequence to be (re)loaded. Otherwise,
+      // the next vector will be fetched as you might suspect was necessary.
+
+      // FIXME: We might be able to reuse the permutation generation from
+      // a different base address offset from this one by an aligned amount.
+      SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr,
+                                          DAG, dl, MVT::v16i8);
+
+      // Refine the alignment of the original load (a "new" load created here,
+      // identical to the first except for the alignment, would be merged
+      // with the existing node regardless).
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineMemOperand *MMO =
+        MF.getMachineMemOperand(LD->getPointerInfo(),
+                                LD->getMemOperand()->getFlags(),
+                                LD->getMemoryVT().getStoreSize(),
+                                ABIAlignment);
+      LD->refineAlignment(MMO);
+      SDValue BaseLoad = SDValue(LD, 0);
+
+      // Note that the value of IncOffset (which is provided to the next
+      // load's pointer info offset value, and thus used to calculate the
+      // alignment), and the value of IncValue (which is actually used to
+      // increment the pointer value) are different! This is because we
+      // require the next load to appear to be aligned, even though it
+      // is actually offset from the base pointer by a lesser amount.
+      int IncOffset = VT.getSizeInBits() / 8;
+      int IncValue = IncOffset - 1;
+      SDValue Increment = DAG.getConstant(IncValue, getPointerTy());
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+
+      // FIXME: We might have another load (with a slightly-different
+      // real offset) that we can reuse here.
+      SDValue ExtraLoad =
+        DAG.getLoad(VT, dl, Chain, Ptr,
+                    LD->getPointerInfo().getWithOffset(IncOffset),
+                    LD->isVolatile(), LD->isNonTemporal(),
+                    LD->isInvariant(), ABIAlignment);
+
+      SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                               BaseLoad.getValue(1), ExtraLoad.getValue(1));
+
+      if (BaseLoad.getValueType() != MVT::v4i32)
+        BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad);
+
+      if (ExtraLoad.getValueType() != MVT::v4i32)
+        ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
+
+      SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+                                      BaseLoad, ExtraLoad, PermCntl, DAG, dl);
+
+      if (VT != MVT::v4i32)
+        Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
+
+      // Now we need to be really careful about how we update the users of the
+      // original load. We cannot just call DCI.CombineTo (or
+      // DAG.ReplaceAllUsesWith for that matter), because the load still has
+      // uses created here (the permutation for example) that need to stay.
+      SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+      while (UI != UE) {
+        SDUse &Use = UI.getUse();
+        SDNode *User = *UI;
+        // Note: BaseLoad is checked here because it might not be N, but a
+        // bitcast of N.
+        if (User == Perm.getNode() || User == BaseLoad.getNode() ||
+            User == TF.getNode() || Use.getResNo() > 1) {
+          ++UI;
+          continue;
+        }
+
+        SDValue To = Use.getResNo() ? TF : Perm;
+        ++UI;
+
+        SmallVector<SDValue, 8> Ops;
+        for (SDNode::op_iterator O = User->op_begin(),
+             OE = User->op_end(); O != OE; ++O) {
+          if (*O == Use)
+            Ops.push_back(To);
+          else
+            Ops.push_back(*O);
+        }
+
+        DAG.UpdateNodeOperands(User, Ops.data(), Ops.size());
+      }
+
+      return SDValue(N, 0);
+    }
+    }
+    break;
   case ISD::BSWAP:
     // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
     if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
diff --git a/test/CodeGen/PowerPC/unal-altivec.ll b/test/CodeGen/PowerPC/unal-altivec.ll
new file mode 100644
index 0000000..f89f299
--- /dev/null
+++ b/test/CodeGen/PowerPC/unal-altivec.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mcpu=g5 | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @foo(float* noalias nocapture %a, float* noalias nocapture %b) #0 {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds float* %b, i64 %index
+  %1 = bitcast float* %0 to <4 x float>*
+  %wide.load = load <4 x float>* %1, align 4
+  %.sum11 = or i64 %index, 4
+  %2 = getelementptr float* %b, i64 %.sum11
+  %3 = bitcast float* %2 to <4 x float>*
+  %wide.load8 = load <4 x float>* %3, align 4
+  %4 = fadd <4 x float> %wide.load, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  %5 = fadd <4 x float> %wide.load8, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  %6 = getelementptr inbounds float* %a, i64 %index
+  %7 = bitcast float* %6 to <4 x float>*
+  store <4 x float> %4, <4 x float>* %7, align 4
+  %.sum12 = or i64 %index, 4
+  %8 = getelementptr float* %a, i64 %.sum12
+  %9 = bitcast float* %8 to <4 x float>*
+  store <4 x float> %5, <4 x float>* %9, align 4
+  %index.next = add i64 %index, 8
+  %10 = icmp eq i64 %index.next, 16000
+  br i1 %10, label %for.end, label %vector.body
+
+; CHECK: @foo
+; CHECK: lvx [[CNST:[0-9]+]],
+; CHECK-DAG: lvsl [[PC:[0-9]+]], [[B1:[0-9]+]], [[B2:[0-9]+]]
+; CHECK-DAG: lvx [[LD1:[0-9]+]], [[B1]], [[B2]]
+; CHECK-DAG: add [[B3:[0-9]+]], [[B1]], [[B2]]
+; CHECK-DAG: lvx [[LD2:[0-9]+]], [[B3]],
+; CHECK-DAG: vperm [[R1:[0-9]+]], [[LD1]], [[LD2]], [[PC]]
+; CHECK: vaddfp {{[0-9]+}}, [[R1]], [[CNST]]
+; CHECK: blr
+
+for.end:                                          ; preds = %vector.body
+  ret void
+}
+
+attributes #0 = { nounwind }
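
Note: for context on the idiom the new ISD::LOAD combine expands, here is a
minimal C sketch of the same pattern written with the classic Altivec
intrinsics (vec_lvsl, vec_ld, vec_perm). The function name and float element
type are illustrative assumptions, not part of the patch:

#include <altivec.h>

/* Sketch: load one 16-byte vector from a potentially unaligned pointer.
   vec_ld truncates its effective address to a 16-byte boundary, so two
   aligned loads bracket the requested bytes and vec_perm selects them. */
vector float load_unaligned(const float *p) {
  vector unsigned char pc = vec_lvsl(0, p); /* permute control from p & 15 */
  vector float lo = vec_ld(0, p);           /* aligned load at or below p */
  /* Offset 15, not 16 (the IncValue = IncOffset - 1 trick in the combine):
     if p happens to be aligned, this reloads the same vector instead of
     touching the next, possibly unmapped, page; if p is unaligned, it
     fetches the next aligned vector as usual. */
  vector float hi = vec_ld(15, p);
  return vec_perm(lo, hi, pc);
}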
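The FIXMEs note that reuse across consecutive loads is not implemented yet;
the Apple documents cited in the comment describe that extension. A hedged
sketch under the same assumptions, showing why n consecutive unaligned loads
need only one lvsl, n+1 aligned loads, and n permutes:

/* Sketch of the reuse the FIXMEs point at: two consecutive unaligned
   loads share the permute control and the middle aligned load, so two
   unaligned loads cost three aligned loads and two vperms, not four. */
void load_two(const float *p, vector float *v0, vector float *v1) {
  vector unsigned char pc = vec_lvsl(0, p);
  vector float a = vec_ld(0, p);
  vector float b = vec_ld(16, p);
  vector float c = vec_ld(31, p); /* 2*16 - 1: page-fault-safe last load */
  *v0 = vec_perm(a, b, pc);
  *v1 = vec_perm(b, c, pc);
}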