diff options
Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 340 |
1 files changed, 226 insertions, 114 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 1c3b9ae..e6858bc 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -85,6 +85,11 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) * ElemsPerChunk); + // If the input is a buildvector just emit a smaller one. + if (Vec.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, + Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk); + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -181,9 +186,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); - // Bypass i32 with i8 on Atom when compiling with O2 - if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) + // Bypass expensive divides on Atom when compiling with O2 + if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) { addBypassSlowDiv(32, 8); + if (Subtarget->is64Bit()) + addBypassSlowDiv(64, 16); + } if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { // Setup Windows compiler runtime calls. @@ -368,7 +376,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); - setOperationAction(ISD::BR_CC , MVT::Other, Expand); + setOperationAction(ISD::BR_CC , MVT::f32, Expand); + setOperationAction(ISD::BR_CC , MVT::f64, Expand); + setOperationAction(ISD::BR_CC , MVT::f80, Expand); + setOperationAction(ISD::BR_CC , MVT::i8, Expand); + setOperationAction(ISD::BR_CC , MVT::i16, Expand); + setOperationAction(ISD::BR_CC , MVT::i32, Expand); + setOperationAction(ISD::BR_CC , MVT::i64, Expand); setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); @@ -4956,7 +4970,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, DAG.getConstant(NumBits, - TLI.getShiftAmountTy(SrcOp.getValueType())))); + TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); } SDValue @@ -7820,7 +7834,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { Chain.getValue(1)); } - if (Subtarget->isTargetWindows()) { + if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) { // Just use the implicit TLS architecture // Need to generate someting similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage @@ -7840,18 +7854,19 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or - // %gs:0x58 (64-bit). + // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly + // use its literal value of 0x2C. Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() ? Type::getInt8PtrTy(*DAG.getContext(), 256) : Type::getInt32PtrTy(*DAG.getContext(), 257)); - SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, - Subtarget->is64Bit() - ? DAG.getIntPtrConstant(0x58) - : DAG.getExternalSymbol("_tls_array", - getPointerTy()), + SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) : + (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) : + DAG.getExternalSymbol("_tls_array", getPointerTy())); + + SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray, MachinePointerInfo(Ptr), false, false, false, 0); @@ -12248,7 +12263,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::UINT_TO_FP: { - if (N->getOperand(0).getValueType() != MVT::v2i32 && + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + if (N->getOperand(0).getValueType() != MVT::v2i32 || N->getValueType(0) != MVT::v2f32) return; SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, @@ -12890,13 +12906,16 @@ static unsigned getPseudoCMOVOpc(EVT VT) { // to // // ... -// EAX = LOAD MI.addr +// t1 = LOAD MI.addr // loop: -// t1 = OP MI.val, EAX -// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] +// t4 = phi(t1, t3 / loop) +// t2 = OP MI.val, t4 +// EAX = t4 +// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined] +// t3 = EAX // JNE loop // sink: -// dst = EAX +// dst = t3 // ... MachineBasicBlock * X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, @@ -12933,7 +12952,11 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, const TargetRegisterClass *RC = MRI.getRegClass(DstReg); MVT::SimpleValueType VT = *RC->vt_begin(); - unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT); + unsigned t1 = MRI.createVirtualRegister(RC); + unsigned t2 = MRI.createVirtualRegister(RC); + unsigned t3 = MRI.createVirtualRegister(RC); + unsigned t4 = MRI.createVirtualRegister(RC); + unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT); unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT); unsigned LOADOpc = getLoadOpcode(VT); @@ -12941,12 +12964,16 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, // For the atomic load-arith operator, we generate // // thisMBB: - // EAX = LOAD [MI.addr] + // t1 = LOAD [MI.addr] // mainMBB: + // t4 = phi(t1 / thisMBB, t3 / mainMBB) // t1 = OP MI.val, EAX + // EAX = t4 // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] + // t3 = EAX // JNE mainMBB // sinkMBB: + // dst = t3 MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); @@ -12962,23 +12989,34 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI->getOperand(MemOpndSlot + i)); - MIB.setMemRefs(MMOBegin, MMOEnd); + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); + if (NewMO.isReg()) + NewMO.setIsKill(false); + MIB.addOperand(NewMO); + } + for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { + unsigned flags = (*MMOI)->getFlags(); + flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; + MachineMemOperand *MMO = + MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, + (*MMOI)->getSize(), + (*MMOI)->getBaseAlignment(), + (*MMOI)->getTBAAInfo(), + (*MMOI)->getRanges()); + MIB.addMemOperand(MMO); + } thisMBB->addSuccessor(mainMBB); // mainMBB: MachineBasicBlock *origMainMBB = mainMBB; - mainMBB->addLiveIn(AccPhyReg); - // Copy AccPhyReg as it is used more than once. - unsigned AccReg = MRI.createVirtualRegister(RC); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg) - .addReg(AccPhyReg); + // Add a PHI. + MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4) + .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); - unsigned t1 = MRI.createVirtualRegister(RC); unsigned Opc = MI->getOpcode(); switch (Opc) { default: @@ -12996,20 +13034,20 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, case X86::ATOMXOR32: case X86::ATOMXOR64: { unsigned ARITHOpc = getNonAtomicOpcode(Opc); - BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg) - .addReg(AccReg); + BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg) + .addReg(t4); break; } case X86::ATOMNAND8: case X86::ATOMNAND16: case X86::ATOMNAND32: case X86::ATOMNAND64: { - unsigned t2 = MRI.createVirtualRegister(RC); + unsigned Tmp = MRI.createVirtualRegister(RC); unsigned NOTOpc; unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); - BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg) - .addReg(AccReg); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2); + BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg) + .addReg(t4); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp); break; } case X86::ATOMMAX8: @@ -13033,20 +13071,22 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, BuildMI(mainMBB, DL, TII->get(CMPOpc)) .addReg(SrcReg) - .addReg(AccReg); + .addReg(t4); if (Subtarget->hasCMov()) { if (VT != MVT::i8) { // Native support - BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1) + BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) .addReg(SrcReg) - .addReg(AccReg); + .addReg(t4); } else { // Promote i8 to i32 to use CMOV32 - const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32); + const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterClass *RC32 = + TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit); unsigned SrcReg32 = MRI.createVirtualRegister(RC32); unsigned AccReg32 = MRI.createVirtualRegister(RC32); - unsigned t2 = MRI.createVirtualRegister(RC32); + unsigned Tmp = MRI.createVirtualRegister(RC32); unsigned Undef = MRI.createVirtualRegister(RC32); BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); @@ -13057,15 +13097,15 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, .addImm(X86::sub_8bit); BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) .addReg(Undef) - .addReg(AccReg) + .addReg(t4) .addImm(X86::sub_8bit); - BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) + BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp) .addReg(SrcReg32) .addReg(AccReg32); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1) - .addReg(t2, 0, X86::sub_8bit); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2) + .addReg(Tmp, 0, X86::sub_8bit); } } else { // Use pseudo select and lower them. @@ -13074,36 +13114,47 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, unsigned SelOpc = getPseudoCMOVOpc(VT); X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); - MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1) - .addReg(SrcReg).addReg(AccReg) + MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2) + .addReg(SrcReg).addReg(t4) .addImm(CC); mainMBB = EmitLoweredSelect(MIB, mainMBB); + // Replace the original PHI node as mainMBB is changed after CMOV + // lowering. + BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4) + .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); + Phi->eraseFromParent(); } break; } } - // Copy AccPhyReg back from virtual register. - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg) - .addReg(AccReg); + // Copy PhyReg back from virtual register. + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg) + .addReg(t4); MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI->getOperand(MemOpndSlot + i)); - MIB.addReg(t1); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); + if (NewMO.isReg()) + NewMO.setIsKill(false); + MIB.addOperand(NewMO); + } + MIB.addReg(t2); MIB.setMemRefs(MMOBegin, MMOEnd); + // Copy PhyReg back to virtual register. + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3) + .addReg(PhyReg); + BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); mainMBB->addSuccessor(origMainMBB); mainMBB->addSuccessor(sinkMBB); // sinkMBB: - sinkMBB->addLiveIn(AccPhyReg); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), DstReg) - .addReg(AccPhyReg); + .addReg(t3); MI->eraseFromParent(); return sinkMBB; @@ -13120,15 +13171,24 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, // to // // ... -// EAX = LOAD [MI.addr + 0] -// EDX = LOAD [MI.addr + 4] +// t1L = LOAD [MI.addr + 0] +// t1H = LOAD [MI.addr + 4] // loop: -// EBX = OP MI.val.lo, EAX -// ECX = OP MI.val.hi, EDX +// t4L = phi(t1L, t3L / loop) +// t4H = phi(t1H, t3H / loop) +// t2L = OP MI.val.lo, t4L +// t2H = OP MI.val.hi, t4H +// EAX = t4L +// EDX = t4H +// EBX = t2L +// ECX = t2H // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] +// t3L = EAX +// t3H = EDX // JNE loop // sink: -// dst = EDX:EAX +// dstL = t3L +// dstH = t3H // ... MachineBasicBlock * X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, @@ -13169,20 +13229,37 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, const TargetRegisterClass *RC = &X86::GR32RegClass; const TargetRegisterClass *RC8 = &X86::GR8RegClass; + unsigned t1L = MRI.createVirtualRegister(RC); + unsigned t1H = MRI.createVirtualRegister(RC); + unsigned t2L = MRI.createVirtualRegister(RC); + unsigned t2H = MRI.createVirtualRegister(RC); + unsigned t3L = MRI.createVirtualRegister(RC); + unsigned t3H = MRI.createVirtualRegister(RC); + unsigned t4L = MRI.createVirtualRegister(RC); + unsigned t4H = MRI.createVirtualRegister(RC); + unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; unsigned LOADOpc = X86::MOV32rm; // For the atomic load-arith operator, we generate // // thisMBB: - // EAX = LOAD [MI.addr + 0] - // EDX = LOAD [MI.addr + 4] + // t1L = LOAD [MI.addr + 0] + // t1H = LOAD [MI.addr + 4] // mainMBB: - // EBX = OP MI.vallo, EAX - // ECX = OP MI.valhi, EDX + // t4L = phi(t1L / thisMBB, t3L / mainMBB) + // t4H = phi(t1H / thisMBB, t3H / mainMBB) + // t2L = OP MI.val.lo, t4L + // t2H = OP MI.val.hi, t4H + // EBX = t2L + // ECX = t2H // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] - // JNE mainMBB + // t3L = EAX + // t3H = EDX + // JNE loop // sinkMBB: + // dstL = t3L + // dstH = t3H MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); @@ -13199,35 +13276,50 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, // thisMBB: // Lo - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI->getOperand(MemOpndSlot + i)); - MIB.setMemRefs(MMOBegin, MMOEnd); + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); + if (NewMO.isReg()) + NewMO.setIsKill(false); + MIB.addOperand(NewMO); + } + for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { + unsigned flags = (*MMOI)->getFlags(); + flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; + MachineMemOperand *MMO = + MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, + (*MMOI)->getSize(), + (*MMOI)->getBaseAlignment(), + (*MMOI)->getTBAAInfo(), + (*MMOI)->getRanges()); + MIB.addMemOperand(MMO); + }; + MachineInstr *LowMI = MIB; + // Hi - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX); + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - if (i == X86::AddrDisp) + if (i == X86::AddrDisp) { MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) - else - MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + } else { + MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); + if (NewMO.isReg()) + NewMO.setIsKill(false); + MIB.addOperand(NewMO); + } } - MIB.setMemRefs(MMOBegin, MMOEnd); + MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end()); thisMBB->addSuccessor(mainMBB); // mainMBB: MachineBasicBlock *origMainMBB = mainMBB; - mainMBB->addLiveIn(X86::EAX); - mainMBB->addLiveIn(X86::EDX); - - // Copy EDX:EAX as they are used more than once. - unsigned LoReg = MRI.createVirtualRegister(RC); - unsigned HiReg = MRI.createVirtualRegister(RC); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX); - unsigned t1L = MRI.createVirtualRegister(RC); - unsigned t1H = MRI.createVirtualRegister(RC); + // Add PHIs. + MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L) + .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); + MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H) + .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); unsigned Opc = MI->getOpcode(); switch (Opc) { @@ -13240,19 +13332,23 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, case X86::ATOMSUB6432: { unsigned HiOpc; unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg); + BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L) + .addReg(SrcLoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H) + .addReg(SrcHiReg); break; } case X86::ATOMNAND6432: { unsigned HiOpc, NOTOpc; unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); - unsigned t2L = MRI.createVirtualRegister(RC); - unsigned t2H = MRI.createVirtualRegister(RC); - BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H); + unsigned TmpL = MRI.createVirtualRegister(RC); + unsigned TmpH = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg) + .addReg(t4L); + BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg) + .addReg(t4H); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH); break; } case X86::ATOMMAX6432: @@ -13268,12 +13364,12 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, unsigned cc = MRI.createVirtualRegister(RC); // cl := cmp src_lo, lo BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) - .addReg(SrcLoReg).addReg(LoReg); + .addReg(SrcLoReg).addReg(t4L); BuildMI(mainMBB, DL, TII->get(LoOpc), cL); BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); // ch := cmp src_hi, hi BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) - .addReg(SrcHiReg).addReg(HiReg); + .addReg(SrcHiReg).addReg(t4H); BuildMI(mainMBB, DL, TII->get(HiOpc), cH); BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); // cc := if (src_hi == hi) ? cl : ch; @@ -13288,58 +13384,74 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, } BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); if (Subtarget->hasCMov()) { - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L) - .addReg(SrcLoReg).addReg(LoReg); - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H) - .addReg(SrcHiReg).addReg(HiReg); + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L) + .addReg(SrcLoReg).addReg(t4L); + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H) + .addReg(SrcHiReg).addReg(t4H); } else { - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L) - .addReg(SrcLoReg).addReg(LoReg) + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L) + .addReg(SrcLoReg).addReg(t4L) .addImm(X86::COND_NE); mainMBB = EmitLoweredSelect(MIB, mainMBB); - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H) - .addReg(SrcHiReg).addReg(HiReg) + // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the + // 2nd CMOV lowering. + mainMBB->addLiveIn(X86::EFLAGS); + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H) + .addReg(SrcHiReg).addReg(t4H) .addImm(X86::COND_NE); mainMBB = EmitLoweredSelect(MIB, mainMBB); + // Replace the original PHI node as mainMBB is changed after CMOV + // lowering. + BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L) + .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); + BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H) + .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); + PhiL->eraseFromParent(); + PhiH->eraseFromParent(); } break; } case X86::ATOMSWAP6432: { unsigned HiOpc; unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg); + BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg); break; } } // Copy EDX:EAX back from HiReg:LoReg - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H); // Copy ECX:EBX from t1H:t1L - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H); MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); + if (NewMO.isReg()) + NewMO.setIsKill(false); + MIB.addOperand(NewMO); + } MIB.setMemRefs(MMOBegin, MMOEnd); + // Copy EDX:EAX back to t3H:t3L + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX); + BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); mainMBB->addSuccessor(origMainMBB); mainMBB->addSuccessor(sinkMBB); // sinkMBB: - sinkMBB->addLiveIn(X86::EAX); - sinkMBB->addLiveIn(X86::EDX); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), DstLoReg) - .addReg(X86::EAX); + .addReg(t3L); BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), DstHiReg) - .addReg(X86::EDX); + .addReg(t3H); MI->eraseFromParent(); return sinkMBB; |