Diffstat (limited to 'lib/Target')
119 files changed, 5857 insertions, 1726 deletions
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 29033e5..e2f0d7d 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -683,7 +683,7 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Handle register classes that require multiple instructions. unsigned BeginIdx = 0; unsigned SubRegs = 0; - unsigned Spacing = 1; + int Spacing = 1; // Use VORRq when possible. if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) @@ -705,27 +705,38 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg)) Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4, Spacing = 2; - if (Opc) { - const TargetRegisterInfo *TRI = &getRegisterInfo(); - MachineInstrBuilder Mov; - for (unsigned i = 0; i != SubRegs; ++i) { - unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing); - unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i*Spacing); - assert(Dst && Src && "Bad sub-register"); - Mov = AddDefaultPred(BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst) - .addReg(Src)); - // VORR takes two source operands. - if (Opc == ARM::VORRq) - Mov.addReg(Src); - } - // Add implicit super-register defs and kills to the last instruction. - Mov->addRegisterDefined(DestReg, TRI); - if (KillSrc) - Mov->addRegisterKilled(SrcReg, TRI); - return; - } + assert(Opc && "Impossible reg-to-reg copy"); - llvm_unreachable("Impossible reg-to-reg copy"); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MachineInstrBuilder Mov; + + // Copy register tuples backward when the first Dest reg overlaps with SrcReg. + if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) { + BeginIdx = BeginIdx + ((SubRegs-1)*Spacing); + Spacing = -Spacing; + } +#ifndef NDEBUG + SmallSet<unsigned, 4> DstRegs; +#endif + for (unsigned i = 0; i != SubRegs; ++i) { + unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing); + unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i*Spacing); + assert(Dst && Src && "Bad sub-register"); +#ifndef NDEBUG + assert(!DstRegs.count(Src) && "destructive vector copy"); + DstRegs.insert(Dst); +#endif + Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst) + .addReg(Src); + // VORR takes two source operands. + if (Opc == ARM::VORRq) + Mov.addReg(Src); + Mov = AddDefaultPred(Mov); + } + // Add implicit super-register defs and kills to the last instruction. + Mov->addRegisterDefined(DestReg, TRI); + if (KillSrc) + Mov->addRegisterKilled(SrcReg, TRI); } static const @@ -1569,16 +1580,20 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { } /// Identify instructions that can be folded into a MOVCC instruction, and -/// return the corresponding opcode for the predicated pseudo-instruction. -static unsigned canFoldIntoMOVCC(unsigned Reg, MachineInstr *&MI, - const MachineRegisterInfo &MRI) { +/// return the defining instruction. +static MachineInstr *canFoldIntoMOVCC(unsigned Reg, + const MachineRegisterInfo &MRI, + const TargetInstrInfo *TII) { if (!TargetRegisterInfo::isVirtualRegister(Reg)) return 0; if (!MRI.hasOneNonDBGUse(Reg)) return 0; - MI = MRI.getVRegDef(Reg); + MachineInstr *MI = MRI.getVRegDef(Reg); if (!MI) return 0; + // MI is folded into the MOVCC by predicating it. + if (!MI->isPredicable()) + return 0; // Check if MI has any non-dead defs or physreg uses. This also detects // predicated instructions which will be reading CPSR. 
for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { @@ -1588,55 +1603,18 @@ static unsigned canFoldIntoMOVCC(unsigned Reg, MachineInstr *&MI, return 0; if (!MO.isReg()) continue; + // MI can't have any tied operands, that would conflict with predication. + if (MO.isTied()) + return 0; if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) return 0; if (MO.isDef() && !MO.isDead()) return 0; } - switch (MI->getOpcode()) { - default: return 0; - case ARM::ANDri: return ARM::ANDCCri; - case ARM::ANDrr: return ARM::ANDCCrr; - case ARM::ANDrsi: return ARM::ANDCCrsi; - case ARM::ANDrsr: return ARM::ANDCCrsr; - case ARM::t2ANDri: return ARM::t2ANDCCri; - case ARM::t2ANDrr: return ARM::t2ANDCCrr; - case ARM::t2ANDrs: return ARM::t2ANDCCrs; - case ARM::EORri: return ARM::EORCCri; - case ARM::EORrr: return ARM::EORCCrr; - case ARM::EORrsi: return ARM::EORCCrsi; - case ARM::EORrsr: return ARM::EORCCrsr; - case ARM::t2EORri: return ARM::t2EORCCri; - case ARM::t2EORrr: return ARM::t2EORCCrr; - case ARM::t2EORrs: return ARM::t2EORCCrs; - case ARM::ORRri: return ARM::ORRCCri; - case ARM::ORRrr: return ARM::ORRCCrr; - case ARM::ORRrsi: return ARM::ORRCCrsi; - case ARM::ORRrsr: return ARM::ORRCCrsr; - case ARM::t2ORRri: return ARM::t2ORRCCri; - case ARM::t2ORRrr: return ARM::t2ORRCCrr; - case ARM::t2ORRrs: return ARM::t2ORRCCrs; - - // ARM ADD/SUB - case ARM::ADDri: return ARM::ADDCCri; - case ARM::ADDrr: return ARM::ADDCCrr; - case ARM::ADDrsi: return ARM::ADDCCrsi; - case ARM::ADDrsr: return ARM::ADDCCrsr; - case ARM::SUBri: return ARM::SUBCCri; - case ARM::SUBrr: return ARM::SUBCCrr; - case ARM::SUBrsi: return ARM::SUBCCrsi; - case ARM::SUBrsr: return ARM::SUBCCrsr; - - // Thumb2 ADD/SUB - case ARM::t2ADDri: return ARM::t2ADDCCri; - case ARM::t2ADDri12: return ARM::t2ADDCCri12; - case ARM::t2ADDrr: return ARM::t2ADDCCrr; - case ARM::t2ADDrs: return ARM::t2ADDCCrs; - case ARM::t2SUBri: return ARM::t2SUBCCri; - case ARM::t2SUBri12: return ARM::t2SUBCCri12; - case ARM::t2SUBrr: return ARM::t2SUBCCrr; - case ARM::t2SUBrs: return ARM::t2SUBCCrs; - } + bool DontMoveAcrossStores = true; + if (!MI->isSafeToMove(TII, /* AliasAnalysis = */ 0, DontMoveAcrossStores)) + return 0; + return MI; } bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI, @@ -1665,19 +1643,18 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI, assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) && "Unknown select instruction"); const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - MachineInstr *DefMI = 0; - unsigned Opc = canFoldIntoMOVCC(MI->getOperand(2).getReg(), DefMI, MRI); - bool Invert = !Opc; - if (!Opc) - Opc = canFoldIntoMOVCC(MI->getOperand(1).getReg(), DefMI, MRI); - if (!Opc) + MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this); + bool Invert = !DefMI; + if (!DefMI) + DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this); + if (!DefMI) return 0; // Create a new predicated version of DefMI. // Rfalse is the first use. MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - get(Opc), MI->getOperand(0).getReg()) - .addOperand(MI->getOperand(Invert ? 2 : 1)); + DefMI->getDesc(), + MI->getOperand(0).getReg()); // Copy all the DefMI operands, excluding its (null) predicate. 
const MCInstrDesc &DefDesc = DefMI->getDesc(); @@ -1696,6 +1673,15 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI, if (NewMI->hasOptionalDef()) AddDefaultCC(NewMI); + // The output register value when the predicate is false is an implicit + // register operand tied to the first def. + // The tie makes the register allocator ensure the FalseReg is allocated the + // same register as operand 0. + MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1); + FalseReg.setImplicit(); + NewMI->addOperand(FalseReg); + NewMI->tieOperands(0, NewMI->getNumOperands() - 1); + // The caller will erase MI, but not DefMI. DefMI->eraseFromParent(); return NewMI; @@ -2042,13 +2028,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // Masked compares sometimes use the same register as the corresponding 'and'. if (CmpMask != ~0) { - if (!isSuitableForMask(MI, SrcReg, CmpMask, false)) { + if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) { MI = 0; for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SrcReg), UE = MRI->use_end(); UI != UE; ++UI) { if (UI->getParent() != CmpInstr->getParent()) continue; MachineInstr *PotentialAND = &*UI; - if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true)) + if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) || + isPredicated(PotentialAND)) continue; MI = PotentialAND; break; @@ -2114,6 +2101,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // The single candidate is called MI. if (!MI) MI = Sub; + // We can't use a predicated instruction - it doesn't always write the flags. + if (isPredicated(MI)) + return false; + switch (MI->getOpcode()) { default: break; case ARM::RSBrr: @@ -2220,6 +2211,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // Toggle the optional operand to CPSR. MI->getOperand(5).setReg(ARM::CPSR); MI->getOperand(5).setIsDef(true); + assert(!isPredicated(MI) && "Can't use flags from predicated instruction"); CmpInstr->eraseFromParent(); // Modify the condition code of operands in OperandsToUpdate. @@ -3366,7 +3358,8 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const { // converted. if (Subtarget.isCortexA9() && !isPredicated(MI) && (MI->getOpcode() == ARM::VMOVRS || - MI->getOpcode() == ARM::VMOVSR)) + MI->getOpcode() == ARM::VMOVSR || + MI->getOpcode() == ARM::VMOVS)) return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON)); // No other instructions can be swizzled, so just determine their domain. 
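The hunks below convert VMOVRS/VMOVSR/VMOVS into NEON operations on the containing D register. As a minimal sketch (not part of this patch; the helper name and numbering are illustrative), the S-to-D mapping that getCorrespondingDRegAndLane computes via getMatchingSuperReg follows the standard VFP/NEON aliasing, where sN overlaps d(N/2) at lane N%2:

    #include <utility>

    // Sketch only, assuming standard VFP/NEON register aliasing:
    // sN lives in d(N/2) at lane N%2, e.g. s5 -> d2 lane 1, s12 -> d6 lane 0.
    static std::pair<unsigned, unsigned> sregIndexToDRegLane(unsigned SIdx) {
      return std::make_pair(SIdx / 2, SIdx % 2);
    }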
@@ -3386,13 +3379,28 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const { return std::make_pair(ExeGeneric, 0); } +static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI, + unsigned SReg, unsigned &Lane) { + unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_0, &ARM::DPRRegClass); + Lane = 0; + + if (DReg != ARM::NoRegister) + return DReg; + + Lane = 1; + DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass); + + assert(DReg && "S-register with no D super-register?"); + return DReg; +} + + void ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { unsigned DstReg, SrcReg, DReg; unsigned Lane; MachineInstrBuilder MIB(MI); const TargetRegisterInfo *TRI = &getRegisterInfo(); - bool isKill; switch (MI->getOpcode()) { default: llvm_unreachable("cannot handle opcode!"); @@ -3403,78 +3411,175 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { // Zap the predicate operands. assert(!isPredicated(MI) && "Cannot predicate a VORRd"); - MI->RemoveOperand(3); - MI->RemoveOperand(2); - // Change to a VORRd which requires two identical use operands. - MI->setDesc(get(ARM::VORRd)); + // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); - // Add the extra source operand and new predicates. - // This will go before any implicit ops. - AddDefaultPred(MachineInstrBuilder(MI).addOperand(MI->getOperand(1))); + for (unsigned i = MI->getDesc().getNumOperands(); i; --i) + MI->RemoveOperand(i-1); + + // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits) + MI->setDesc(get(ARM::VORRd)); + AddDefaultPred(MIB.addReg(DstReg, RegState::Define) + .addReg(SrcReg) + .addReg(SrcReg)); break; case ARM::VMOVRS: if (Domain != ExeNEON) break; assert(!isPredicated(MI) && "Cannot predicate a VGETLN"); + // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits) DstReg = MI->getOperand(0).getReg(); SrcReg = MI->getOperand(1).getReg(); - DReg = TRI->getMatchingSuperReg(SrcReg, ARM::ssub_0, &ARM::DPRRegClass); - Lane = 0; - if (DReg == ARM::NoRegister) { - DReg = TRI->getMatchingSuperReg(SrcReg, ARM::ssub_1, &ARM::DPRRegClass); - Lane = 1; - assert(DReg && "S-register with no D super-register?"); - } + for (unsigned i = MI->getDesc().getNumOperands(); i; --i) + MI->RemoveOperand(i-1); - MI->RemoveOperand(3); - MI->RemoveOperand(2); - MI->RemoveOperand(1); + DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane); + // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps) + // Note that DSrc has been widened and the other lane may be undef, which + // contaminates the entire register. MI->setDesc(get(ARM::VGETLNi32)); - MIB.addReg(DReg); - MIB.addImm(Lane); + AddDefaultPred(MIB.addReg(DstReg, RegState::Define) + .addReg(DReg, RegState::Undef) + .addImm(Lane)); - MIB->getOperand(1).setIsUndef(); + // The old source should be an implicit use, otherwise we might think it + // was dead before here. 
MIB.addReg(SrcReg, RegState::Implicit); - - AddDefaultPred(MIB); break; case ARM::VMOVSR: if (Domain != ExeNEON) break; assert(!isPredicated(MI) && "Cannot predicate a VSETLN"); + // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits) DstReg = MI->getOperand(0).getReg(); SrcReg = MI->getOperand(1).getReg(); - DReg = TRI->getMatchingSuperReg(DstReg, ARM::ssub_0, &ARM::DPRRegClass); - Lane = 0; - if (DReg == ARM::NoRegister) { - DReg = TRI->getMatchingSuperReg(DstReg, ARM::ssub_1, &ARM::DPRRegClass); - Lane = 1; - assert(DReg && "S-register with no D super-register?"); - } - isKill = MI->getOperand(0).isKill(); - MI->RemoveOperand(3); - MI->RemoveOperand(2); - MI->RemoveOperand(1); - MI->RemoveOperand(0); + DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane); + // If we insert both a novel <def> and an <undef> on the DReg, we break + // any existing dependency chain on the unused lane. Either already being + // present means this instruction is in that chain anyway so we can make + // the transformation. + if (!MI->definesRegister(DReg, TRI) && !MI->readsRegister(DReg, TRI)) + break; + + for (unsigned i = MI->getDesc().getNumOperands(); i; --i) + MI->RemoveOperand(i-1); + + // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps) + // Again DDst may be undefined at the beginning of this instruction. MI->setDesc(get(ARM::VSETLNi32)); - MIB.addReg(DReg, RegState::Define); - MIB.addReg(DReg, RegState::Undef); - MIB.addReg(SrcReg); - MIB.addImm(Lane); + MIB.addReg(DReg, RegState::Define) + .addReg(DReg, getUndefRegState(!MI->readsRegister(DReg, TRI))) + .addReg(SrcReg) + .addImm(Lane); + AddDefaultPred(MIB); - if (isKill) - MIB->addRegisterKilled(DstReg, TRI, true); - MIB->addRegisterDefined(DstReg, TRI); + // The narrower destination must be marked as set to keep previous chains + // in place. + MIB.addReg(DstReg, RegState::Define | RegState::Implicit); + break; + case ARM::VMOVS: { + if (Domain != ExeNEON) + break; + // Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + + unsigned DstLane = 0, SrcLane = 0, DDst, DSrc; + DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane); + DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane); + + // If we insert both a novel <def> and an <undef> on the DReg, we break + // any existing dependency chain on the unused lane. Either already being + // present means this instruction is in that chain anyway so we can make + // the transformation. + if (!MI->definesRegister(DDst, TRI) && !MI->readsRegister(DDst, TRI)) + break; + + for (unsigned i = MI->getDesc().getNumOperands(); i; --i) + MI->RemoveOperand(i-1); + + if (DSrc == DDst) { + // Destination can be: + // %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits) + MI->setDesc(get(ARM::VDUPLN32d)); + MIB.addReg(DDst, RegState::Define) + .addReg(DDst, getUndefRegState(!MI->readsRegister(DDst, TRI))) + .addImm(SrcLane); + AddDefaultPred(MIB); + + // Neither the source or the destination are naturally represented any + // more, so add them in manually. + MIB.addReg(DstReg, RegState::Implicit | RegState::Define); + MIB.addReg(SrcReg, RegState::Implicit); + break; + } + + // In general there's no single instruction that can perform an S <-> S + // move in NEON space, but a pair of VEXT instructions *can* do the + // job. It turns out that the VEXTs needed will only use DSrc once, with + // the position based purely on the combination of lane-0 and lane-1 + // involved. 
For example + // vmov s0, s2 -> vext.32 d0, d0, d1, #1 vext.32 d0, d0, d0, #1 + // vmov s1, s3 -> vext.32 d0, d1, d0, #1 vext.32 d0, d0, d0, #1 + // vmov s0, s3 -> vext.32 d0, d0, d0, #1 vext.32 d0, d1, d0, #1 + // vmov s1, s2 -> vext.32 d0, d0, d0, #1 vext.32 d0, d0, d1, #1 + // + // Pattern of the MachineInstrs is: + // %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits) + MachineInstrBuilder NewMIB; + NewMIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + get(ARM::VEXTd32), DDst); + + // On the first instruction, both DSrc and DDst may be <undef> if present. + // Specifically when the original instruction didn't have them as an + // <imp-use>. + unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst; + bool CurUndef = !MI->readsRegister(CurReg, TRI); + NewMIB.addReg(CurReg, getUndefRegState(CurUndef)); + + CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst; + CurUndef = !MI->readsRegister(CurReg, TRI); + NewMIB.addReg(CurReg, getUndefRegState(CurUndef)); + + NewMIB.addImm(1); + AddDefaultPred(NewMIB); + + if (SrcLane == DstLane) + NewMIB.addReg(SrcReg, RegState::Implicit); + + MI->setDesc(get(ARM::VEXTd32)); + MIB.addReg(DDst, RegState::Define); + + // On the second instruction, DDst has definitely been defined above, so + // it is not <undef>. DSrc, if present, can be <undef> as above. + CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst; + CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI); + MIB.addReg(CurReg, getUndefRegState(CurUndef)); + + CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst; + CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI); + MIB.addReg(CurReg, getUndefRegState(CurUndef)); + + MIB.addImm(1); AddDefaultPred(MIB); + + if (SrcLane != DstLane) + MIB.addReg(SrcReg, RegState::Implicit); + + // As before, the original destination is no longer represented, add it + // implicitly. + MIB.addReg(DstReg, RegState::Define | RegState::Implicit); break; + } } } diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index 132b81f..89cbc84 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -410,7 +410,7 @@ bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) { do { DEBUG(errs() << "JITTing function '" - << MF.getFunction()->getName() << "'\n"); + << MF.getName() << "'\n"); MCE.startFunction(MF); for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB) { diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index a953985..dd05f0c 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -1388,10 +1388,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { // If the original WaterList entry was "new water" on this iteration, // propagate that to the new island. This is just keeping NewWaterList // updated to match the WaterList, which will be updated below. - if (NewWaterList.count(WaterBB)) { - NewWaterList.erase(WaterBB); + if (NewWaterList.erase(WaterBB)) NewWaterList.insert(NewIsland); - } + // The new CPE goes before the following block (NewMBB). 
NewMBB = llvm::next(MachineFunction::iterator(WaterBB)); diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 15bb32e..8ed6b75 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1208,6 +1208,57 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, ExpandLaneOp(MBBI); return true; + case ARM::VSETLNi8Q: + case ARM::VSETLNi16Q: { + // Expand VSETLNs acting on a Q register to equivalent VSETLNs acting + // on the respective D register. + + unsigned QReg = MI.getOperand(1).getReg(); + unsigned QLane = MI.getOperand(3).getImm(); + + unsigned NewOpcode, DLane, DSubReg; + switch (Opcode) { + default: llvm_unreachable("Invalid opcode!"); + case ARM::VSETLNi8Q: + // 4 possible 8-bit lanes per DPR: + NewOpcode = ARM::VSETLNi8; + DLane = QLane % 8; + DSubReg = (QLane / 8) ? ARM::dsub_1 : ARM::dsub_0; + break; + case ARM::VSETLNi16Q: + // 4 possible 16-bit lanes per DPR. + NewOpcode = ARM::VSETLNi16; + DLane = QLane % 4; + DSubReg = (QLane / 4) ? ARM::dsub_1 : ARM::dsub_0; + break; + } + + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpcode)); + + unsigned DReg = TRI->getSubReg(QReg, DSubReg); + + MIB.addReg(DReg, RegState::Define); // Output DPR + MIB.addReg(DReg); // Input DPR + MIB.addOperand(MI.getOperand(2)); // Input GPR + MIB.addImm(DLane); // Lane + + // Add the predicate operands. + MIB.addOperand(MI.getOperand(4)); + MIB.addOperand(MI.getOperand(5)); + + if (MI.getOperand(1).isKill()) // Add an implicit kill for the Q register. + MIB->addRegisterKilled(QReg, TRI, true); + // And an implicit def of the output register (which should always be the + // same as the input register). + MIB->addRegisterDefined(QReg, TRI); + + TransferImpOps(MI, MIB, MIB); + + MI.eraseFromParent(); + return true; + } + case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true; case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true; case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 5a5ca1b..045d904 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -617,10 +617,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) { if (VT != MVT::i32) return 0; Reloc::Model RelocM = TM.getRelocationModel(); - - // TODO: Need more magic for ARM PIC. - if (!isThumb2 && (RelocM == Reloc::PIC_)) return 0; - + bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM); unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); // Use movw+movt when possible, it avoids constant pool entries. @@ -668,17 +665,30 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) { .addConstantPoolIndex(Idx); if (RelocM == Reloc::PIC_) MIB.addImm(Id); + AddOptionalDefs(MIB); } else { // The extra immediate is for addrmode2. MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp), DestReg) .addConstantPoolIndex(Idx) .addImm(0); + AddOptionalDefs(MIB); + + if (RelocM == Reloc::PIC_) { + unsigned Opc = IsIndirect ? 
ARM::PICLDR : ARM::PICADD; + unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); + + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(Opc), NewDestReg) + .addReg(DestReg) + .addImm(Id); + AddOptionalDefs(MIB); + return NewDestReg; + } } - AddOptionalDefs(MIB); } - if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) { + if (IsIndirect) { MachineInstrBuilder MIB; unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); if (isThumb2) @@ -2212,25 +2222,17 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)); - if (isThumb2) { - // Explicitly adding the predicate here. + // BL / BLX don't take a predicate, but tBL / tBLX do. + if (isThumb2) AddDefaultPred(MIB); - if (EnableARMLongCalls) - MIB.addReg(CalleeReg); - else - MIB.addExternalSymbol(TLI.getLibcallName(Call)); - } else { - if (EnableARMLongCalls) - MIB.addReg(CalleeReg); - else - MIB.addExternalSymbol(TLI.getLibcallName(Call)); + if (EnableARMLongCalls) + MIB.addReg(CalleeReg); + else + MIB.addExternalSymbol(TLI.getLibcallName(Call)); - // Explicitly adding the predicate here. - AddDefaultPred(MIB); - } // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i]); + MIB.addReg(RegArgs[i], RegState::Implicit); // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). @@ -2358,30 +2360,20 @@ bool ARMFastISel::SelectCall(const Instruction *I, unsigned CallOpc = ARMSelectCallOp(UseReg); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)); - if(isThumb2) { - // Explicitly adding the predicate here. - AddDefaultPred(MIB); - if (UseReg) - MIB.addReg(CalleeReg); - else if (!IntrMemName) - MIB.addGlobalAddress(GV, 0, 0); - else - MIB.addExternalSymbol(IntrMemName, 0); - } else { - if (UseReg) - MIB.addReg(CalleeReg); - else if (!IntrMemName) - MIB.addGlobalAddress(GV, 0, 0); - else - MIB.addExternalSymbol(IntrMemName, 0); - // Explicitly adding the predicate here. + // ARM calls don't take a predicate, but tBL / tBLX do. + if(isThumb2) AddDefaultPred(MIB); - } + if (UseReg) + MIB.addReg(CalleeReg); + else if (!IntrMemName) + MIB.addGlobalAddress(GV, 0, 0); + else + MIB.addExternalSymbol(IntrMemName, 0); // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i]); + MIB.addReg(RegArgs[i], RegState::Implicit); // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). 
@@ -2650,7 +2642,7 @@ bool ARMFastISel::SelectShift(const Instruction *I, unsigned Reg1 = getRegForValue(Src1Value); if (Reg1 == 0) return false; - unsigned Reg2; + unsigned Reg2 = 0; if (Opc == ARM::MOVsr) { Reg2 = getRegForValue(Src2Value); if (Reg2 == 0) return false; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index c6f9d15..1406620 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2637,6 +2637,38 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { dl, MVT::i32, MVT::i32, Ops, 5); } } + case ARMISD::UMLAL:{ + if (Subtarget->isThumb()) { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32)}; + return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops, 6); + }else{ + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? + ARM::UMLAL : ARM::UMLALv5, + dl, MVT::i32, MVT::i32, Ops, 7); + } + } + case ARMISD::SMLAL:{ + if (Subtarget->isThumb()) { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32)}; + return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops, 6); + }else{ + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? + ARM::SMLAL : ARM::SMLALv5, + dl, MVT::i32, MVT::i32, Ops, 7); + } + } case ISD::LOAD: { SDNode *ResNode = 0; if (Subtarget->isThumb() && Subtarget->hasThumb2()) diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index df4039b..e51315e 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -514,6 +514,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); setOperationAction(ISD::FEXP, MVT::v4f32, Expand); setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); // Neon does not support some operations on v1i64 and v2i64 types. setOperationAction(ISD::MUL, MVT::v1i64, Expand); @@ -566,6 +567,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } } + // ARM and Thumb2 support UMLAL/SMLAL. + if (!Subtarget->isThumb1Only()) + setTargetDAGCombine(ISD::ADDC); + + computeRegisterProperties(); // ARM does not have f32 extending load. 
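The ISD::ADDC combine registered above feeds the UMLAL/SMLAL lowering added further down in this file. A minimal source-level sketch (hypothetical function, not taken from the patch) of the pattern it is meant to catch: a 32x32->64 multiply accumulated into a 64-bit value, which on 32-bit ARM legalizes to UMUL_LOHI plus ADDC/ADDE and should now select to a single UMLAL (SMLAL for the signed case):

    // Sketch only: with this patch, ARM/Thumb2 should emit a single umlal here
    // instead of umull followed by adds/adc.
    unsigned long long mac64(unsigned long long acc, unsigned a, unsigned b) {
      return acc + (unsigned long long)a * (unsigned long long)b;
    }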
@@ -791,12 +797,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::MUL); - - if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON()) { - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::XOR); - } + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::XOR); if (Subtarget->hasV6Ops()) setTargetDAGCombine(ISD::SRL); @@ -981,6 +984,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VTBL2: return "ARMISD::VTBL2"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; + case ARMISD::UMLAL: return "ARMISD::UMLAL"; + case ARMISD::SMLAL: return "ARMISD::SMLAL"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::FMAX: return "ARMISD::FMAX"; case ARMISD::FMIN: return "ARMISD::FMIN"; @@ -4154,10 +4159,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, } // Scan through the operands to see if only one value is used. + // + // As an optimisation, even if more than one value is used it may be more + // profitable to splat with one value then change some lanes. + // + // Heuristically we decide to do this if the vector has a "dominant" value, + // defined as splatted to more than half of the lanes. unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; + bool hasDominantValue = false; bool isConstant = true; + + // Map of the number of times a particular SDValue appears in the + // element list. + DenseMap<SDValue, unsigned> ValueCounts; SDValue Value; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); @@ -4168,13 +4184,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) isConstant = false; - if (!Value.getNode()) + ValueCounts.insert(std::make_pair(V, 0)); + unsigned &Count = ValueCounts[V]; + + // Is this value dominant? (takes up more than half of the lanes) + if (++Count > (NumElts / 2)) { + hasDominantValue = true; Value = V; - else if (V != Value) - usesOnlyOneValue = false; + } } + if (ValueCounts.size() != 1) + usesOnlyOneValue = false; + if (!Value.getNode() && ValueCounts.size() > 0) + Value = ValueCounts.begin()->first; - if (!Value.getNode()) + if (ValueCounts.size() == 0) return DAG.getUNDEF(VT); if (isOnlyLowElement) @@ -4184,9 +4208,34 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Use VDUP for non-constant splats. For f32 constant splats, reduce to // i32 and try again. - if (usesOnlyOneValue && EltSize <= 32) { - if (!isConstant) - return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + if (hasDominantValue && EltSize <= 32) { + if (!isConstant) { + SDValue N; + + // If we are VDUPing a value that comes directly from a vector, that will + // cause an unnecessary move to and from a GPR, where instead we could + // just use VDUPLANE. + if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) + N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, + Value->getOperand(0), Value->getOperand(1)); + else + N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); + + if (!usesOnlyOneValue) { + // The dominant value was splatted as 'N', but we now have to insert + // all differing elements. 
+ for (unsigned I = 0; I < NumElts; ++I) { + if (Op.getOperand(I) == Value) + continue; + SmallVector<SDValue, 3> Ops; + Ops.push_back(N); + Ops.push_back(Op.getOperand(I)); + Ops.push_back(DAG.getConstant(I, MVT::i32)); + N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3); + } + } + return N; + } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumElts; ++i) @@ -4198,9 +4247,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); } - SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); - if (Val.getNode()) - return DAG.getNode(ARMISD::VDUP, dl, VT, Val); + if (usesOnlyOneValue) { + SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); + if (isConstant && Val.getNode()) + return DAG.getNode(ARMISD::VDUP, dl, VT, Val); + } } // If all elements are constants and the case above didn't get hit, fall back @@ -5418,7 +5469,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, exitMBB->transferSuccessorsAndUpdatePHIs(BB); const TargetRegisterClass *TRC = isThumb2 ? - (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::rGPRRegClass : (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned scratch = MRI.createVirtualRegister(TRC); unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC); @@ -5529,7 +5580,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, exitMBB->transferSuccessorsAndUpdatePHIs(BB); const TargetRegisterClass *TRC = isThumb2 ? - (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::rGPRRegClass : (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned scratch = MRI.createVirtualRegister(TRC); unsigned scratch2 = MRI.createVirtualRegister(TRC); @@ -7193,6 +7244,154 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp); } +static SDValue findMUL_LOHI(SDValue V) { + if (V->getOpcode() == ISD::UMUL_LOHI || + V->getOpcode() == ISD::SMUL_LOHI) + return V; + return SDValue(); +} + +static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + if (Subtarget->isThumb1Only()) return SDValue(); + + // Only perform the checks after legalize when the pattern is available. + if (DCI.isBeforeLegalize()) return SDValue(); + + // Look for multiply add opportunities. + // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where + // each add nodes consumes a value from ISD::UMUL_LOHI and there is + // a glue link from the first add to the second add. + // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by + // a S/UMLAL instruction. + // loAdd UMUL_LOHI + // \ / :lo \ :hi + // \ / \ [no multiline comment] + // ADDC | hiAdd + // \ :glue / / + // \ / / + // ADDE + // + assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC"); + SDValue AddcOp0 = AddcNode->getOperand(0); + SDValue AddcOp1 = AddcNode->getOperand(1); + + // Check if the two operands are from the same mul_lohi node. + if (AddcOp0.getNode() == AddcOp1.getNode()) + return SDValue(); + + assert(AddcNode->getNumValues() == 2 && + AddcNode->getValueType(0) == MVT::i32 && + AddcNode->getValueType(1) == MVT::Glue && + "Expect ADDC with two result values: i32, glue"); + + // Check that the ADDC adds the low result of the S/UMUL_LOHI. 
+ if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && + AddcOp0->getOpcode() != ISD::SMUL_LOHI && + AddcOp1->getOpcode() != ISD::UMUL_LOHI && + AddcOp1->getOpcode() != ISD::SMUL_LOHI) + return SDValue(); + + // Look for the glued ADDE. + SDNode* AddeNode = AddcNode->getGluedUser(); + if (AddeNode == NULL) + return SDValue(); + + // Make sure it is really an ADDE. + if (AddeNode->getOpcode() != ISD::ADDE) + return SDValue(); + + assert(AddeNode->getNumOperands() == 3 && + AddeNode->getOperand(2).getValueType() == MVT::Glue && + "ADDE node has the wrong inputs"); + + // Check for the triangle shape. + SDValue AddeOp0 = AddeNode->getOperand(0); + SDValue AddeOp1 = AddeNode->getOperand(1); + + // Make sure that the ADDE operands are not coming from the same node. + if (AddeOp0.getNode() == AddeOp1.getNode()) + return SDValue(); + + // Find the MUL_LOHI node walking up ADDE's operands. + bool IsLeftOperandMUL = false; + SDValue MULOp = findMUL_LOHI(AddeOp0); + if (MULOp == SDValue()) + MULOp = findMUL_LOHI(AddeOp1); + else + IsLeftOperandMUL = true; + if (MULOp == SDValue()) + return SDValue(); + + // Figure out the right opcode. + unsigned Opc = MULOp->getOpcode(); + unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; + + // Figure out the high and low input values to the MLAL node. + SDValue* HiMul = &MULOp; + SDValue* HiAdd = NULL; + SDValue* LoMul = NULL; + SDValue* LowAdd = NULL; + + if (IsLeftOperandMUL) + HiAdd = &AddeOp1; + else + HiAdd = &AddeOp0; + + + if (AddcOp0->getOpcode() == Opc) { + LoMul = &AddcOp0; + LowAdd = &AddcOp1; + } + if (AddcOp1->getOpcode() == Opc) { + LoMul = &AddcOp1; + LowAdd = &AddcOp0; + } + + if (LoMul == NULL) + return SDValue(); + + if (LoMul->getNode() != HiMul->getNode()) + return SDValue(); + + // Create the merged node. + SelectionDAG &DAG = DCI.DAG; + + // Build operand list. + SmallVector<SDValue, 8> Ops; + Ops.push_back(LoMul->getOperand(0)); + Ops.push_back(LoMul->getOperand(1)); + Ops.push_back(*LowAdd); + Ops.push_back(*HiAdd); + + SDValue MLALNode = DAG.getNode(FinalOpc, AddcNode->getDebugLoc(), + DAG.getVTList(MVT::i32, MVT::i32), + &Ops[0], Ops.size()); + + // Replace the ADDs' nodes uses by the MLA node's values. + SDValue HiMLALResult(MLALNode.getNode(), 1); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); + + SDValue LoMLALResult(MLALNode.getNode(), 0); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); + + // Return original node to notify the driver to stop replacing. + SDValue resNode(AddcNode, 0); + return resNode; +} + +/// PerformADDCCombine - Target-specific dag combine transform from +/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL. +static SDValue PerformADDCCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + return AddCombineTo64bitMLAL(N, DCI, Subtarget); + +} + /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with /// operands N0 and N1. 
This is a helper for PerformADDCombine that is /// called with the default operands, and if that fails, with commuted @@ -8764,6 +8963,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; + case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget); case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); case ISD::SUB: return PerformSUBCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 13b83de..2b8f382 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -173,6 +173,9 @@ namespace llvm { VMULLs, // ...signed VMULLu, // ...unsigned + UMLAL, // 64bit Unsigned Accumulate Multiply + SMLAL, // 64bit Signed Accumulate Multiply + // Operands of the standard BUILD_VECTOR node are not legalized, which // is fine if BUILD_VECTORs are always lowered to shuffles or other // operations, but for ARM some BUILD_VECTORs are legal as-is and their @@ -257,6 +260,11 @@ namespace llvm { virtual const char *getTargetNodeName(unsigned Opcode) const; + virtual bool isSelectSupported(SelectSupportKind Kind) const { + // ARM does not support scalar condition selects on vectors. + return (Kind != ScalarCondVectorVal); + } + /// getSetCCResultType - Return the value type to use for ISD::SETCC. virtual EVT getSetCCResultType(EVT VT) const; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 992aba5..e23989e 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -83,6 +83,13 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, SDTCisInt<0>, SDTCisVT<1, i32>, SDTCisVT<4, i32>]>; + +def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>, + SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >; +def ARMUmlal : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>; +def ARMSmlal : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>; + // Node definitions. 
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; def ARMWrapperDYN : SDNode<"ARMISD::WrapperDYN", SDTIntUnaryOp>; @@ -90,9 +97,10 @@ def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>; def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>; def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart, - [SDNPHasChain, SDNPOutGlue]>; + [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + [SDNPHasChain, SDNPSideEffect, + SDNPOptInGlue, SDNPOutGlue]>; def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" , SDT_ARMStructByVal, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, @@ -148,14 +156,16 @@ def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>; def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>; def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", - SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain]>; + SDT_ARMEH_SJLJ_Setjmp, + [SDNPHasChain, SDNPSideEffect]>; def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", - SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>; + SDT_ARMEH_SJLJ_Longjmp, + [SDNPHasChain, SDNPSideEffect]>; def ARMMemBarrier : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER, - [SDNPHasChain]>; + [SDNPHasChain, SDNPSideEffect]>; def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER, - [SDNPHasChain]>; + [SDNPHasChain, SDNPSideEffect]>; def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH, [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; @@ -275,7 +285,7 @@ def imm16_31 : ImmLeaf<i32, [{ def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; } def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{ - int64_t Value = -(int)N->getZExtValue(); + unsigned Value = -(unsigned)N->getZExtValue(); return Value && ARM_AM::getSOImmVal(Value) != -1; }], imm_neg_XFORM> { let ParserMatchClass = so_imm_neg_asmoperand; @@ -1791,12 +1801,15 @@ def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label), let Inst{15-12} = Rd; let Inst{11-0} = label{11-0}; } + +let hasSideEffects = 1 in { def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p), 4, IIC_iALUi, []>; def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, nohash_imm:$id, pred:$p), 4, IIC_iALUi, []>; +} //===----------------------------------------------------------------------===// // Control Flow Instructions. @@ -3399,6 +3412,18 @@ class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, let Inst{11-8} = Rm; let Inst{3-0} = Rn; } +class AsMla1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} // FIXME: The v5 pseudos are only necessary for the additional Constraint // property. 
Remove them when it's possible to add those properties @@ -3481,14 +3506,14 @@ def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), } // Multiply + accumulate -def SMLAL : AsMul1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, +def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64, "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV6]>; -def UMLAL : AsMul1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>; +def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64, "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV6]>; + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>; def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi), (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, @@ -3504,17 +3529,22 @@ def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi), let Inst{3-0} = Rn; } -let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { +let Constraints = "$RLo = $RdLo,$RHi = $RdHi" in { def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s), 4, IIC_iMAC64, [], - (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, + pred:$p, cc_out:$s)>, Requires<[IsARM, NoV6]>; def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s), 4, IIC_iMAC64, [], - (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, + pred:$p, cc_out:$s)>, Requires<[IsARM, NoV6]>; +} + +let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { def UMAALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), (ins GPR:$Rn, GPR:$Rm, pred:$p), 4, IIC_iMAC64, [], @@ -3986,48 +4016,6 @@ def MVNCCi : ARMPseudoInst<(outs GPR:$Rd), [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>, RegConstraint<"$false = $Rd">; -// Conditional instructions -multiclass AsI1_bincc_irs<Instruction iri, Instruction irr, Instruction irsi, - Instruction irsr, - InstrItinClass iii, InstrItinClass iir, - InstrItinClass iis> { - def ri : ARMPseudoExpand<(outs GPR:$Rd), - (ins GPR:$Rfalse, GPR:$Rn, so_imm:$imm, - pred:$p, cc_out:$s), - 4, iii, [], - (iri GPR:$Rd, GPR:$Rn, so_imm:$imm, pred:$p, cc_out:$s)>, - RegConstraint<"$Rfalse = $Rd">; - def rr : ARMPseudoExpand<(outs GPR:$Rd), - (ins GPR:$Rfalse, GPR:$Rn, GPR:$Rm, - pred:$p, cc_out:$s), - 4, iir, [], - (irr GPR:$Rd, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, - RegConstraint<"$Rfalse = $Rd">; - def rsi : ARMPseudoExpand<(outs GPR:$Rd), - (ins GPR:$Rfalse, GPR:$Rn, so_reg_imm:$shift, - pred:$p, cc_out:$s), - 4, iis, [], - (irsi GPR:$Rd, GPR:$Rn, so_reg_imm:$shift, pred:$p, cc_out:$s)>, - RegConstraint<"$Rfalse = $Rd">; - def rsr : ARMPseudoExpand<(outs GPRnopc:$Rd), - (ins GPRnopc:$Rfalse, GPRnopc:$Rn, so_reg_reg:$shift, - pred:$p, cc_out:$s), - 4, iis, [], - (irsr GPR:$Rd, GPR:$Rn, so_reg_reg:$shift, pred:$p, cc_out:$s)>, - RegConstraint<"$Rfalse = $Rd">; -} - -defm ANDCC : AsI1_bincc_irs<ANDri, ANDrr, ANDrsi, ANDrsr, - IIC_iBITi, IIC_iBITr, IIC_iBITsr>; -defm ORRCC : 
AsI1_bincc_irs<ORRri, ORRrr, ORRrsi, ORRrsr, - IIC_iBITi, IIC_iBITr, IIC_iBITsr>; -defm EORCC : AsI1_bincc_irs<EORri, EORrr, EORrsi, EORrsr, - IIC_iBITi, IIC_iBITr, IIC_iBITsr>; -defm ADDCC : AsI1_bincc_irs<ADDri, ADDrr, ADDrsi, ADDrsr, - IIC_iBITi, IIC_iBITr, IIC_iBITsr>; -defm SUBCC : AsI1_bincc_irs<SUBri, SUBrr, SUBrsi, SUBrsr, - IIC_iBITi, IIC_iBITr, IIC_iBITsr>; - } // neverHasSideEffects diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 048d340..8158a11 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -1980,7 +1980,7 @@ def VST1LNd8 : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8, def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16, NEONvgetlaneu, addrmode6> { let Inst{7-6} = lane{1-0}; - let Inst{4} = Rn{5}; + let Inst{4} = Rn{4}; } def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt, @@ -2023,7 +2023,7 @@ def VST1LNd8_UPD : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8, def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16, NEONvgetlaneu, addrmode6> { let Inst{7-6} = lane{1-0}; - let Inst{4} = Rn{5}; + let Inst{4} = Rn{4}; } def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store, extractelt, addrmode6oneL32> { @@ -5045,25 +5045,23 @@ def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V), GPR:$R, imm:$lane))]> { let Inst{21} = lane{0}; } + +def VSETLNi8Q : PseudoNeonI<(outs QPR:$V), + (ins QPR:$src1, GPR:$R, VectorIndex8:$lane), + IIC_VMOVISL, "", + [(set QPR:$V, (vector_insert (v16i8 QPR:$src1), + GPR:$R, imm:$lane))]>; +def VSETLNi16Q : PseudoNeonI<(outs QPR:$V), + (ins QPR:$src1, GPR:$R, VectorIndex16:$lane), + IIC_VMOVISL, "", + [(set QPR:$V, (vector_insert (v8i16 QPR:$src1), + GPR:$R, imm:$lane))]>; } -def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane), - (v16i8 (INSERT_SUBREG QPR:$src1, - (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1, - (DSubReg_i8_reg imm:$lane))), - GPR:$src2, (SubReg_i8_lane imm:$lane))), - (DSubReg_i8_reg imm:$lane)))>; -def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane), - (v8i16 (INSERT_SUBREG QPR:$src1, - (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1, - (DSubReg_i16_reg imm:$lane))), - GPR:$src2, (SubReg_i16_lane imm:$lane))), - (DSubReg_i16_reg imm:$lane)))>; + def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane), - (v4i32 (INSERT_SUBREG QPR:$src1, - (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1, - (DSubReg_i32_reg imm:$lane))), - GPR:$src2, (SubReg_i32_lane imm:$lane))), - (DSubReg_i32_reg imm:$lane)))>; + (v4i32 (INSERT_SUBREG QPR:$src1, + GPR:$src2, + (SSubReg_f32_reg imm:$lane)))>; def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)), (INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)), diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 554f6d9..e171f8b 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -1200,6 +1200,7 @@ let neverHasSideEffects = 1, isReMaterializable = 1 in def tLEApcrel : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p), 2, IIC_iALUi, []>; +let hasSideEffects = 1 in def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, nohash_imm:$id, pred:$p), 2, IIC_iALUi, []>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 8ecf009..f1a6cce 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -523,6 +523,23 @@ class T2MulLong<bits<3> 
opc22_20, bits<4> opc7_4, let Inst{7-4} = opc7_4; let Inst{3-0} = Rm; } +class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rn; + bits<4> Rm; + + let Inst{31-23} = 0b111110111; + let Inst{22-20} = opc22_20; + let Inst{19-16} = Rn; + let Inst{15-12} = RdLo; + let Inst{11-8} = RdHi; + let Inst{7-4} = opc7_4; + let Inst{3-0} = Rm; +} /// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a @@ -757,33 +774,6 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode, let Inst{24} = 1; let Inst{23-21} = op23_21; } - - // Predicated versions. - def CCri : t2PseudoExpand<(outs GPRnopc:$Rd), - (ins GPRnopc:$Rfalse, GPRnopc:$Rn, t2_so_imm:$imm, - pred:$p, cc_out:$s), 4, IIC_iALUi, [], - (!cast<Instruction>(NAME#ri) GPRnopc:$Rd, - GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>, - RegConstraint<"$Rfalse = $Rd">; - def CCri12 : t2PseudoExpand<(outs GPRnopc:$Rd), - (ins GPRnopc:$Rfalse, GPR:$Rn, imm0_4095:$imm, - pred:$p), - 4, IIC_iALUi, [], - (!cast<Instruction>(NAME#ri12) GPRnopc:$Rd, - GPR:$Rn, imm0_4095:$imm, pred:$p)>, - RegConstraint<"$Rfalse = $Rd">; - def CCrr : t2PseudoExpand<(outs GPRnopc:$Rd), - (ins GPRnopc:$Rfalse, GPRnopc:$Rn, rGPR:$Rm, - pred:$p, cc_out:$s), 4, IIC_iALUr, [], - (!cast<Instruction>(NAME#rr) GPRnopc:$Rd, - GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>, - RegConstraint<"$Rfalse = $Rd">; - def CCrs : t2PseudoExpand<(outs GPRnopc:$Rd), - (ins GPRnopc:$Rfalse, GPRnopc:$Rn, t2_so_reg:$Rm, - pred:$p, cc_out:$s), 4, IIC_iALUsi, [], - (!cast<Instruction>(NAME#rs) GPRnopc:$Rd, - GPRnopc:$Rn, t2_so_reg:$Rm, pred:$p, cc_out:$s)>, - RegConstraint<"$Rfalse = $Rd">; } /// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns @@ -1200,6 +1190,7 @@ def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd), let neverHasSideEffects = 1, isReMaterializable = 1 in def t2LEApcrel : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p), 4, IIC_iALUi, []>; +let hasSideEffects = 1 in def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, nohash_imm:$id, pred:$p), 4, IIC_iALUi, @@ -2437,15 +2428,17 @@ def t2UMULL : T2MulLong<0b010, 0b0000, } // isCommutable // Multiply + accumulate -def t2SMLAL : T2MulLong<0b100, 0b0000, +def t2SMLAL : T2MlaLong<0b100, 0b0000, (outs rGPR:$RdLo, rGPR:$RdHi), - (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, - "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>; + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64, + "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">; -def t2UMLAL : T2MulLong<0b110, 0b0000, +def t2UMLAL : T2MlaLong<0b110, 0b0000, (outs rGPR:$RdLo, rGPR:$RdHi), - (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, - "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>; + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64, + "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">; def t2UMAAL : T2MulLong<0b110, 0b0110, (outs rGPR:$RdLo, rGPR:$RdHi), @@ -3049,37 +3042,6 @@ def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd), RegConstraint<"$false = $Rd">; } // isCodeGenOnly = 1 -multiclass T2I_bincc_irs<Instruction iri, Instruction irr, Instruction irs, - InstrItinClass iii, InstrItinClass iir, InstrItinClass iis> { - // shifted imm - def ri : t2PseudoExpand<(outs rGPR:$Rd), - (ins rGPR:$Rfalse, rGPR:$Rn, t2_so_imm:$imm, - pred:$p, cc_out:$s), - 4, iii, [], - (iri rGPR:$Rd, rGPR:$Rn, 
t2_so_imm:$imm, pred:$p, cc_out:$s)>, - RegConstraint<"$Rfalse = $Rd">; - // register - def rr : t2PseudoExpand<(outs rGPR:$Rd), - (ins rGPR:$Rfalse, rGPR:$Rn, rGPR:$Rm, - pred:$p, cc_out:$s), - 4, iir, [], - (irr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>, - RegConstraint<"$Rfalse = $Rd">; - // shifted register - def rs : t2PseudoExpand<(outs rGPR:$Rd), - (ins rGPR:$Rfalse, rGPR:$Rn, t2_so_reg:$ShiftedRm, - pred:$p, cc_out:$s), - 4, iis, [], - (irs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>, - RegConstraint<"$Rfalse = $Rd">; -} // T2I_bincc_irs - -defm t2ANDCC : T2I_bincc_irs<t2ANDri, t2ANDrr, t2ANDrs, - IIC_iBITi, IIC_iBITr, IIC_iBITsi>; -defm t2ORRCC : T2I_bincc_irs<t2ORRri, t2ORRrr, t2ORRrs, - IIC_iBITi, IIC_iBITr, IIC_iBITsi>; -defm t2EORCC : T2I_bincc_irs<t2EORri, t2EORrr, t2EORrs, - IIC_iBITi, IIC_iBITr, IIC_iBITsi>; } // neverHasSideEffects //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp index a812e21..2dc49a9 100644 --- a/lib/Target/ARM/ARMJITInfo.cpp +++ b/lib/Target/ARM/ARMJITInfo.cpp @@ -169,7 +169,7 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn, intptr_t LazyPtr = getIndirectSymAddr(Fn); if (!LazyPtr) { // In PIC mode, the function stub is loading a lazy-ptr. - LazyPtr= (intptr_t)emitGlobalValueIndirectSym((GlobalValue*)F, Fn, JCE); + LazyPtr= (intptr_t)emitGlobalValueIndirectSym((const GlobalValue*)F, Fn, JCE); DEBUG(if (F) errs() << "JIT: Indirect symbol emitted at [" << LazyPtr << "] for GV '" << F->getName() << "'\n"; diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 3a5957b..e1e2f6e 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -181,49 +181,44 @@ class ARMAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index); // Asm Match Converter Methods - bool cvtT2LdrdPre(MCInst &Inst, unsigned Opcode, - const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtT2StrdPre(MCInst &Inst, unsigned Opcode, - const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode, + void cvtT2LdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); + void cvtT2StrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); + void cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode, + void cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, + void cvtLdWriteBackRegAddrMode2(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode, + void cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode, + void cvtStWriteBackRegAddrModeImm12(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, + void cvtStWriteBackRegAddrMode2(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, + void cvtStWriteBackRegAddrMode3(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool 
cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode, + void cvtLdExtTWriteBackImm(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode, + void cvtLdExtTWriteBackReg(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode, + void cvtStExtTWriteBackImm(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode, + void cvtStExtTWriteBackReg(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtLdrdPre(MCInst &Inst, unsigned Opcode, - const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtStrdPre(MCInst &Inst, unsigned Opcode, - const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, + void cvtLdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); + void cvtStrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); + void cvtLdWriteBackRegAddrMode3(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtThumbMultiply(MCInst &Inst, unsigned Opcode, + void cvtThumbMultiply(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtVLDwbFixed(MCInst &Inst, unsigned Opcode, + void cvtVLDwbFixed(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtVLDwbRegister(MCInst &Inst, unsigned Opcode, + void cvtVLDwbRegister(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtVSTwbFixed(MCInst &Inst, unsigned Opcode, + void cvtVSTwbFixed(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool cvtVSTwbRegister(MCInst &Inst, unsigned Opcode, + void cvtVSTwbRegister(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &); - bool validateInstruction(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Ops); bool processInstruction(MCInst &Inst, @@ -267,6 +262,12 @@ public: bool MatchAndEmitInstruction(SMLoc IDLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out); + + unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands, + unsigned OperandNum, unsigned &NumMCOperands) { + return getMCInstOperandNumImpl(Kind, Inst, Operands, OperandNum, NumMCOperands); + } }; } // end anonymous namespace @@ -3880,8 +3881,8 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { /// cvtT2LdrdPre - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtT2LdrdPre(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtT2LdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Rt, Rt2 ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); @@ -3892,14 +3893,13 @@ cvtT2LdrdPre(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtT2StrdPre - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtT2StrdPre(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtT2StrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Create a writeback register dummy placeholder. 
Inst.addOperand(MCOperand::CreateReg(0)); @@ -3910,14 +3910,13 @@ cvtT2StrdPre(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtLdWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); @@ -3926,28 +3925,26 @@ cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2); ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtStWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Create a writeback register dummy placeholder. Inst.addOperand(MCOperand::CreateImm(0)); ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2); ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtLdWriteBackRegAddrMode2(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); @@ -3956,14 +3953,13 @@ cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3); ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtLdWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); @@ -3972,57 +3968,53 @@ cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2); ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtStWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtStWriteBackRegAddrModeImm12(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Create a writeback register dummy placeholder. 
Inst.addOperand(MCOperand::CreateImm(0)); ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2); ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtStWriteBackRegAddrMode2(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Create a writeback register dummy placeholder. Inst.addOperand(MCOperand::CreateImm(0)); ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3); ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtStWriteBackRegAddrMode3(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Create a writeback register dummy placeholder. Inst.addOperand(MCOperand::CreateImm(0)); ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3); ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtLdExtTWriteBackImm - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtLdExtTWriteBackImm(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Rt ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); @@ -4034,14 +4026,13 @@ cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtLdExtTWriteBackReg - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtLdExtTWriteBackReg(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Rt ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); @@ -4053,14 +4044,13 @@ cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtStExtTWriteBackImm - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtStExtTWriteBackImm(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Create a writeback register dummy placeholder. 
Inst.addOperand(MCOperand::CreateImm(0)); @@ -4072,14 +4062,13 @@ cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtStExtTWriteBackReg - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtStExtTWriteBackReg(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Create a writeback register dummy placeholder. Inst.addOperand(MCOperand::CreateImm(0)); @@ -4091,14 +4080,13 @@ cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtLdrdPre - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtLdrdPre(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtLdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Rt, Rt2 ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); @@ -4109,14 +4097,13 @@ cvtLdrdPre(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtStrdPre - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtStrdPre(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtStrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Create a writeback register dummy placeholder. Inst.addOperand(MCOperand::CreateImm(0)); @@ -4127,40 +4114,27 @@ cvtStrdPre(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// cvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtLdWriteBackRegAddrMode3(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); // Create a writeback register dummy placeholder. Inst.addOperand(MCOperand::CreateImm(0)); ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3); ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } -/// cvtThumbMultiple- Convert parsed operands to MCInst. +/// cvtThumbMultiply - Convert parsed operands to MCInst. /// Needed here because the Asm Gen Matcher can't handle properly tied operands /// when they refer multiple MIOperands inside a single one. -bool ARMAsmParser:: -cvtThumbMultiply(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtThumbMultiply(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { - // The second source operand must be the same register as the destination - // operand. 
- if (Operands.size() == 6 && - (((ARMOperand*)Operands[3])->getReg() != - ((ARMOperand*)Operands[5])->getReg()) && - (((ARMOperand*)Operands[3])->getReg() != - ((ARMOperand*)Operands[4])->getReg())) { - Error(Operands[3]->getStartLoc(), - "destination register must match source register"); - return false; - } ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1); ((ARMOperand*)Operands[1])->addCCOutOperands(Inst, 1); // If we have a three-operand form, make sure to set Rn to be the operand @@ -4173,12 +4147,10 @@ cvtThumbMultiply(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[RegOp])->addRegOperands(Inst, 1); Inst.addOperand(Inst.getOperand(0)); ((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2); - - return true; } -bool ARMAsmParser:: -cvtVLDwbFixed(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtVLDwbFixed(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Vd ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1); @@ -4188,11 +4160,10 @@ cvtVLDwbFixed(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } -bool ARMAsmParser:: -cvtVLDwbRegister(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtVLDwbRegister(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Vd ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1); @@ -4204,11 +4175,10 @@ cvtVLDwbRegister(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } -bool ARMAsmParser:: -cvtVSTwbFixed(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtVSTwbFixed(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Create a writeback register dummy placeholder. Inst.addOperand(MCOperand::CreateImm(0)); @@ -4218,11 +4188,10 @@ cvtVSTwbFixed(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } -bool ARMAsmParser:: -cvtVSTwbRegister(MCInst &Inst, unsigned Opcode, +void ARMAsmParser:: +cvtVSTwbRegister(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Create a writeback register dummy placeholder. Inst.addOperand(MCOperand::CreateImm(0)); @@ -4234,7 +4203,6 @@ cvtVSTwbRegister(MCInst &Inst, unsigned Opcode, ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1); // pred ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); - return true; } /// Parse an ARM memory expression, return false if successful else return true @@ -5377,6 +5345,25 @@ validateInstruction(MCInst &Inst, "in register list"); break; } + case ARM::tMUL: { + // The second source operand must be the same register as the destination + // operand. + // + // In this case, we must directly check the parsed operands because the + // cvtThumbMultiply() function is written in such a way that it guarantees + // this first statement is always true for the new Inst. Essentially, the + // destination is unconditionally copied into the second source operand + // without checking to see if it matches what we actually parsed. 
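For illustration only (this sketch is not part of the patch): the tMUL operand check being moved into validateInstruction() here enforces one rule — the parsed destination register must match one of the parsed source registers. A minimal standalone model of that predicate; the helper name and register strings are invented for the example:

  #include <cassert>
  #include <string>

  // Mirrors the operand rule checked for tMUL: the destination must equal
  // either source operand, otherwise the instruction is rejected.
  static bool tMulOperandsValid(const std::string &Rd, const std::string &Rn,
                                const std::string &Rm) {
    return Rd == Rn || Rd == Rm;
  }

  int main() {
    assert(tMulOperandsValid("r0", "r1", "r0"));   // accepted
    assert(tMulOperandsValid("r0", "r0", "r1"));   // accepted
    assert(!tMulOperandsValid("r0", "r1", "r2"));  // "destination register must match source register"
    return 0;
  }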
+ if (Operands.size() == 6 && + (((ARMOperand*)Operands[3])->getReg() != + ((ARMOperand*)Operands[5])->getReg()) && + (((ARMOperand*)Operands[3])->getReg() != + ((ARMOperand*)Operands[4])->getReg())) { + return Error(Operands[3]->getStartLoc(), + "destination register must match source register"); + } + break; + } // Like for ldm/stm, push and pop have hi-reg handling version in Thumb2, // so only issue a diagnostic for thumb1. The instructions will be // switched to the t2 encodings in processInstruction() if necessary. @@ -7475,9 +7462,11 @@ MatchAndEmitInstruction(SMLoc IDLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out) { MCInst Inst; + unsigned Kind; unsigned ErrorInfo; unsigned MatchResult; - MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo); + + MatchResult = MatchInstructionImpl(Operands, Kind, Inst, ErrorInfo); switch (MatchResult) { default: break; case Match_Success: @@ -7540,9 +7529,6 @@ MatchAndEmitInstruction(SMLoc IDLoc, case Match_MnemonicFail: return Error(IDLoc, "invalid instruction", ((ARMOperand*)Operands[0])->getLocRange()); - case Match_ConversionFail: - // The converter function will have already emitted a diagnostic. - return true; case Match_RequiresNotITBlock: return Error(IDLoc, "flag setting instruction only valid outside IT block"); case Match_RequiresITBlock: diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index c90751d..57642e1 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -2701,6 +2701,8 @@ static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn, unsigned align = fieldFromInstruction(Insn, 4, 1); unsigned size = fieldFromInstruction(Insn, 6, 2); + if (size == 0 && align == 1) + return MCDisassembler::Fail; align *= (1 << size); switch (Inst.getOpcode()) { @@ -2831,6 +2833,8 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn, unsigned align = fieldFromInstruction(Insn, 4, 1); if (size == 0x3) { + if (align == 0) + return MCDisassembler::Fail; size = 4; align = 16; } else { @@ -3170,7 +3174,7 @@ static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, int imm = Val & 0xFF; if (!(Val & 0x100)) imm *= -1; - Inst.addOperand(MCOperand::CreateImm(imm << 2)); + Inst.addOperand(MCOperand::CreateImm(imm * 4)); } return MCDisassembler::Success; @@ -3710,8 +3714,16 @@ static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, if (fieldFromInstruction(Insn, 6, 1)) return MCDisassembler::Fail; // UNDEFINED index = fieldFromInstruction(Insn, 7, 1); - if (fieldFromInstruction(Insn, 4, 2) != 0) - align = 4; + + switch (fieldFromInstruction(Insn, 4, 2)) { + case 0 : + align = 0; break; + case 3: + align = 4; break; + default: + return MCDisassembler::Fail; + } + break; } if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) @@ -3769,8 +3781,16 @@ static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, if (fieldFromInstruction(Insn, 6, 1)) return MCDisassembler::Fail; // UNDEFINED index = fieldFromInstruction(Insn, 7, 1); - if (fieldFromInstruction(Insn, 4, 2) != 0) - align = 4; + + switch (fieldFromInstruction(Insn, 4, 2)) { + case 0: + align = 0; break; + case 3: + align = 4; break; + default: + return MCDisassembler::Fail; + } + break; } if (Rm != 0xF) { // Writeback @@ -4090,8 +4110,15 @@ static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, inc = 2; break; case 2: - if (fieldFromInstruction(Insn, 4, 2)) - align = 4 << 
fieldFromInstruction(Insn, 4, 2); + switch (fieldFromInstruction(Insn, 4, 2)) { + case 0: + align = 0; break; + case 3: + return MCDisassembler::Fail; + default: + align = 4 << fieldFromInstruction(Insn, 4, 2); break; + } + index = fieldFromInstruction(Insn, 7, 1); if (fieldFromInstruction(Insn, 6, 1)) inc = 2; @@ -4164,8 +4191,15 @@ static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, inc = 2; break; case 2: - if (fieldFromInstruction(Insn, 4, 2)) - align = 4 << fieldFromInstruction(Insn, 4, 2); + switch (fieldFromInstruction(Insn, 4, 2)) { + case 0: + align = 0; break; + case 3: + return MCDisassembler::Fail; + default: + align = 4 << fieldFromInstruction(Insn, 4, 2); break; + } + index = fieldFromInstruction(Insn, 7, 1); if (fieldFromInstruction(Insn, 6, 1)) inc = 2; diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 7d6acbc..b53da3b 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -194,6 +194,10 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case ARM::fixup_arm_uncondbranch: Type = ELF::R_ARM_JUMP24; break; + case ARM::fixup_t2_condbranch: + case ARM::fixup_t2_uncondbranch: + Type = ELF::R_ARM_THM_JUMP24; + break; case ARM::fixup_arm_movt_hi16: case ARM::fixup_arm_movt_hi16_pcrel: Type = ELF::R_ARM_MOVT_PREL; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index d32805e..c1aab9c 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -50,7 +50,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() { Code32Directive = ".code\t32"; WeakRefDirective = "\t.weak\t"; - LCOMMDirectiveType = LCOMM::NoAlignment; HasLEB128 = true; SupportsDebugInformation = true; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 94f1082..1917564 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -783,7 +783,7 @@ getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx, // Immediate is always encoded as positive. The 'U' bit controls add vs sub. if (Imm8 < 0) - Imm8 = -Imm8; + Imm8 = -(uint32_t)Imm8; // Scaled by 4. Imm8 /= 4; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index a51e0fa..95640f7 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -410,7 +410,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer, if (Type == macho::RIT_ARM_Half) { // The other-half value only gets populated for the movt and movw // relocation entries. 
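For illustration only (not part of the patch): the two immediate-handling tweaks above — computing imm * 4 instead of imm << 2, and negating Imm8 through uint32_t — presumably keep the arithmetic well defined when the signed value is negative, since left-shifting a negative int is undefined (or at best implementation-defined) in the C++ dialects of this era, while multiplication and unsigned wrap-around negation are not. A tiny standalone sketch with invented values:

  #include <cassert>
  #include <cstdint>

  int main() {
    int imm = -3;
    assert(imm * 4 == -12);      // well defined; "imm << 2" would not be for a negative imm

    int Imm8 = -8;
    Imm8 = -(uint32_t)Imm8;      // unsigned negation wraps modulo 2^32; yields the magnitude here
    assert(Imm8 == 8);
    return 0;
  }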
- uint32_t Value = 0;; + uint32_t Value = 0; switch ((unsigned)Fixup.getKind()) { default: break; case ARM::fixup_arm_movw_lo16: diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp index 03d5a9a..3396e8b 100644 --- a/lib/Target/CellSPU/SPUAsmPrinter.cpp +++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp @@ -130,8 +130,7 @@ namespace { void printS10ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { - short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16) - >> 16); + short value = MI->getOperand(OpNo).getImm(); assert((value >= -(1 << 9) && value <= (1 << 9) - 1) && "Invalid s10 argument"); O << value; @@ -140,8 +139,7 @@ namespace { void printU10ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { - short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16) - >> 16); + short value = MI->getOperand(OpNo).getImm(); assert((value <= (1 << 10) - 1) && "Invalid u10 argument"); O << value; } diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index c27caea..425371d 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -83,12 +83,10 @@ namespace { return true; } else if (vt == MVT::i32) { int32_t i_val = (int32_t) CN->getZExtValue(); - short s_val = (short) i_val; - return i_val == s_val; + return i_val == SignExtend32<16>(i_val); } else { int64_t i_val = (int64_t) CN->getZExtValue(); - short s_val = (short) i_val; - return i_val == s_val; + return i_val == SignExtend64<16>(i_val); } } @@ -99,9 +97,10 @@ namespace { EVT vt = FPN->getValueType(0); if (vt == MVT::f32) { int val = FloatToBits(FPN->getValueAPF().convertToFloat()); - int sval = (int) ((val << 16) >> 16); - Imm = (short) val; - return val == sval; + if (val == SignExtend32<16>(val)) { + Imm = (short) val; + return true; + } } return false; diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt index 1f2d8ac..306084b 100644 --- a/lib/Target/Hexagon/CMakeLists.txt +++ b/lib/Target/Hexagon/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_target(HexagonCodeGen HexagonExpandPredSpillCode.cpp HexagonFrameLowering.cpp HexagonHardwareLoops.cpp + HexagonMachineScheduler.cpp HexagonMCInstLower.cpp HexagonInstrInfo.cpp HexagonISelDAGToDAG.cpp diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp new file mode 100644 index 0000000..b131a8f --- /dev/null +++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -0,0 +1,952 @@ +//===- HexagonMachineScheduler.cpp - MI Scheduler for Hexagon -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// MachineScheduler schedules machine instructions after phi elimination. It +// preserves LiveIntervals so it can be invoked before register allocation. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "misched" + +#include "HexagonMachineScheduler.h" + +#include <queue> + +using namespace llvm; + +static cl::opt<bool> ForceTopDown("vliw-misched-topdown", cl::Hidden, + cl::desc("Force top-down list scheduling")); +static cl::opt<bool> ForceBottomUp("vliw-misched-bottomup", cl::Hidden, + cl::desc("Force bottom-up list scheduling")); + +#ifndef NDEBUG +static cl::opt<bool> ViewMISchedDAGs("vliw-view-misched-dags", cl::Hidden, + cl::desc("Pop up a window to show MISched dags after they are processed")); + +static cl::opt<unsigned> MISchedCutoff("vliw-misched-cutoff", cl::Hidden, + cl::desc("Stop scheduling after N instructions"), cl::init(~0U)); +#else +static bool ViewMISchedDAGs = false; +#endif // NDEBUG + +/// Decrement this iterator until reaching the top or a non-debug instr. +static MachineBasicBlock::iterator +priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) { + assert(I != Beg && "reached the top of the region, cannot decrement"); + while (--I != Beg) { + if (!I->isDebugValue()) + break; + } + return I; +} + +/// If this iterator is a debug value, increment until reaching the End or a +/// non-debug instruction. +static MachineBasicBlock::iterator +nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) { + for(; I != End; ++I) { + if (!I->isDebugValue()) + break; + } + return I; +} + +/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When +/// NumPredsLeft reaches zero, release the successor node. +/// +/// FIXME: Adjust SuccSU height based on MinLatency. +void VLIWMachineScheduler::releaseSucc(SUnit *SU, SDep *SuccEdge) { + SUnit *SuccSU = SuccEdge->getSUnit(); + +#ifndef NDEBUG + if (SuccSU->NumPredsLeft == 0) { + dbgs() << "*** Scheduling failed! ***\n"; + SuccSU->dump(this); + dbgs() << " has been released too many times!\n"; + llvm_unreachable(0); + } +#endif + --SuccSU->NumPredsLeft; + if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) + SchedImpl->releaseTopNode(SuccSU); +} + +/// releaseSuccessors - Call releaseSucc on each of SU's successors. +void VLIWMachineScheduler::releaseSuccessors(SUnit *SU) { + for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); + I != E; ++I) { + releaseSucc(SU, &*I); + } +} + +/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When +/// NumSuccsLeft reaches zero, release the predecessor node. +/// +/// FIXME: Adjust PredSU height based on MinLatency. +void VLIWMachineScheduler::releasePred(SUnit *SU, SDep *PredEdge) { + SUnit *PredSU = PredEdge->getSUnit(); + +#ifndef NDEBUG + if (PredSU->NumSuccsLeft == 0) { + dbgs() << "*** Scheduling failed! ***\n"; + PredSU->dump(this); + dbgs() << " has been released too many times!\n"; + llvm_unreachable(0); + } +#endif + --PredSU->NumSuccsLeft; + if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) + SchedImpl->releaseBottomNode(PredSU); +} + +/// releasePredecessors - Call releasePred on each of SU's predecessors. +void VLIWMachineScheduler::releasePredecessors(SUnit *SU) { + for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + releasePred(SU, &*I); + } +} + +void VLIWMachineScheduler::moveInstruction(MachineInstr *MI, + MachineBasicBlock::iterator InsertPos) { + // Advance RegionBegin if the first instruction moves down. + if (&*RegionBegin == MI) + ++RegionBegin; + + // Update the instruction stream. 
+ BB->splice(InsertPos, BB, MI); + + // Update LiveIntervals + LIS->handleMove(MI); + + // Recede RegionBegin if an instruction moves above the first. + if (RegionBegin == InsertPos) + RegionBegin = MI; +} + +bool VLIWMachineScheduler::checkSchedLimit() { +#ifndef NDEBUG + if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) { + CurrentTop = CurrentBottom; + return false; + } + ++NumInstrsScheduled; +#endif + return true; +} + +/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after +/// crossing a scheduling boundary. [begin, end) includes all instructions in +/// the region, including the boundary itself and single-instruction regions +/// that don't get scheduled. +void VLIWMachineScheduler::enterRegion(MachineBasicBlock *bb, + MachineBasicBlock::iterator begin, + MachineBasicBlock::iterator end, + unsigned endcount) +{ + ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount); + + // For convenience remember the end of the liveness region. + LiveRegionEnd = + (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd); +} + +// Setup the register pressure trackers for the top scheduled top and bottom +// scheduled regions. +void VLIWMachineScheduler::initRegPressure() { + TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); + BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd); + + // Close the RPTracker to finalize live ins. + RPTracker.closeRegion(); + + DEBUG(RPTracker.getPressure().dump(TRI)); + + // Initialize the live ins and live outs. + TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); + BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); + + // Close one end of the tracker so we can call + // getMaxUpward/DownwardPressureDelta before advancing across any + // instructions. This converts currently live regs into live ins/outs. + TopRPTracker.closeTop(); + BotRPTracker.closeBottom(); + + // Account for liveness generated by the region boundary. + if (LiveRegionEnd != RegionEnd) + BotRPTracker.recede(); + + assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom"); + + // Cache the list of excess pressure sets in this region. This will also track + // the max pressure in the scheduled code for these sets. + RegionCriticalPSets.clear(); + std::vector<unsigned> RegionPressure = RPTracker.getPressure().MaxSetPressure; + for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) { + unsigned Limit = TRI->getRegPressureSetLimit(i); + DEBUG(dbgs() << TRI->getRegPressureSetName(i) + << "Limit " << Limit + << " Actual " << RegionPressure[i] << "\n"); + if (RegionPressure[i] > Limit) + RegionCriticalPSets.push_back(PressureElement(i, 0)); + } + DEBUG(dbgs() << "Excess PSets: "; + for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i) + dbgs() << TRI->getRegPressureSetName( + RegionCriticalPSets[i].PSetID) << " "; + dbgs() << "\n"); + + TotalPackets = 0; +} + +// FIXME: When the pressure tracker deals in pressure differences then we won't +// iterate over all RegionCriticalPSets[i]. +void VLIWMachineScheduler:: +updateScheduledPressure(std::vector<unsigned> NewMaxPressure) { + for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) { + unsigned ID = RegionCriticalPSets[i].PSetID; + int &MaxUnits = RegionCriticalPSets[i].UnitIncrease; + if ((int)NewMaxPressure[ID] > MaxUnits) + MaxUnits = NewMaxPressure[ID]; + } +} + +/// Check if scheduling of this SU is possible +/// in the current packet. +/// It is _not_ precise (statefull), it is more like +/// another heuristic. 
Many corner cases are figured +/// empirically. +bool VLIWResourceModel::isResourceAvailable(SUnit *SU) { + if (!SU || !SU->getInstr()) + return false; + + // First see if the pipeline could receive this instruction + // in the current cycle. + switch (SU->getInstr()->getOpcode()) { + default: + if (!ResourcesModel->canReserveResources(SU->getInstr())) + return false; + case TargetOpcode::EXTRACT_SUBREG: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + case TargetOpcode::REG_SEQUENCE: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::COPY: + case TargetOpcode::INLINEASM: + break; + } + + // Now see if there are no other dependencies to instructions already + // in the packet. + for (unsigned i = 0, e = Packet.size(); i != e; ++i) { + if (Packet[i]->Succs.size() == 0) + continue; + for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(), + E = Packet[i]->Succs.end(); I != E; ++I) { + // Since we do not add pseudos to packets, might as well + // ignore order dependencies. + if (I->isCtrl()) + continue; + + if (I->getSUnit() == SU) + return false; + } + } + return true; +} + +/// Keep track of available resources. +bool VLIWResourceModel::reserveResources(SUnit *SU) { + bool startNewCycle = false; + // If this SU does not fit in the packet + // start a new one. + if (!isResourceAvailable(SU)) { + ResourcesModel->clearResources(); + Packet.clear(); + TotalPackets++; + startNewCycle = true; + } + + switch (SU->getInstr()->getOpcode()) { + default: + ResourcesModel->reserveResources(SU->getInstr()); + break; + case TargetOpcode::EXTRACT_SUBREG: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + case TargetOpcode::REG_SEQUENCE: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + case TargetOpcode::PROLOG_LABEL: + case TargetOpcode::EH_LABEL: + case TargetOpcode::COPY: + case TargetOpcode::INLINEASM: + break; + } + Packet.push_back(SU); + +#ifndef NDEBUG + DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n"); + for (unsigned i = 0, e = Packet.size(); i != e; ++i) { + DEBUG(dbgs() << "\t[" << i << "] SU("); + DEBUG(dbgs() << Packet[i]->NodeNum << ")\t"); + DEBUG(Packet[i]->getInstr()->dump()); + } +#endif + + // If packet is now full, reset the state so in the next cycle + // we start fresh. + if (Packet.size() >= InstrItins->SchedModel->IssueWidth) { + ResourcesModel->clearResources(); + Packet.clear(); + TotalPackets++; + startNewCycle = true; + } + + return startNewCycle; +} + +// Release all DAG roots for scheduling. +void VLIWMachineScheduler::releaseRoots() { + SmallVector<SUnit*, 16> BotRoots; + + for (std::vector<SUnit>::iterator + I = SUnits.begin(), E = SUnits.end(); I != E; ++I) { + // A SUnit is ready to top schedule if it has no predecessors. + if (I->Preds.empty()) + SchedImpl->releaseTopNode(&(*I)); + // A SUnit is ready to bottom schedule if it has no successors. + if (I->Succs.empty()) + BotRoots.push_back(&(*I)); + } + // Release bottom roots in reverse order so the higher priority nodes appear + // first. This is more natural and slightly more efficient. + for (SmallVectorImpl<SUnit*>::const_reverse_iterator + I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) + SchedImpl->releaseBottomNode(*I); +} + +/// schedule - Called back from MachineScheduler::runOnMachineFunction +/// after setting up the current scheduling region. [RegionBegin, RegionEnd) +/// only includes instructions that have DAG nodes, not scheduling boundaries. 
+void VLIWMachineScheduler::schedule() { + DEBUG(dbgs() + << "********** MI Converging Scheduling VLIW BB#" << BB->getNumber() + << " " << BB->getName() + << " in_func " << BB->getParent()->getFunction()->getName() + << " at loop depth " << MLI->getLoopDepth(BB) + << " \n"); + + // Initialize the register pressure tracker used by buildSchedGraph. + RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd); + + // Account for liveness generate by the region boundary. + if (LiveRegionEnd != RegionEnd) + RPTracker.recede(); + + // Build the DAG, and compute current register pressure. + buildSchedGraph(AA, &RPTracker); + + // Initialize top/bottom trackers after computing region pressure. + initRegPressure(); + + // To view Height/Depth correctly, they should be accessed at least once. + DEBUG(unsigned maxH = 0; + for (unsigned su = 0, e = SUnits.size(); su != e; ++su) + if (SUnits[su].getHeight() > maxH) + maxH = SUnits[su].getHeight(); + dbgs() << "Max Height " << maxH << "\n";); + DEBUG(unsigned maxD = 0; + for (unsigned su = 0, e = SUnits.size(); su != e; ++su) + if (SUnits[su].getDepth() > maxD) + maxD = SUnits[su].getDepth(); + dbgs() << "Max Depth " << maxD << "\n";); + DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) + SUnits[su].dumpAll(this)); + + if (ViewMISchedDAGs) viewGraph(); + + SchedImpl->initialize(this); + + // Release edges from the special Entry node or to the special Exit node. + releaseSuccessors(&EntrySU); + releasePredecessors(&ExitSU); + + // Release all DAG roots for scheduling. + releaseRoots(); + + CurrentTop = nextIfDebug(RegionBegin, RegionEnd); + CurrentBottom = RegionEnd; + bool IsTopNode = false; + while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) { + if (!checkSchedLimit()) + break; + + // Move the instruction to its new location in the instruction stream. + MachineInstr *MI = SU->getInstr(); + + if (IsTopNode) { + assert(SU->isTopReady() && "node still has unscheduled dependencies"); + if (&*CurrentTop == MI) + CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom); + else { + moveInstruction(MI, CurrentTop); + TopRPTracker.setPos(MI); + } + + // Update top scheduled pressure. + TopRPTracker.advance(); + assert(TopRPTracker.getPos() == CurrentTop && "out of sync"); + updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure); + + // Release dependent instructions for scheduling. + releaseSuccessors(SU); + } else { + assert(SU->isBottomReady() && "node still has unscheduled dependencies"); + MachineBasicBlock::iterator priorII = + priorNonDebug(CurrentBottom, CurrentTop); + if (&*priorII == MI) + CurrentBottom = priorII; + else { + if (&*CurrentTop == MI) { + CurrentTop = nextIfDebug(++CurrentTop, priorII); + TopRPTracker.setPos(CurrentTop); + } + moveInstruction(MI, CurrentBottom); + CurrentBottom = MI; + } + // Update bottom scheduled pressure. + BotRPTracker.recede(); + assert(BotRPTracker.getPos() == CurrentBottom && "out of sync"); + updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure); + + // Release dependent instructions for scheduling. + releasePredecessors(SU); + } + SU->isScheduled = true; + SchedImpl->schedNode(SU, IsTopNode); + } + assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); + + placeDebugValues(); +} + +/// Reinsert any remaining debug_values, just like the PostRA scheduler. +void VLIWMachineScheduler::placeDebugValues() { + // If first instruction was a DBG_VALUE then put it back. 
+ if (FirstDbgValue) { + BB->splice(RegionBegin, BB, FirstDbgValue); + RegionBegin = FirstDbgValue; + } + + for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator + DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) { + std::pair<MachineInstr *, MachineInstr *> P = *prior(DI); + MachineInstr *DbgValue = P.first; + MachineBasicBlock::iterator OrigPrevMI = P.second; + BB->splice(++OrigPrevMI, BB, DbgValue); + if (OrigPrevMI == llvm::prior(RegionEnd)) + RegionEnd = DbgValue; + } + DbgValues.clear(); + FirstDbgValue = NULL; +} + +void ConvergingVLIWScheduler::initialize(VLIWMachineScheduler *dag) { + DAG = dag; + TRI = DAG->TRI; + Top.DAG = dag; + Bot.DAG = dag; + + // Initialize the HazardRecognizers. + const TargetMachine &TM = DAG->MF.getTarget(); + const InstrItineraryData *Itin = TM.getInstrItineraryData(); + Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); + Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); + + Top.ResourceModel = new VLIWResourceModel(TM); + Bot.ResourceModel = new VLIWResourceModel(TM); + + assert((!ForceTopDown || !ForceBottomUp) && + "-misched-topdown incompatible with -misched-bottomup"); +} + +void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) { + if (SU->isScheduled) + return; + + for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle; + unsigned MinLatency = I->getMinLatency(); +#ifndef NDEBUG + Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency); +#endif + if (SU->TopReadyCycle < PredReadyCycle + MinLatency) + SU->TopReadyCycle = PredReadyCycle + MinLatency; + } + Top.releaseNode(SU, SU->TopReadyCycle); +} + +void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) { + if (SU->isScheduled) + return; + + assert(SU->getInstr() && "Scheduled SUnit must have instr"); + + for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); + I != E; ++I) { + unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; + unsigned MinLatency = I->getMinLatency(); +#ifndef NDEBUG + Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency); +#endif + if (SU->BotReadyCycle < SuccReadyCycle + MinLatency) + SU->BotReadyCycle = SuccReadyCycle + MinLatency; + } + Bot.releaseNode(SU, SU->BotReadyCycle); +} + +/// Does this SU have a hazard within the current instruction group. +/// +/// The scheduler supports two modes of hazard recognition. The first is the +/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that +/// supports highly complicated in-order reservation tables +/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic. +/// +/// The second is a streamlined mechanism that checks for hazards based on +/// simple counters that the scheduler itself maintains. It explicitly checks +/// for instruction dispatch limitations, including the number of micro-ops that +/// can dispatch per cycle. +/// +/// TODO: Also check whether the SU must start a new group. +bool ConvergingVLIWScheduler::SchedBoundary::checkHazard(SUnit *SU) { + if (HazardRec->isEnabled()) + return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard; + + if (IssueCount + DAG->getNumMicroOps(SU->getInstr()) > DAG->getIssueWidth()) + return true; + + return false; +} + +void ConvergingVLIWScheduler::SchedBoundary::releaseNode(SUnit *SU, + unsigned ReadyCycle) { + if (ReadyCycle < MinReadyCycle) + MinReadyCycle = ReadyCycle; + + // Check for interlocks first. 
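For illustration only (not part of the patch): the streamlined, counter-based mode of checkHazard() above reduces to one comparison — an instruction is a hazard for the current cycle if issuing it would push the micro-op count past the issue width, in which case releaseNode() parks it in Pending and releasePending() re-examines it after the cycle advances. A standalone sketch of the same arithmetic; the helper name and sample numbers are invented, and the real code queries the DAG for micro-op counts and issue width:

  #include <cassert>

  static bool exceedsIssueWidth(unsigned IssueCount, unsigned NumMicroOps,
                                unsigned IssueWidth) {
    return IssueCount + NumMicroOps > IssueWidth;
  }

  int main() {
    // Issue width 4 with 3 micro-ops already issued this cycle: a 2-micro-op
    // instruction must wait for the next cycle, while a 1-micro-op one still fits.
    assert(exceedsIssueWidth(3, 2, 4));
    assert(!exceedsIssueWidth(3, 1, 4));
    return 0;
  }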
For the purpose of other heuristics, an + // instruction that cannot issue appears as if it's not in the ReadyQueue. + if (ReadyCycle > CurrCycle || checkHazard(SU)) + + Pending.push(SU); + else + Available.push(SU); +} + +/// Move the boundary of scheduled code by one cycle. +void ConvergingVLIWScheduler::SchedBoundary::bumpCycle() { + unsigned Width = DAG->getIssueWidth(); + IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width; + + assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); + unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle); + + if (!HazardRec->isEnabled()) { + // Bypass HazardRec virtual calls. + CurrCycle = NextCycle; + } else { + // Bypass getHazardType calls in case of long latency. + for (; CurrCycle != NextCycle; ++CurrCycle) { + if (isTop()) + HazardRec->AdvanceCycle(); + else + HazardRec->RecedeCycle(); + } + } + CheckPending = true; + + DEBUG(dbgs() << "*** " << Available.getName() << " cycle " + << CurrCycle << '\n'); +} + +/// Move the boundary of scheduled code by one SUnit. +void ConvergingVLIWScheduler::SchedBoundary::bumpNode(SUnit *SU) { + bool startNewCycle = false; + + // Update the reservation table. + if (HazardRec->isEnabled()) { + if (!isTop() && SU->isCall) { + // Calls are scheduled with their preceding instructions. For bottom-up + // scheduling, clear the pipeline state before emitting. + HazardRec->Reset(); + } + HazardRec->EmitInstruction(SU); + } + + // Update DFA model. + startNewCycle = ResourceModel->reserveResources(SU); + + // Check the instruction group dispatch limit. + // TODO: Check if this SU must end a dispatch group. + IssueCount += DAG->getNumMicroOps(SU->getInstr()); + if (startNewCycle) { + DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n'); + bumpCycle(); + } + else + DEBUG(dbgs() << "*** IssueCount " << IssueCount + << " at cycle " << CurrCycle << '\n'); +} + +/// Release pending ready nodes in to the available queue. This makes them +/// visible to heuristics. +void ConvergingVLIWScheduler::SchedBoundary::releasePending() { + // If the available queue is empty, it is safe to reset MinReadyCycle. + if (Available.empty()) + MinReadyCycle = UINT_MAX; + + // Check to see if any of the pending instructions are ready to issue. If + // so, add them to the available queue. + for (unsigned i = 0, e = Pending.size(); i != e; ++i) { + SUnit *SU = *(Pending.begin()+i); + unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; + + if (ReadyCycle < MinReadyCycle) + MinReadyCycle = ReadyCycle; + + if (ReadyCycle > CurrCycle) + continue; + + if (checkHazard(SU)) + continue; + + Available.push(SU); + Pending.remove(Pending.begin()+i); + --i; --e; + } + CheckPending = false; +} + +/// Remove SU from the ready set for this boundary. +void ConvergingVLIWScheduler::SchedBoundary::removeReady(SUnit *SU) { + if (Available.isInQueue(SU)) + Available.remove(Available.find(SU)); + else { + assert(Pending.isInQueue(SU) && "bad ready count"); + Pending.remove(Pending.find(SU)); + } +} + +/// If this queue only has one ready candidate, return it. As a side effect, +/// advance the cycle until at least one node is ready. If multiple instructions +/// are ready, return NULL. 
+SUnit *ConvergingVLIWScheduler::SchedBoundary::pickOnlyChoice() { + if (CheckPending) + releasePending(); + + for (unsigned i = 0; Available.empty(); ++i) { + assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) && + "permanent hazard"); (void)i; + bumpCycle(); + releasePending(); + } + if (Available.size() == 1) + return *Available.begin(); + return NULL; +} + +#ifndef NDEBUG +void ConvergingVLIWScheduler::traceCandidate(const char *Label, + const ReadyQueue &Q, + SUnit *SU, PressureElement P) { + dbgs() << Label << " " << Q.getName() << " "; + if (P.isValid()) + dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease + << " "; + else + dbgs() << " "; + SU->dump(DAG); +} +#endif + +/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor +/// of SU, return it, otherwise return null. +static SUnit *getSingleUnscheduledPred(SUnit *SU) { + SUnit *OnlyAvailablePred = 0; + for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + SUnit &Pred = *I->getSUnit(); + if (!Pred.isScheduled) { + // We found an available, but not scheduled, predecessor. If it's the + // only one we have found, keep track of it... otherwise give up. + if (OnlyAvailablePred && OnlyAvailablePred != &Pred) + return 0; + OnlyAvailablePred = &Pred; + } + } + return OnlyAvailablePred; +} + +/// getSingleUnscheduledSucc - If there is exactly one unscheduled successor +/// of SU, return it, otherwise return null. +static SUnit *getSingleUnscheduledSucc(SUnit *SU) { + SUnit *OnlyAvailableSucc = 0; + for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); + I != E; ++I) { + SUnit &Succ = *I->getSUnit(); + if (!Succ.isScheduled) { + // We found an available, but not scheduled, successor. If it's the + // only one we have found, keep track of it... otherwise give up. + if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ) + return 0; + OnlyAvailableSucc = &Succ; + } + } + return OnlyAvailableSucc; +} + +// Constants used to denote relative importance of +// heuristic components for cost computation. +static const unsigned PriorityOne = 200; +static const unsigned PriorityTwo = 100; +static const unsigned PriorityThree = 50; +static const unsigned PriorityFour = 20; +static const unsigned ScaleTwo = 10; +static const unsigned FactorOne = 2; + +/// Single point to compute overall scheduling cost. +/// TODO: More heuristics will be used soon. +int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU, + SchedCandidate &Candidate, + RegPressureDelta &Delta, + bool verbose) { + // Initial trivial priority. + int ResCount = 1; + + // Do not waste time on a node that is already scheduled. + if (!SU || SU->isScheduled) + return ResCount; + + // Forced priority is high. + if (SU->isScheduleHigh) + ResCount += PriorityOne; + + // Critical path first. + if (Q.getID() == TopQID) { + ResCount += (SU->getHeight() * ScaleTwo); + + // If resources are available for it, multiply the + // chance of scheduling. + if (Top.ResourceModel->isResourceAvailable(SU)) + ResCount <<= FactorOne; + } else { + ResCount += (SU->getDepth() * ScaleTwo); + + // If resources are available for it, multiply the + // chance of scheduling. + if (Bot.ResourceModel->isResourceAvailable(SU)) + ResCount <<= FactorOne; + } + + unsigned NumNodesBlocking = 0; + if (Q.getID() == TopQID) { + // How many SUs does it block from scheduling? + // Look at all of the successors of this node. + // Count the number of nodes that + // this node is the sole unscheduled node for. 
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); + I != E; ++I) + if (getSingleUnscheduledPred(I->getSUnit()) == SU) + ++NumNodesBlocking; + } else { + // How many unscheduled predecessors block this node? + for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) + if (getSingleUnscheduledSucc(I->getSUnit()) == SU) + ++NumNodesBlocking; + } + ResCount += (NumNodesBlocking * ScaleTwo); + + // Factor in reg pressure as a heuristic. + ResCount -= (Delta.Excess.UnitIncrease*PriorityThree); + ResCount -= (Delta.CriticalMax.UnitIncrease*PriorityThree); + + DEBUG(if (verbose) dbgs() << " Total(" << ResCount << ")"); + + return ResCount; +} + +/// Pick the best candidate from the top queue. +/// +/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during +/// DAG building. To adjust for the current scheduling location we need to +/// maintain the number of vreg uses remaining to be top-scheduled. +ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler:: +pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker, + SchedCandidate &Candidate) { + DEBUG(Q.dump()); + + // getMaxPressureDelta temporarily modifies the tracker. + RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); + + // BestSU remains NULL if no top candidates beat the best existing candidate. + CandResult FoundCandidate = NoCand; + for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { + RegPressureDelta RPDelta; + TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, + DAG->getRegionCriticalPSets(), + DAG->getRegPressure().MaxSetPressure); + + int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false); + + // Initialize the candidate if needed. + if (!Candidate.SU) { + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = NodeOrder; + continue; + } + + // Best cost. + if (CurrentCost > Candidate.SCost) { + DEBUG(traceCandidate("CCAND", Q, *I)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = BestCost; + continue; + } + + // Fall through to original instruction order. + // Only consider node order if Candidate was chosen from this Q. + if (FoundCandidate == NoCand) + continue; + } + return FoundCandidate; +} + +/// Pick the best candidate node from either the top or bottom queue. +SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) { + // Schedule as far as possible in the direction of no choice. This is most + // efficient, but also provides the best heuristics for CriticalPSets. + if (SUnit *SU = Bot.pickOnlyChoice()) { + IsTopNode = false; + return SU; + } + if (SUnit *SU = Top.pickOnlyChoice()) { + IsTopNode = true; + return SU; + } + SchedCandidate BotCand; + // Prefer bottom scheduling when heuristics are silent. + CandResult BotResult = pickNodeFromQueue(Bot.Available, + DAG->getBotRPTracker(), BotCand); + assert(BotResult != NoCand && "failed to find the first candidate"); + + // If either Q has a single candidate that provides the least increase in + // Excess pressure, we can immediately schedule from that Q. + // + // RegionCriticalPSets summarizes the pressure within the scheduled region and + // affects picking from either Q. If scheduling in one direction must + // increase pressure for one of the excess PSets, then schedule in that + // direction first to provide more freedom in the other direction. 
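For illustration only (not part of the patch), a worked example of the SchedulingCost() formula above with invented numbers: take a top-queue candidate that is not marked schedule-high, with height 5, its resources available, blocking 2 other nodes, an Excess pressure increase of 1 and no CriticalMax increase. Using the constants defined earlier (ScaleTwo = 10, FactorOne = 2, PriorityThree = 50):

  ResCount  = 1 + 5*10   = 51    (critical path: height * ScaleTwo)
  ResCount <<= 2         -> 204  (resources available: shift by FactorOne)
  ResCount += 2*10       -> 224  (successors this SU alone unblocks)
  ResCount -= 1*50       -> 174  (Excess pressure penalty)

The candidate with the larger ResCount is then preferred by pickNodeFromQueue().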
+ if (BotResult == SingleExcess || BotResult == SingleCritical) { + IsTopNode = false; + return BotCand.SU; + } + // Check if the top Q has a better candidate. + SchedCandidate TopCand; + CandResult TopResult = pickNodeFromQueue(Top.Available, + DAG->getTopRPTracker(), TopCand); + assert(TopResult != NoCand && "failed to find the first candidate"); + + if (TopResult == SingleExcess || TopResult == SingleCritical) { + IsTopNode = true; + return TopCand.SU; + } + // If either Q has a single candidate that minimizes pressure above the + // original region's pressure pick it. + if (BotResult == SingleMax) { + IsTopNode = false; + return BotCand.SU; + } + if (TopResult == SingleMax) { + IsTopNode = true; + return TopCand.SU; + } + if (TopCand.SCost > BotCand.SCost) { + IsTopNode = true; + return TopCand.SU; + } + // Otherwise prefer the bottom candidate in node order. + IsTopNode = false; + return BotCand.SU; +} + +/// Pick the best node to balance the schedule. Implements MachineSchedStrategy. +SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) { + if (DAG->top() == DAG->bottom()) { + assert(Top.Available.empty() && Top.Pending.empty() && + Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); + return NULL; + } + SUnit *SU; + if (ForceTopDown) { + SU = Top.pickOnlyChoice(); + if (!SU) { + SchedCandidate TopCand; + CandResult TopResult = + pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand); + assert(TopResult != NoCand && "failed to find the first candidate"); + (void)TopResult; + SU = TopCand.SU; + } + IsTopNode = true; + } else if (ForceBottomUp) { + SU = Bot.pickOnlyChoice(); + if (!SU) { + SchedCandidate BotCand; + CandResult BotResult = + pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand); + assert(BotResult != NoCand && "failed to find the first candidate"); + (void)BotResult; + SU = BotCand.SU; + } + IsTopNode = false; + } else { + SU = pickNodeBidrectional(IsTopNode); + } + if (SU->isTopReady()) + Top.removeReady(SU); + if (SU->isBottomReady()) + Bot.removeReady(SU); + + DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom") + << " Scheduling Instruction in cycle " + << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n'; + SU->dump(DAG)); + return SU; +} + +/// Update the scheduler's state after scheduling a node. This is the same node +/// that was just returned by pickNode(). However, VLIWMachineScheduler needs +/// to update it's state based on the current cycle before MachineSchedStrategy +/// does. +void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) { + if (IsTopNode) { + SU->TopReadyCycle = Top.CurrCycle; + Top.bumpNode(SU); + } else { + SU->BotReadyCycle = Bot.CurrCycle; + Bot.bumpNode(SU); + } +} + diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h new file mode 100644 index 0000000..f3643d6 --- /dev/null +++ b/lib/Target/Hexagon/HexagonMachineScheduler.h @@ -0,0 +1,437 @@ +//===-- HexagonMachineScheduler.h - Custom Hexagon MI scheduler. ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Custom Hexagon MI scheduler. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMACHINESCHEDULER_H
+#define HEXAGONMACHINESCHEDULER_H
+
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/PriorityQueue.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineSchedStrategy - Interface to a machine scheduling algorithm.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class VLIWMachineScheduler;
+
+/// MachineSchedStrategy - Interface used by VLIWMachineScheduler to drive
+/// the selected scheduling algorithm.
+///
+/// TODO: Move this to ScheduleDAGInstrs.h
+class MachineSchedStrategy {
+public:
+  virtual ~MachineSchedStrategy() {}
+
+  /// Initialize the strategy after building the DAG for a new region.
+  virtual void initialize(VLIWMachineScheduler *DAG) = 0;
+
+  /// Pick the next node to schedule, or return NULL. Set IsTopNode to true to
+  /// schedule the node at the top of the unscheduled region. Otherwise it will
+  /// be scheduled at the bottom.
+  virtual SUnit *pickNode(bool &IsTopNode) = 0;
+
+  /// Notify MachineSchedStrategy that VLIWMachineScheduler has
+  /// scheduled a node.
+  virtual void schedNode(SUnit *SU, bool IsTopNode) = 0;
+
+  /// When all predecessor dependencies have been resolved, free this node for
+  /// top-down scheduling.
+  virtual void releaseTopNode(SUnit *SU) = 0;
+  /// When all successor dependencies have been resolved, free this node for
+  /// bottom-up scheduling.
+  virtual void releaseBottomNode(SUnit *SU) = 0;
+};
+
+//===----------------------------------------------------------------------===//
+// ConvergingVLIWScheduler - Implementation of the standard
+// MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+/// ReadyQueue encapsulates a vector of "ready" SUnits with basic convenience
+/// methods for pushing and removing nodes. ReadyQueues are uniquely identified
+/// by an ID. SUnit::NodeQueueId is a mask of the ReadyQueues the SUnit is in.
+class ReadyQueue {
+  unsigned ID;
+  std::string Name;
+  std::vector<SUnit*> Queue;
+
+public:
+  ReadyQueue(unsigned id, const Twine &name): ID(id), Name(name.str()) {}
+
+  unsigned getID() const { return ID; }
+
+  StringRef getName() const { return Name; }
+
+  // SU is in this queue if its NodeQueueId is a superset of this ID.
+  bool isInQueue(SUnit *SU) const { return (SU->NodeQueueId & ID); }
+
+  bool empty() const { return Queue.empty(); }
+
+  unsigned size() const { return Queue.size(); }
+
+  typedef std::vector<SUnit*>::iterator iterator;
+
+  iterator begin() { return Queue.begin(); }
+
+  iterator end() { return Queue.end(); }
+
+  iterator find(SUnit *SU) {
+    return std::find(Queue.begin(), Queue.end(), SU);
+  }
+
+  void push(SUnit *SU) {
+    Queue.push_back(SU);
+    SU->NodeQueueId |= ID;
+  }
+
+  void remove(iterator I) {
+    (*I)->NodeQueueId &= ~ID;
+    *I = Queue.back();
+    Queue.pop_back();
+  }
+
+  void dump() {
+    dbgs() << Name << ": ";
+    for (unsigned i = 0, e = Queue.size(); i < e; ++i)
+      dbgs() << Queue[i]->NodeNum << " ";
+    dbgs() << "\n";
+  }
+};
+
+class VLIWResourceModel {
+  /// ResourcesModel - Represents VLIW state.
+  /// Not limited to VLIW targets per se, but assumes
+  /// definition of DFA by a target.
+  DFAPacketizer *ResourcesModel;
+
+  const InstrItineraryData *InstrItins;
+
+  /// Local packet/bundle model. Purely
+  /// internal to the MI scheduler at the time.
+  std::vector<SUnit*> Packet;
+
+  /// Total packets created.
+  unsigned TotalPackets;
+
+public:
+  VLIWResourceModel(MachineSchedContext *C, const InstrItineraryData *IID) :
+    InstrItins(IID), TotalPackets(0) {
+    const TargetMachine &TM = C->MF->getTarget();
+    ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM,NULL);
+
+    // This hard requirement could be relaxed,
+    // but for now do not let it proceed.
+    assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+    Packet.resize(InstrItins->SchedModel->IssueWidth);
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  VLIWResourceModel(const TargetMachine &TM) :
+    InstrItins(TM.getInstrItineraryData()), TotalPackets(0) {
+    ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM,NULL);
+
+    // This hard requirement could be relaxed,
+    // but for now do not let it proceed.
+    assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+    Packet.resize(InstrItins->SchedModel->IssueWidth);
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  ~VLIWResourceModel() {
+    delete ResourcesModel;
+  }
+
+  void resetPacketState() {
+    Packet.clear();
+  }
+
+  void resetDFA() {
+    ResourcesModel->clearResources();
+  }
+
+  void reset() {
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  bool isResourceAvailable(SUnit *SU);
+  bool reserveResources(SUnit *SU);
+  unsigned getTotalPackets() const { return TotalPackets; }
+};
+
+class VLIWMachineScheduler : public ScheduleDAGInstrs {
+  /// AA - AliasAnalysis for making memory reference queries.
+  AliasAnalysis *AA;
+
+  RegisterClassInfo *RegClassInfo;
+  MachineSchedStrategy *SchedImpl;
+
+  MachineBasicBlock::iterator LiveRegionEnd;
+
+  /// Register pressure in this region computed by buildSchedGraph.
+  IntervalPressure RegPressure;
+  RegPressureTracker RPTracker;
+
+  /// List of pressure sets that exceed the target's pressure limit before
+  /// scheduling, listed in increasing set ID order. Each pressure set is paired
+  /// with its max pressure in the currently scheduled regions.
+  std::vector<PressureElement> RegionCriticalPSets;
+
+  /// The top of the unscheduled zone.
+  MachineBasicBlock::iterator CurrentTop;
+  IntervalPressure TopPressure;
+  RegPressureTracker TopRPTracker;
+
+  /// The bottom of the unscheduled zone.
+ MachineBasicBlock::iterator CurrentBottom; + IntervalPressure BotPressure; + RegPressureTracker BotRPTracker; + +#ifndef NDEBUG + /// The number of instructions scheduled so far. Used to cut off the + /// scheduler at the point determined by misched-cutoff. + unsigned NumInstrsScheduled; +#endif + + /// Total packets in the region. + unsigned TotalPackets; + + const MachineLoopInfo *MLI; +public: + VLIWMachineScheduler(MachineSchedContext *C, MachineSchedStrategy *S): + ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS), + AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S), + RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure), + CurrentBottom(), BotRPTracker(BotPressure), MLI(C->MLI) { +#ifndef NDEBUG + NumInstrsScheduled = 0; +#endif + TotalPackets = 0; + } + + virtual ~VLIWMachineScheduler() { + delete SchedImpl; + } + + MachineBasicBlock::iterator top() const { return CurrentTop; } + MachineBasicBlock::iterator bottom() const { return CurrentBottom; } + + /// Implement the ScheduleDAGInstrs interface for handling the next scheduling + /// region. This covers all instructions in a block, while schedule() may only + /// cover a subset. + void enterRegion(MachineBasicBlock *bb, + MachineBasicBlock::iterator begin, + MachineBasicBlock::iterator end, + unsigned endcount); + + /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's + /// time to do some work. + void schedule(); + + unsigned CurCycle; + + /// Get current register pressure for the top scheduled instructions. + const IntervalPressure &getTopPressure() const { return TopPressure; } + const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; } + + /// Get current register pressure for the bottom scheduled instructions. + const IntervalPressure &getBotPressure() const { return BotPressure; } + const RegPressureTracker &getBotRPTracker() const { return BotRPTracker; } + + /// Get register pressure for the entire scheduling region before scheduling. + const IntervalPressure &getRegPressure() const { return RegPressure; } + + const std::vector<PressureElement> &getRegionCriticalPSets() const { + return RegionCriticalPSets; + } + + /// getIssueWidth - Return the max instructions per scheduling group. + unsigned getIssueWidth() const { + return (InstrItins && InstrItins->SchedModel) + ? InstrItins->SchedModel->IssueWidth : 1; + } + + /// getNumMicroOps - Return the number of issue slots required for this MI. + unsigned getNumMicroOps(MachineInstr *MI) const { + return 1; + //if (!InstrItins) return 1; + //int UOps = InstrItins->getNumMicroOps(MI->getDesc().getSchedClass()); + //return (UOps >= 0) ? UOps : TII->getNumMicroOps(InstrItins, MI); + } + +private: + void scheduleNodeTopDown(SUnit *SU); + void listScheduleTopDown(); + + void initRegPressure(); + void updateScheduledPressure(std::vector<unsigned> NewMaxPressure); + + void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos); + bool checkSchedLimit(); + + void releaseRoots(); + + void releaseSucc(SUnit *SU, SDep *SuccEdge); + void releaseSuccessors(SUnit *SU); + void releasePred(SUnit *SU, SDep *PredEdge); + void releasePredecessors(SUnit *SU); + + void placeDebugValues(); +}; + +/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics +/// to balance the schedule. +class ConvergingVLIWScheduler : public MachineSchedStrategy { + + /// Store the state used by ConvergingVLIWScheduler heuristics, required + /// for the lifetime of one invocation of pickNode(). 
+  struct SchedCandidate {
+    // The best SUnit candidate.
+    SUnit *SU;
+
+    // Register pressure values for the best candidate.
+    RegPressureDelta RPDelta;
+
+    // Best scheduling cost.
+    int SCost;
+
+    SchedCandidate(): SU(NULL), SCost(0) {}
+  };
+  /// Represent the type of SchedCandidate found within a single queue.
+  enum CandResult {
+    NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure,
+    BestCost};
+
+  /// Each scheduling boundary is associated with ready queues. It tracks the
+  /// current cycle in whichever direction it has moved, and maintains the state
+  /// of "hazards" and other interlocks at the current cycle.
+  struct SchedBoundary {
+    VLIWMachineScheduler *DAG;
+
+    ReadyQueue Available;
+    ReadyQueue Pending;
+    bool CheckPending;
+
+    ScheduleHazardRecognizer *HazardRec;
+    VLIWResourceModel *ResourceModel;
+
+    unsigned CurrCycle;
+    unsigned IssueCount;
+
+    /// MinReadyCycle - Cycle of the soonest available instruction.
+    unsigned MinReadyCycle;
+
+    // Remember the greatest min operand latency.
+    unsigned MaxMinLatency;
+
+    /// Pending queues extend the ready queues with the same ID and the
+    /// PendingFlag set.
+    SchedBoundary(unsigned ID, const Twine &Name):
+      DAG(0), Available(ID, Name+".A"),
+      Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P"),
+      CheckPending(false), HazardRec(0), ResourceModel(0),
+      CurrCycle(0), IssueCount(0),
+      MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+
+    ~SchedBoundary() {
+      delete ResourceModel;
+      delete HazardRec;
+    }
+
+    bool isTop() const {
+      return Available.getID() == ConvergingVLIWScheduler::TopQID;
+    }
+
+    bool checkHazard(SUnit *SU);
+
+    void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+    void bumpCycle();
+
+    void bumpNode(SUnit *SU);
+
+    void releasePending();
+
+    void removeReady(SUnit *SU);
+
+    SUnit *pickOnlyChoice();
+  };
+
+  VLIWMachineScheduler *DAG;
+  const TargetRegisterInfo *TRI;
+
+  // State of the top and bottom scheduled instruction boundaries.
+ SchedBoundary Top; + SchedBoundary Bot; + +public: + /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both) + enum { + TopQID = 1, + BotQID = 2, + LogMaxQID = 2 + }; + + ConvergingVLIWScheduler(): + DAG(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {} + + virtual void initialize(VLIWMachineScheduler *dag); + + virtual SUnit *pickNode(bool &IsTopNode); + + virtual void schedNode(SUnit *SU, bool IsTopNode); + + virtual void releaseTopNode(SUnit *SU); + + virtual void releaseBottomNode(SUnit *SU); + +protected: + SUnit *pickNodeBidrectional(bool &IsTopNode); + + int SchedulingCost(ReadyQueue &Q, + SUnit *SU, SchedCandidate &Candidate, + RegPressureDelta &Delta, bool verbose); + + CandResult pickNodeFromQueue(ReadyQueue &Q, + const RegPressureTracker &RPTracker, + SchedCandidate &Candidate); +#ifndef NDEBUG + void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU, + PressureElement P = PressureElement()); +#endif +}; + +} // namespace + + +#endif diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index 7ece408..1e91c39 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -337,7 +337,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n" << "********** Function: " - << MF.getFunction()->getName() << "\n"); + << MF.getName() << "\n"); #if 0 // for now disable this, if we move NewValueJump before register diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp index 55cbc09..a295015 100644 --- a/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/lib/Target/Hexagon/HexagonPeephole.cpp @@ -109,6 +109,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); DenseMap<unsigned, unsigned> PeepholeMap; + DenseMap<unsigned, std::pair<unsigned, unsigned> > PeepholeDoubleRegsMap; if (DisableHexagonPeephole) return false; @@ -117,6 +118,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { MBBb != MBBe; ++MBBb) { MachineBasicBlock* MBB = MBBb; PeepholeMap.clear(); + PeepholeDoubleRegsMap.clear(); // Traverse the basic block. for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); @@ -140,6 +142,24 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { } } + // Look for this sequence below + // %vregDoubleReg1 = LSRd_ri %vregDoubleReg0, 32 + // %vregIntReg = COPY %vregDoubleReg1:subreg_loreg. + // and convert into + // %vregIntReg = COPY %vregDoubleReg0:subreg_hireg. + if (MI->getOpcode() == Hexagon::LSRd_ri) { + assert(MI->getNumOperands() == 3); + MachineOperand &Dst = MI->getOperand(0); + MachineOperand &Src1 = MI->getOperand(1); + MachineOperand &Src2 = MI->getOperand(2); + if (Src2.getImm() != 32) + continue; + unsigned DstReg = Dst.getReg(); + unsigned SrcReg = Src1.getReg(); + PeepholeDoubleRegsMap[DstReg] = + std::make_pair(*&SrcReg, 1/*Hexagon::subreg_hireg*/); + } + // Look for P=NOT(P). if (!DisablePNotP && (MI->getOpcode() == Hexagon::NOT_p)) { @@ -178,6 +198,21 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { // Change the 1st operand. 
MI->RemoveOperand(1); MI->addOperand(MachineOperand::CreateReg(PeepholeSrc, false)); + } else { + DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI = + PeepholeDoubleRegsMap.find(SrcReg); + if (DI != PeepholeDoubleRegsMap.end()) { + std::pair<unsigned,unsigned> PeepholeSrc = DI->second; + MI->RemoveOperand(1); + MI->addOperand(MachineOperand::CreateReg(PeepholeSrc.first, + false /*isDef*/, + false /*isImp*/, + false /*isKill*/, + false /*isDead*/, + false /*isUndef*/, + false /*isEarlyClobber*/, + PeepholeSrc.second)); + } } } } diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 2c23674..3742486 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -310,6 +310,58 @@ void HexagonRegisterInfo::getInitialFrameState(std::vector<MachineMove> Moves.push_back(MachineMove(0, Dst, Src)); } +// Get the weight in units of pressure for this register class. +const RegClassWeight & +HexagonRegisterInfo::getRegClassWeight(const TargetRegisterClass *RC) const { + // Each TargetRegisterClass has a per register weight, and weight + // limit which must be less than the limits of its pressure sets. + static const RegClassWeight RCWeightTable[] = { + {1, 32}, // IntRegs + {1, 8}, // CRRegs + {1, 4}, // PredRegs + {2, 16}, // DoubleRegs + {0, 0} }; + return RCWeightTable[RC->getID()]; +} + +/// Get the number of dimensions of register pressure. +unsigned HexagonRegisterInfo::getNumRegPressureSets() const { + return 4; +} + +/// Get the name of this register unit pressure set. +const char *HexagonRegisterInfo::getRegPressureSetName(unsigned Idx) const { + static const char *const RegPressureSetName[] = { + "IntRegsRegSet", + "CRRegsRegSet", + "PredRegsRegSet", + "DoubleRegsRegSet" + }; + assert((Idx < 4) && "Index out of bounds"); + return RegPressureSetName[Idx]; +} + +/// Get the register unit pressure limit for this dimension. +/// This limit must be adjusted dynamically for reserved registers. +unsigned HexagonRegisterInfo::getRegPressureSetLimit(unsigned Idx) const { + static const int RegPressureLimit [] = { 16, 4, 2, 8 }; + assert((Idx < 4) && "Index out of bounds"); + return RegPressureLimit[Idx]; +} + +const int* +HexagonRegisterInfo::getRegClassPressureSets(const TargetRegisterClass *RC) + const { + static const int RCSetsTable[] = { + 0, -1, // IntRegs + 1, -1, // CRRegs + 2, -1, // PredRegs + 0, -1, // DoubleRegs + -1 }; + static const unsigned RCSetStartTable[] = { 0, 2, 4, 6, 0 }; + unsigned SetListStart = RCSetStartTable[RC->getID()]; + return &RCSetsTable[SetListStart]; +} unsigned HexagonRegisterInfo::getEHExceptionRegister() const { llvm_unreachable("What is the exception register"); } diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h index 85355ae..8820d13 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -87,6 +87,11 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo { // Exception handling queries. 
unsigned getEHExceptionRegister() const; unsigned getEHHandlerRegister() const; + const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const; + unsigned getNumRegPressureSets() const; + const char *getRegPressureSetName(unsigned Idx) const; + unsigned getRegPressureSetLimit(unsigned Idx) const; + const int* getRegClassPressureSets(const TargetRegisterClass *RC) const; }; } // end namespace llvm diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td index d1076b8..b5ff69a 100644 --- a/lib/Target/Hexagon/HexagonSchedule.td +++ b/lib/Target/Hexagon/HexagonSchedule.td @@ -47,6 +47,7 @@ def HexagonModel : SchedMachineModel { // Max issue per cycle == bundle width. let IssueWidth = 4; let Itineraries = HexagonItineraries; + let LoadLatency = 1; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td index 9b41126..5668ae8 100644 --- a/lib/Target/Hexagon/HexagonScheduleV4.td +++ b/lib/Target/Hexagon/HexagonScheduleV4.td @@ -58,6 +58,7 @@ def HexagonModelV4 : SchedMachineModel { // Max issue per cycle == bundle width. let IssueWidth = 4; let Itineraries = HexagonItinerariesV4; + let LoadLatency = 1; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index a7b291f..5688e9c 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -14,6 +14,7 @@ #include "HexagonTargetMachine.h" #include "Hexagon.h" #include "HexagonISelLowering.h" +#include "HexagonMachineScheduler.h" #include "llvm/Module.h" #include "llvm/CodeGen/Passes.h" #include "llvm/PassManager.h" @@ -29,6 +30,11 @@ opt<bool> DisableHardwareLoops( "disable-hexagon-hwloops", cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target")); +static cl:: +opt<bool> DisableHexagonMISched("disable-hexagon-misched", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Disable Hexagon MI Scheduling")); + /// HexagonTargetMachineModule - Note that this is used on hosts that /// cannot link in a library unless there are references into the /// library. In particular, it seems that it is not possible to get @@ -42,6 +48,13 @@ extern "C" void LLVMInitializeHexagonTarget() { RegisterTargetMachine<HexagonTargetMachine> X(TheHexagonTarget); } +static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) { + return new VLIWMachineScheduler(C, new ConvergingVLIWScheduler()); +} + +static MachineSchedRegistry +SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", + createVLIWMachineSched); /// HexagonTargetMachine ctor - Create an ILP32 architecture model. /// @@ -83,7 +96,13 @@ namespace { class HexagonPassConfig : public TargetPassConfig { public: HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + // Enable MI scheduler. 
+ if (!DisableHexagonMISched) { + enablePass(&MachineSchedulerID); + MachineSchedRegistry::setDefault(createVLIWMachineSched); + } + } HexagonTargetMachine &getHexagonTargetMachine() const { return getTM<HexagonTargetMachine>(); diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index a03ed03..3d5f685 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -3474,8 +3474,8 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { // 1. Two loads unless they are volatile. // 2. Two stores in V4 unless they are volatile. else if ((DepType == SDep::Order) && - !I->hasVolatileMemoryRef() && - !J->hasVolatileMemoryRef()) { + !I->hasOrderedMemoryRef() && + !J->hasOrderedMemoryRef()) { if (QRI->Subtarget.hasV4TOps() && // hexagonv4 allows dual store. MCIDI.mayStore() && MCIDJ.mayStore()) { diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp index d6e6c36..86f75d1 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp @@ -24,7 +24,7 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Target &T, StringRef TT) { HasLEB128 = true; PrivateGlobalPrefix = ".L"; - LCOMMDirectiveType = LCOMM::ByteAlignment; + LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment; InlineAsmStart = "# InlineAsm Start"; InlineAsmEnd = "# InlineAsm End"; ZeroDirective = "\t.space\t"; diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp index 38fb0e8..4059403 100644 --- a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp +++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp @@ -56,6 +56,12 @@ class MBlazeAsmParser : public MCTargetAsmParser { /// } + unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands, + unsigned OperandNum, unsigned &NumMCOperands) { + return getMCInstOperandNumImpl(Kind, Inst, Operands, OperandNum, + NumMCOperands); + } public: MBlazeAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser) @@ -317,10 +323,10 @@ MatchAndEmitInstruction(SMLoc IDLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out) { MCInst Inst; - SMLoc ErrorLoc; + unsigned Kind; unsigned ErrorInfo; - switch (MatchInstructionImpl(Operands, Inst, ErrorInfo)) { + switch (MatchInstructionImpl(Operands, Kind, Inst, ErrorInfo)) { default: break; case Match_Success: Out.EmitInstruction(Inst); @@ -329,10 +335,8 @@ MatchAndEmitInstruction(SMLoc IDLoc, return Error(IDLoc, "instruction use requires an option to be enabled"); case Match_MnemonicFail: return Error(IDLoc, "unrecognized instruction mnemonic"); - case Match_ConversionFail: - return Error(IDLoc, "unable to convert operands to instruction"); - case Match_InvalidOperand: - ErrorLoc = IDLoc; + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; if (ErrorInfo != ~0U) { if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); @@ -343,6 +347,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, return Error(ErrorLoc, "invalid operand for instruction"); } + } llvm_unreachable("Implement any new match types added!"); } diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp index 46f5207..daa76e8 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp @@ -140,7 +140,7 @@ 
eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned oi = i == 2 ? 1 : 2; - DEBUG(dbgs() << "\nFunction : " << MF.getFunction()->getName() << "\n"; + DEBUG(dbgs() << "\nFunction : " << MF.getName() << "\n"; dbgs() << "<--------->\n" << MI); int FrameIndex = MI.getOperand(i).getIndex(); diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 43bd345..8418b75 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -8,20 +8,36 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/MipsMCTargetDesc.h" +#include "MipsRegisterInfo.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/Support/TargetRegistry.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/Support/MathExtras.h" +#include "llvm/Support/TargetRegistry.h" using namespace llvm; namespace { + class MipsAsmParser : public MCTargetAsmParser { + enum FpFormatTy { + FP_FORMAT_NONE = -1, + FP_FORMAT_S, + FP_FORMAT_D, + FP_FORMAT_L, + FP_FORMAT_W + } FpFormat; + + MCSubtargetInfo &STI; + MCAsmParser &Parser; + #define GET_ASSEMBLER_HEADER #include "MipsGenAsmMatcher.inc" @@ -34,14 +50,67 @@ class MipsAsmParser : public MCTargetAsmParser { bool ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands); + bool parseMathOperation(StringRef Name, SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands); + bool ParseDirective(AsmToken DirectiveID); - OperandMatchResultTy parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*>&); + MipsAsmParser::OperandMatchResultTy + parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*>&); + + unsigned + getMCInstOperandNum(unsigned Kind, MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands, + unsigned OperandNum, unsigned &NumMCOperands); + + bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, + StringRef Mnemonic); + + int tryParseRegister(StringRef Mnemonic); + + bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, + StringRef Mnemonic); + + bool parseMemOffset(const MCExpr *&Res); + bool parseRelocOperand(const MCExpr *&Res); + MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol); + + bool isMips64() const { + return (STI.getFeatureBits() & Mips::FeatureMips64) != 0; + } + + bool isFP64() const { + return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0; + } + + int matchRegisterName(StringRef Symbol); + + int matchRegisterByNumber(unsigned RegNum, StringRef Mnemonic); + + void setFpFormat(FpFormatTy Format) { + FpFormat = Format; + } + + void setDefaultFpFormat(); + + void setFpFormat(StringRef Format); + + FpFormatTy getFpFormat() {return FpFormat;} + + bool requestsDoubleOperand(StringRef Mnemonic); + + unsigned getReg(int RC,int RegNo); + public: MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser) - : MCTargetAsmParser() { + : MCTargetAsmParser(), STI(sti), Parser(parser) { + // Initialize the set of available features. 
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); } + MCAsmParser &getParser() const { return Parser; } + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + }; } @@ -50,6 +119,7 @@ namespace { /// MipsOperand - Instances of this class represent a parsed Mips machine /// instruction. class MipsOperand : public MCParsedAsmOperand { + enum KindTy { k_CondCode, k_CoprocNum, @@ -61,18 +131,58 @@ class MipsOperand : public MCParsedAsmOperand { } Kind; MipsOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + + union { + struct { + const char *Data; + unsigned Length; + } Tok; + + struct { + unsigned RegNum; + } Reg; + + struct { + const MCExpr *Val; + } Imm; + + struct { + unsigned Base; + const MCExpr *Off; + } Mem; + }; + + SMLoc StartLoc, EndLoc; + public: void addRegOperands(MCInst &Inst, unsigned N) const { - llvm_unreachable("unimplemented!"); + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); } + void addExpr(MCInst &Inst, const MCExpr *Expr) const{ - llvm_unreachable("unimplemented!"); + // Add as immediate when possible. Null MCExpr = 0. + if (Expr == 0) + Inst.addOperand(MCOperand::CreateImm(0)); + else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(Expr)); } + void addImmOperands(MCInst &Inst, unsigned N) const { - llvm_unreachable("unimplemented!"); + assert(N == 1 && "Invalid number of operands!"); + const MCExpr *Expr = getImm(); + addExpr(Inst,Expr); } + void addMemOperands(MCInst &Inst, unsigned N) const { - llvm_unreachable("unimplemented!"); + assert(N == 2 && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::CreateReg(getMemBase())); + + const MCExpr *Expr = getMemOff(); + addExpr(Inst,Expr); } bool isReg() const { return Kind == k_Register; } @@ -82,46 +192,751 @@ public: StringRef getToken() const { assert(Kind == k_Token && "Invalid access!"); - return ""; + return StringRef(Tok.Data, Tok.Length); } unsigned getReg() const { assert((Kind == k_Register) && "Invalid access!"); - return 0; + return Reg.RegNum; + } + + const MCExpr *getImm() const { + assert((Kind == k_Immediate) && "Invalid access!"); + return Imm.Val; } + unsigned getMemBase() const { + assert((Kind == k_Memory) && "Invalid access!"); + return Mem.Base; + } + + const MCExpr *getMemOff() const { + assert((Kind == k_Memory) && "Invalid access!"); + return Mem.Off; + } + + static MipsOperand *CreateToken(StringRef Str, SMLoc S) { + MipsOperand *Op = new MipsOperand(k_Token); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static MipsOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { + MipsOperand *Op = new MipsOperand(k_Register); + Op->Reg.RegNum = RegNum; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { + MipsOperand *Op = new MipsOperand(k_Immediate); + Op->Imm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static MipsOperand *CreateMem(unsigned Base, const MCExpr *Off, + SMLoc S, SMLoc E) { + MipsOperand *Op = new MipsOperand(k_Memory); + Op->Mem.Base = Base; + Op->Mem.Off = Off; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + /// getStartLoc - Get the location of the first token of this operand. 
+ SMLoc getStartLoc() const { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const { return EndLoc; } + virtual void print(raw_ostream &OS) const { llvm_unreachable("unimplemented!"); } }; } +unsigned MipsAsmParser:: +getMCInstOperandNum(unsigned Kind, MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands, + unsigned OperandNum, unsigned &NumMCOperands) { + assert (0 && "getMCInstOperandNum() not supported by the Mips target."); + // The Mips backend doesn't currently include the matcher implementation, so + // the getMCInstOperandNumImpl() is undefined. This is a temporary + // work around. + NumMCOperands = 0; + return 0; +} + bool MipsAsmParser:: MatchAndEmitInstruction(SMLoc IDLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out) { + MCInst Inst; + unsigned ErrorInfo; + unsigned Kind; + unsigned MatchResult = MatchInstructionImpl(Operands, Kind, Inst, ErrorInfo); + + switch (MatchResult) { + default: break; + case Match_Success: { + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst); + return false; + } + case Match_MissingFeature: + Error(IDLoc, "instruction requires a CPU feature not currently enabled"); + return true; + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = ((MipsOperand*)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + } + + return Error(ErrorLoc, "invalid operand for instruction"); + } + case Match_MnemonicFail: + return Error(IDLoc, "invalid instruction"); + } return true; } +int MipsAsmParser::matchRegisterName(StringRef Name) { + + int CC = StringSwitch<unsigned>(Name) + .Case("zero", Mips::ZERO) + .Case("a0", Mips::A0) + .Case("a1", Mips::A1) + .Case("a2", Mips::A2) + .Case("a3", Mips::A3) + .Case("v0", Mips::V0) + .Case("v1", Mips::V1) + .Case("s0", Mips::S0) + .Case("s1", Mips::S1) + .Case("s2", Mips::S2) + .Case("s3", Mips::S3) + .Case("s4", Mips::S4) + .Case("s5", Mips::S5) + .Case("s6", Mips::S6) + .Case("s7", Mips::S7) + .Case("k0", Mips::K0) + .Case("k1", Mips::K1) + .Case("sp", Mips::SP) + .Case("fp", Mips::FP) + .Case("gp", Mips::GP) + .Case("ra", Mips::RA) + .Case("t0", Mips::T0) + .Case("t1", Mips::T1) + .Case("t2", Mips::T2) + .Case("t3", Mips::T3) + .Case("t4", Mips::T4) + .Case("t5", Mips::T5) + .Case("t6", Mips::T6) + .Case("t7", Mips::T7) + .Case("t8", Mips::T8) + .Case("t9", Mips::T9) + .Case("at", Mips::AT) + .Case("fcc0", Mips::FCC0) + .Default(-1); + + if (CC != -1) { + //64 bit register in Mips are following 32 bit definitions. 
+ if (isMips64()) + CC++; + return CC; + } + + if (Name[0] == 'f') { + StringRef NumString = Name.substr(1); + unsigned IntVal; + if( NumString.getAsInteger(10, IntVal)) + return -1; //not integer + if (IntVal > 31) + return -1; + + FpFormatTy Format = getFpFormat(); + + if (Format == FP_FORMAT_S || Format == FP_FORMAT_W) + return getReg(Mips::FGR32RegClassID, IntVal); + if (Format == FP_FORMAT_D) { + if(isFP64()) { + return getReg(Mips::FGR64RegClassID, IntVal); + } + //only even numbers available as register pairs + if (( IntVal > 31) || (IntVal%2 != 0)) + return -1; + return getReg(Mips::AFGR64RegClassID, IntVal/2); + } + } + + return -1; +} +void MipsAsmParser::setDefaultFpFormat() { + + if (isMips64() || isFP64()) + FpFormat = FP_FORMAT_D; + else + FpFormat = FP_FORMAT_S; +} + +bool MipsAsmParser::requestsDoubleOperand(StringRef Mnemonic){ + + bool IsDouble = StringSwitch<bool>(Mnemonic.lower()) + .Case("ldxc1", true) + .Case("ldc1", true) + .Case("sdxc1", true) + .Case("sdc1", true) + .Default(false); + + return IsDouble; +} +void MipsAsmParser::setFpFormat(StringRef Format) { + + FpFormat = StringSwitch<FpFormatTy>(Format.lower()) + .Case(".s", FP_FORMAT_S) + .Case(".d", FP_FORMAT_D) + .Case(".l", FP_FORMAT_L) + .Case(".w", FP_FORMAT_W) + .Default(FP_FORMAT_NONE); +} + +unsigned MipsAsmParser::getReg(int RC,int RegNo){ + return *(getContext().getRegisterInfo().getRegClass(RC).begin() + RegNo); +} + +int MipsAsmParser::matchRegisterByNumber(unsigned RegNum,StringRef Mnemonic) { + + if (Mnemonic.lower() == "rdhwr") { + //at the moment only hwreg29 is supported + if (RegNum != 29) + return -1; + return Mips::HWR29; + } + + if (RegNum > 31) + return -1; + + return getReg(Mips::CPURegsRegClassID,RegNum); +} + +int MipsAsmParser::tryParseRegister(StringRef Mnemonic) { + const AsmToken &Tok = Parser.getTok(); + int RegNum = -1; + + if (Tok.is(AsmToken::Identifier)) { + std::string lowerCase = Tok.getString().lower(); + RegNum = matchRegisterName(lowerCase); + } else if (Tok.is(AsmToken::Integer)) + RegNum = matchRegisterByNumber(static_cast<unsigned> (Tok.getIntVal()), + Mnemonic.lower()); + else + return RegNum; //error + //64 bit div operations require Mips::ZERO instead of MIPS::ZERO_64 + if (isMips64() && RegNum == Mips::ZERO_64) { + if (Mnemonic.find("ddiv") != StringRef::npos) + RegNum = Mips::ZERO; + } + return RegNum; +} + bool MipsAsmParser:: -ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, + StringRef Mnemonic){ + + SMLoc S = Parser.getTok().getLoc(); + int RegNo = -1; + + //FIXME: we should make a more generic method for CCR + if ((Mnemonic == "cfc1" || Mnemonic == "ctc1") + && Operands.size() == 2 && Parser.getTok().is(AsmToken::Integer)){ + RegNo = Parser.getTok().getIntVal(); //get the int value + //at the moment only fcc0 is supported + if (RegNo == 0) + RegNo = Mips::FCC0; + } else + RegNo = tryParseRegister(Mnemonic); + if (RegNo == -1) + return true; + + Operands.push_back(MipsOperand::CreateReg(RegNo, S, + Parser.getTok().getLoc())); + Parser.Lex(); // Eat register token. + return false; +} + +bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands, + StringRef Mnemonic) { + //Check if the current operand has a custom associated parser, if so, try to + //custom parse the operand, or fallback to the general approach. 
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + if (ResTy == MatchOperand_Success) + return false; + // If there wasn't a custom match, try the generic matcher below. Otherwise, + // there was a match, but an error occurred, in which case, just return that + // the operand parsing failed. + if (ResTy == MatchOperand_ParseFail) + return true; + + switch (getLexer().getKind()) { + default: + Error(Parser.getTok().getLoc(), "unexpected token in operand"); + return true; + case AsmToken::Dollar: { + //parse register + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat dollar token. + //parse register operand + if (!tryParseRegisterOperand(Operands,Mnemonic)) { + if (getLexer().is(AsmToken::LParen)) { + //check if it is indexed addressing operand + Operands.push_back(MipsOperand::CreateToken("(", S)); + Parser.Lex(); //eat parenthesis + if (getLexer().isNot(AsmToken::Dollar)) + return true; + + Parser.Lex(); //eat dollar + if (tryParseRegisterOperand(Operands,Mnemonic)) + return true; + + if (!getLexer().is(AsmToken::RParen)) + return true; + + S = Parser.getTok().getLoc(); + Operands.push_back(MipsOperand::CreateToken(")", S)); + Parser.Lex(); + } + return false; + } + //maybe it is a symbol reference + StringRef Identifier; + if (Parser.ParseIdentifier(Identifier)) + return true; + + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + + MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier); + + // Otherwise create a symbol ref. + const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, + getContext()); + + Operands.push_back(MipsOperand::CreateImm(Res, S, E)); + return false; + } + case AsmToken::Identifier: + case AsmToken::LParen: + case AsmToken::Minus: + case AsmToken::Plus: + case AsmToken::Integer: + case AsmToken::String: { + // quoted label names + const MCExpr *IdVal; + SMLoc S = Parser.getTok().getLoc(); + if (getParser().ParseExpression(IdVal)) + return true; + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(MipsOperand::CreateImm(IdVal, S, E)); + return false; + } + case AsmToken::Percent: { + //it is a symbol reference or constant expression + const MCExpr *IdVal; + SMLoc S = Parser.getTok().getLoc(); //start location of the operand + if (parseRelocOperand(IdVal)) + return true; + + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + + Operands.push_back(MipsOperand::CreateImm(IdVal, S, E)); + return false; + }//case AsmToken::Percent + }//switch(getLexer().getKind()) + return true; +} + +bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) { + + Parser.Lex(); //eat % token + const AsmToken &Tok = Parser.getTok(); //get next token, operation + if (Tok.isNot(AsmToken::Identifier)) + return true; + + std::string Str = Tok.getIdentifier().str(); + + Parser.Lex(); //eat identifier + //now make expression from the rest of the operand + const MCExpr *IdVal; + SMLoc EndLoc; + + if (getLexer().getKind() == AsmToken::LParen) { + while (1) { + Parser.Lex(); //eat '(' token + if (getLexer().getKind() == AsmToken::Percent) { + Parser.Lex(); //eat % token + const AsmToken &nextTok = Parser.getTok(); + if (nextTok.isNot(AsmToken::Identifier)) + return true; + Str += "(%"; + Str += nextTok.getIdentifier(); + Parser.Lex(); //eat identifier + if (getLexer().getKind() != AsmToken::LParen) + return true; + } else + break; + } + if (getParser().ParseParenExpression(IdVal,EndLoc)) + return true; + + while (getLexer().getKind() == 
AsmToken::RParen) + Parser.Lex(); //eat ')' token + + } else + return true; //parenthesis must follow reloc operand + + //Check the type of the expression + if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal)) { + //it's a constant, evaluate lo or hi value + int Val = MCE->getValue(); + if (Str == "lo") { + Val = Val & 0xffff; + } else if (Str == "hi") { + Val = (Val & 0xffff0000) >> 16; + } + Res = MCConstantExpr::Create(Val, getContext()); + return false; + } + + if (const MCSymbolRefExpr *MSRE = dyn_cast<MCSymbolRefExpr>(IdVal)) { + //it's a symbol, create symbolic expression from symbol + StringRef Symbol = MSRE->getSymbol().getName(); + MCSymbolRefExpr::VariantKind VK = getVariantKind(Str); + Res = MCSymbolRefExpr::Create(Symbol,VK,getContext()); + return false; + } return true; } +bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { + + StartLoc = Parser.getTok().getLoc(); + RegNo = tryParseRegister(""); + EndLoc = Parser.getTok().getLoc(); + return (RegNo == (unsigned)-1); +} + +bool MipsAsmParser::parseMemOffset(const MCExpr *&Res) { + + SMLoc S; + + switch(getLexer().getKind()) { + default: + return true; + case AsmToken::Integer: + case AsmToken::Minus: + case AsmToken::Plus: + return (getParser().ParseExpression(Res)); + case AsmToken::Percent: + return parseRelocOperand(Res); + case AsmToken::LParen: + return false; //it's probably assuming 0 + } + return true; +} + +MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( + SmallVectorImpl<MCParsedAsmOperand*>&Operands) { + + const MCExpr *IdVal = 0; + SMLoc S; + //first operand is the offset + S = Parser.getTok().getLoc(); + + if (parseMemOffset(IdVal)) + return MatchOperand_ParseFail; + + const AsmToken &Tok = Parser.getTok(); //get next token + if (Tok.isNot(AsmToken::LParen)) { + Error(Parser.getTok().getLoc(), "'(' expected"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat '(' token. + + const AsmToken &Tok1 = Parser.getTok(); //get next token + if (Tok1.is(AsmToken::Dollar)) { + Parser.Lex(); // Eat '$' token. + if (tryParseRegisterOperand(Operands,"")) { + Error(Parser.getTok().getLoc(), "unexpected token in operand"); + return MatchOperand_ParseFail; + } + + } else { + Error(Parser.getTok().getLoc(),"unexpected token in operand"); + return MatchOperand_ParseFail; + } + + const AsmToken &Tok2 = Parser.getTok(); //get next token + if (Tok2.isNot(AsmToken::RParen)) { + Error(Parser.getTok().getLoc(), "')' expected"); + return MatchOperand_ParseFail; + } + + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + + Parser.Lex(); // Eat ')' token. 
+ + if (IdVal == 0) + IdVal = MCConstantExpr::Create(0, getContext()); + + //now replace register operand with the mem operand + MipsOperand* op = static_cast<MipsOperand*>(Operands.back()); + int RegNo = op->getReg(); + //remove register from operands + Operands.pop_back(); + //and add memory operand + Operands.push_back(MipsOperand::CreateMem(RegNo, IdVal, S, E)); + delete op; + return MatchOperand_Success; +} + +MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) { + + MCSymbolRefExpr::VariantKind VK + = StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol) + .Case("hi", MCSymbolRefExpr::VK_Mips_ABS_HI) + .Case("lo", MCSymbolRefExpr::VK_Mips_ABS_LO) + .Case("gp_rel", MCSymbolRefExpr::VK_Mips_GPREL) + .Case("call16", MCSymbolRefExpr::VK_Mips_GOT_CALL) + .Case("got", MCSymbolRefExpr::VK_Mips_GOT) + .Case("tlsgd", MCSymbolRefExpr::VK_Mips_TLSGD) + .Case("tlsldm", MCSymbolRefExpr::VK_Mips_TLSLDM) + .Case("dtprel_hi", MCSymbolRefExpr::VK_Mips_DTPREL_HI) + .Case("dtprel_lo", MCSymbolRefExpr::VK_Mips_DTPREL_LO) + .Case("gottprel", MCSymbolRefExpr::VK_Mips_GOTTPREL) + .Case("tprel_hi", MCSymbolRefExpr::VK_Mips_TPREL_HI) + .Case("tprel_lo", MCSymbolRefExpr::VK_Mips_TPREL_LO) + .Case("got_disp", MCSymbolRefExpr::VK_Mips_GOT_DISP) + .Case("got_page", MCSymbolRefExpr::VK_Mips_GOT_PAGE) + .Case("got_ofst", MCSymbolRefExpr::VK_Mips_GOT_OFST) + .Case("hi(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_HI) + .Case("lo(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_LO) + .Default(MCSymbolRefExpr::VK_None); + + return VK; +} + +static int ConvertCcString(StringRef CondString) { + int CC = StringSwitch<unsigned>(CondString) + .Case(".f", 0) + .Case(".un", 1) + .Case(".eq", 2) + .Case(".ueq", 3) + .Case(".olt", 4) + .Case(".ult", 5) + .Case(".ole", 6) + .Case(".ule", 7) + .Case(".sf", 8) + .Case(".ngle", 9) + .Case(".seq", 10) + .Case(".ngl", 11) + .Case(".lt", 12) + .Case(".nge", 13) + .Case(".le", 14) + .Case(".ngt", 15) + .Default(-1); + + return CC; +} + +bool MipsAsmParser:: +parseMathOperation(StringRef Name, SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + //split the format + size_t Start = Name.find('.'), Next = Name.rfind('.'); + StringRef Format1 = Name.slice(Start, Next); + //and add the first format to the operands + Operands.push_back(MipsOperand::CreateToken(Format1, NameLoc)); + //now for the second format + StringRef Format2 = Name.slice(Next, StringRef::npos); + Operands.push_back(MipsOperand::CreateToken(Format2, NameLoc)); + + //set the format for the first register + setFpFormat(Format1); + + // Read the remaining operands. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + // Read the first operand. + if (ParseOperand(Operands, Name)) { + SMLoc Loc = getLexer().getLoc(); + Parser.EatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + + if (getLexer().isNot(AsmToken::Comma)) { + SMLoc Loc = getLexer().getLoc(); + Parser.EatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + + } + Parser.Lex(); // Eat the comma. + + //set the format for the first register + setFpFormat(Format2); + + // Parse and remember the operand. 
+ if (ParseOperand(Operands, Name)) { + SMLoc Loc = getLexer().getLoc(); + Parser.EatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + SMLoc Loc = getLexer().getLoc(); + Parser.EatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + + Parser.Lex(); // Consume the EndOfStatement + return false; +} + bool MipsAsmParser:: ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands) { - return true; + //floating point instructions: should register be treated as double? + if (requestsDoubleOperand(Name)) { + setFpFormat(FP_FORMAT_D); + Operands.push_back(MipsOperand::CreateToken(Name, NameLoc)); + } + else { + setDefaultFpFormat(); + // Create the leading tokens for the mnemonic, split by '.' characters. + size_t Start = 0, Next = Name.find('.'); + StringRef Mnemonic = Name.slice(Start, Next); + + Operands.push_back(MipsOperand::CreateToken(Mnemonic, NameLoc)); + + if (Next != StringRef::npos) { + //there is a format token in mnemonic + //StringRef Rest = Name.slice(Next, StringRef::npos); + size_t Dot = Name.find('.', Next+1); + StringRef Format = Name.slice(Next, Dot); + if (Dot == StringRef::npos) //only one '.' in a string, it's a format + Operands.push_back(MipsOperand::CreateToken(Format, NameLoc)); + else { + if (Name.startswith("c.")){ + // floating point compare, add '.' and immediate represent for cc + Operands.push_back(MipsOperand::CreateToken(".", NameLoc)); + int Cc = ConvertCcString(Format); + if (Cc == -1) { + return Error(NameLoc, "Invalid conditional code"); + } + SMLoc E = SMLoc::getFromPointer( + Parser.getTok().getLoc().getPointer() -1 ); + Operands.push_back(MipsOperand::CreateImm( + MCConstantExpr::Create(Cc, getContext()), NameLoc, E)); + } else { + //trunc, ceil, floor ... + return parseMathOperation(Name, NameLoc, Operands); + } + + //the rest is a format + Format = Name.slice(Dot, StringRef::npos); + Operands.push_back(MipsOperand::CreateToken(Format, NameLoc)); + } + + setFpFormat(Format); + } + } + + // Read the remaining operands. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + // Read the first operand. + if (ParseOperand(Operands, Name)) { + SMLoc Loc = getLexer().getLoc(); + Parser.EatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + + while (getLexer().is(AsmToken::Comma) ) { + Parser.Lex(); // Eat the comma. + + // Parse and remember the operand. 
+ if (ParseOperand(Operands, Name)) { + SMLoc Loc = getLexer().getLoc(); + Parser.EatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + SMLoc Loc = getLexer().getLoc(); + Parser.EatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + + Parser.Lex(); // Consume the EndOfStatement + return false; } bool MipsAsmParser:: ParseDirective(AsmToken DirectiveID) { - return true; -} -MipsAsmParser::OperandMatchResultTy MipsAsmParser:: - parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*>&) { - return MatchOperand_ParseFail; + if (DirectiveID.getString() == ".ent") { + //ignore this directive for now + Parser.Lex(); + return false; + } + + if (DirectiveID.getString() == ".end") { + //ignore this directive for now + Parser.Lex(); + return false; + } + + if (DirectiveID.getString() == ".frame") { + //ignore this directive for now + Parser.EatToEndOfStatement(); + return false; + } + + if (DirectiveID.getString() == ".set") { + //ignore this directive for now + Parser.EatToEndOfStatement(); + return false; + } + + if (DirectiveID.getString() == ".fmask") { + //ignore this directive for now + Parser.EatToEndOfStatement(); + return false; + } + + if (DirectiveID.getString() == ".mask") { + //ignore this directive for now + Parser.EatToEndOfStatement(); + return false; + } + + if (DirectiveID.getString() == ".gpword") { + //ignore this directive for now + Parser.EatToEndOfStatement(); + return false; + } + + return true; } extern "C" void LLVMInitializeMipsAsmParser() { @@ -130,3 +945,7 @@ extern "C" void LLVMInitializeMipsAsmParser() { RegisterMCAsmParser<MipsAsmParser> A(TheMips64Target); RegisterMCAsmParser<MipsAsmParser> B(TheMips64elTarget); } + +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION +#include "MipsGenAsmMatcher.inc" diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index f535c50..0f84358 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -21,6 +21,7 @@ add_llvm_target(MipsCodeGen MipsAsmPrinter.cpp MipsCodeEmitter.cpp MipsDelaySlotFiller.cpp + MipsDirectObjLower.cpp MipsELFWriterInfo.cpp MipsJITInfo.cpp MipsInstrInfo.cpp diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h index 234455e..9603327 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h +++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h @@ -122,7 +122,7 @@ inline static unsigned getMipsRegisterNumbering(unsigned RegEnum) { switch (RegEnum) { case Mips::ZERO: case Mips::ZERO_64: case Mips::F0: case Mips::D0_64: - case Mips::D0: + case Mips::D0: case Mips::FCC0: return 0; case Mips::AT: case Mips::AT_64: case Mips::F1: case Mips::D1_64: return 1; diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index b8489ca..5d240fe 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -56,7 +56,7 @@ namespace { MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64, bool IsLittleEndian) : MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS, - /*HasRelocationAddend*/ false, + /*HasRelocationAddend*/ (_isN64) ? 
true : false, /*IsN64*/ _isN64) {} MipsELFObjectWriter::~MipsELFObjectWriter() {} diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 8dab62d..1d7370a 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -143,7 +143,11 @@ getBranchTargetOpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups) const { const MCOperand &MO = MI.getOperand(OpNo); - assert(MO.isExpr() && "getBranchTargetOpValue expects only expressions"); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) return MO.getImm(); + assert(MO.isExpr() && + "getBranchTargetOpValue expects only expressions or immediates"); const MCExpr *Expr = MO.getExpr(); Fixups.push_back(MCFixup::Create(0, Expr, @@ -159,7 +163,10 @@ getJumpTargetOpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups) const { const MCOperand &MO = MI.getOperand(OpNo); - assert(MO.isExpr() && "getJumpTargetOpValue expects only expressions"); + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) return MO.getImm(); + assert(MO.isExpr() && + "getJumpTargetOpValue expects only expressions or an immediate"); const MCExpr *Expr = MO.getExpr(); Fixups.push_back(MCFixup::Create(0, Expr, diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp index 2bc286b..ec84ad8 100644 --- a/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/lib/Target/Mips/Mips16InstrInfo.cpp @@ -26,7 +26,7 @@ using namespace llvm; Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm) : MipsInstrInfo(tm, /* FIXME: set mips16 unconditional br */ 0), - RI(*tm.getSubtargetImpl(), *this) {} + RI(*tm.getSubtargetImpl()) {} const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { return RI; diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp index c15d1bf..106e82f 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.cpp +++ b/lib/Target/Mips/Mips16RegisterInfo.cpp @@ -38,9 +38,8 @@ using namespace llvm; -Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST, - const TargetInstrInfo &TII) - : MipsRegisterInfo(ST, TII) {} +Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST) + : MipsRegisterInfo(ST) {} // This function eliminate ADJCALLSTACKDOWN, // ADJCALLSTACKUP pseudo instructions diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h index 3f4b3a7..c702a15 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.h +++ b/lib/Target/Mips/Mips16RegisterInfo.h @@ -17,11 +17,11 @@ #include "MipsRegisterInfo.h" namespace llvm { +class Mips16InstrInfo; class Mips16RegisterInfo : public MipsRegisterInfo { public: - Mips16RegisterInfo(const MipsSubtarget &Subtarget, - const TargetInstrInfo &TII); + Mips16RegisterInfo(const MipsSubtarget &Subtarget); void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 20fc178..147be5d 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -110,9 +110,9 @@ def DSLLV : shift_rotate_reg<0x14, 0x00, "dsllv", shl, CPU64Regs>; def DSRLV : shift_rotate_reg<0x16, 0x00, "dsrlv", srl, CPU64Regs>; def DSRAV : shift_rotate_reg<0x17, 0x00, "dsrav", sra, CPU64Regs>; let Pattern = []<dag> in { -def DSLL32 : shift_rotate_imm64<0x3c, 0x00, "dsll32", shl>; -def DSRL32 : shift_rotate_imm64<0x3e, 0x00, 
"dsrl32", srl>; -def DSRA32 : shift_rotate_imm64<0x3f, 0x00, "dsra32", sra>; + def DSLL32 : shift_rotate_imm64<0x3c, 0x00, "dsll32", shl>; + def DSRL32 : shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl>; + def DSRA32 : shift_rotate_imm64<0x3f, 0x00, "dsra32", sra>; } } // Rotate Instructions @@ -217,7 +217,15 @@ let DecoderNamespace = "Mips64" in { def RDHWR64 : ReadHardware<CPU64Regs, HWRegs64>; def DEXT : ExtBase<3, "dext", CPU64Regs>; +let Pattern = []<dag> in { + def DEXTU : ExtBase<2, "dextu", CPU64Regs>; + def DEXTM : ExtBase<1, "dextm", CPU64Regs>; +} def DINS : InsBase<7, "dins", CPU64Regs>; +let Pattern = []<dag> in { + def DINSU : InsBase<6, "dinsu", CPU64Regs>; + def DINSM : InsBase<5, "dinsm", CPU64Regs>; +} let isCodeGenOnly = 1, rs = 0, shamt = 0 in { def DSLL64_32 : FR<0x00, 0x3c, (outs CPU64Regs:$rd), (ins CPURegs:$rt), diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/lib/Target/Mips/MipsAnalyzeImmediate.cpp index dc8fbd0..99b163e 100644 --- a/lib/Target/Mips/MipsAnalyzeImmediate.cpp +++ b/lib/Target/Mips/MipsAnalyzeImmediate.cpp @@ -91,7 +91,7 @@ void MipsAnalyzeImmediate::ReplaceADDiuSLLWithLUi(InstSeq &Seq) { // Sign-extend and shift operand of ADDiu and see if it still fits in 16-bit. int64_t Imm = SignExtend64<16>(Seq[0].ImmOpnd); - int64_t ShiftedImm = Imm << (Seq[1].ImmOpnd - 16); + int64_t ShiftedImm = (uint64_t)Imm << (Seq[1].ImmOpnd - 16); if (!isInt<16>(ShiftedImm)) return; diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 00ff754..e780134 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -15,6 +15,7 @@ #define DEBUG_TYPE "mips-asm-printer" #include "Mips.h" #include "MipsAsmPrinter.h" +#include "MipsDirectObjLower.h" #include "MipsInstrInfo.h" #include "MipsMCInstLower.h" #include "InstPrinter/MipsInstPrinter.h" @@ -58,33 +59,31 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - // Direct object specific instruction lowering - if (!OutStreamer.hasRawTextSupport()) - switch (MI->getOpcode()) { - case Mips::DSLL: - case Mips::DSRL: - case Mips::DSRA: - assert(MI->getNumOperands() == 3 && - "Invalid no. 
of machine operands for shift!"); - assert(MI->getOperand(2).isImm()); - int64_t Shift = MI->getOperand(2).getImm(); - if (Shift > 31) { - MCInst TmpInst0; - MCInstLowering.LowerLargeShift(MI, TmpInst0, Shift - 32); - OutStreamer.EmitInstruction(TmpInst0); - return; - } - break; - } - MachineBasicBlock::const_instr_iterator I = MI; MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); do { MCInst TmpInst0; MCInstLowering.Lower(I++, TmpInst0); + + // Direct object specific instruction lowering + if (!OutStreamer.hasRawTextSupport()){ + switch (TmpInst0.getOpcode()) { + // If shift amount is >= 32 it the inst needs to be lowered further + case Mips::DSLL: + case Mips::DSRL: + case Mips::DSRA: + Mips::LowerLargeShift(TmpInst0); + break; + // Double extract instruction is chosen by pos and size operands + case Mips::DEXT: + case Mips::DINS: + Mips::LowerDextDins(TmpInst0); + } + } + OutStreamer.EmitInstruction(TmpInst0); - } while ((I != E) && I->isInsideBundle()); + } while ((I != E) && I->isInsideBundle()); // Delay slot check } //===----------------------------------------------------------------------===// @@ -214,7 +213,7 @@ const char *MipsAsmPrinter::getCurrentABIString() const { case MipsSubtarget::N32: return "abiN32"; case MipsSubtarget::N64: return "abi64"; case MipsSubtarget::EABI: return "eabi32"; // TODO: handle eabi64 - default: llvm_unreachable("Unknown Mips ABI");; + default: llvm_unreachable("Unknown Mips ABI"); } } diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp index cb7022b..5433295 100644 --- a/lib/Target/Mips/MipsCodeEmitter.cpp +++ b/lib/Target/Mips/MipsCodeEmitter.cpp @@ -30,7 +30,6 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" -#include "llvm/Function.h" #include "llvm/PassManager.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -139,7 +138,7 @@ bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) { do { DEBUG(errs() << "JITTing function '" - << MF.getFunction()->getName() << "'\n"); + << MF.getName() << "'\n"); MCE.startFunction(MF); for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index 2bba8a3..e3c8ed7 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -30,10 +30,11 @@ STATISTIC(FilledSlots, "Number of delay slots filled"); STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that" " are not NOP."); -static cl::opt<bool> EnableDelaySlotFiller( - "enable-mips-delay-filler", +static cl::opt<bool> DisableDelaySlotFiller( + "disable-mips-delay-filler", cl::init(false), - cl::desc("Fill the Mips delay slots useful instructions."), + cl::desc("Disable the delay slot filler, which attempts to fill the Mips" + "delay slots with useful instructions."), cl::Hidden); // This option can be used to silence complaints by machine verifier passes. @@ -114,7 +115,9 @@ runOnMachineBasicBlock(MachineBasicBlock &MBB) { InstrIter D; - if (EnableDelaySlotFiller && findDelayInstr(MBB, I, D)) { + // Delay slot filling is disabled at -O0. 
+ if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None) && + findDelayInstr(MBB, I, D)) { MBB.splice(llvm::next(I), &MBB, D); ++UsefulSlots; } else diff --git a/lib/Target/Mips/MipsDirectObjLower.cpp b/lib/Target/Mips/MipsDirectObjLower.cpp new file mode 100644 index 0000000..0d74db8 --- /dev/null +++ b/lib/Target/Mips/MipsDirectObjLower.cpp @@ -0,0 +1,86 @@ +//===-- MipsDirectObjLower.cpp - Mips LLVM direct object lowering -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower Mips MCInst records that are normally +// left to the assembler to lower such as large shifts. +// +//===----------------------------------------------------------------------===// +#include "MipsDirectObjLower.h" +#include "MipsInstrInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" + +using namespace llvm; + +// If the D<shift> instruction has a shift amount that is greater +// than 31 (checked in calling routine), lower it to a D<shift>32 instruction +void Mips::LowerLargeShift(MCInst& Inst) { + + assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!"); + assert(Inst.getOperand(2).isImm()); + + bool isLarge = false; + int64_t Shift; + Shift = Inst.getOperand(2).getImm(); + if (Shift > 31) { + Shift -= 32; + isLarge = true; + } + + // saminus32 + (Inst.getOperand(2)).setImm(Shift); + + if (isLarge) + switch (Inst.getOpcode()) { + default: + // Calling function is not synchronized + llvm_unreachable("Unexpected shift instruction"); + case Mips::DSLL: + Inst.setOpcode(Mips::DSLL32); + return; + case Mips::DSRL: + Inst.setOpcode(Mips::DSRL32); + return; + case Mips::DSRA: + Inst.setOpcode(Mips::DSRA32); + return; + } +} + +// Pick a DEXT or DINS instruction variant based on the pos and size operands +void Mips::LowerDextDins(MCInst& InstIn) { + int Opcode = InstIn.getOpcode(); + + if (Opcode == Mips::DEXT) + assert(InstIn.getNumOperands() == 4 && + "Invalid no. of machine operands for DEXT!"); + else // Only DEXT and DINS are possible + assert(InstIn.getNumOperands() == 5 && + "Invalid no. of machine operands for DINS!"); + + assert(InstIn.getOperand(2).isImm()); + int64_t pos = InstIn.getOperand(2).getImm(); + assert(InstIn.getOperand(3).isImm()); + int64_t size = InstIn.getOperand(3).getImm(); + + if (size <= 32) { + if ((pos < 32)) { // DEXT/DINS, do nothing + return; + } else { // DEXTU/DINSU + InstIn.getOperand(2).setImm(pos - 32); + InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTU : Mips::DINSU); + return; + } + } else { // DEXTM/DINSM + assert(pos < 32 && "DEXT/DINS cannot have both size and pos > 32"); + InstIn.getOperand(3).setImm(size - 32); + InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTM : Mips::DINSM); + return; + } +} diff --git a/lib/Target/Mips/MipsDirectObjLower.h b/lib/Target/Mips/MipsDirectObjLower.h new file mode 100644 index 0000000..8813cc9 --- /dev/null +++ b/lib/Target/Mips/MipsDirectObjLower.h @@ -0,0 +1,28 @@ +//===-- MipsDirectObjLower.h - Mips LLVM direct object lowering *- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
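The two helpers above (LowerLargeShift and LowerDextDins) encode small opcode-selection rules. The following standalone sketch, with invented names and only the MIPS64 facts the patch relies on (D<shift>32 shifts by sa+32, DEXTU/DINSU take pos-32, DEXTM/DINSM take size-32), models the same decisions and compiles on its own as a sanity check:

#include <cassert>
#include <cstdint>
#include <string>

// Shift amounts above 31 must move to the "+32" opcode with sa reduced by 32.
static std::string pickShiftOpc(const std::string &Opc, int64_t &Sa) {
  if (Sa > 31) { Sa -= 32; return Opc + "32"; }   // e.g. DSLL -> DSLL32
  return Opc;
}

// DEXT/DINS variant from (pos, size): U-form when pos >= 32, M-form when
// size > 32; pos and size may not both exceed 32 (DINS follows the same rule).
static std::string pickExtVariant(int64_t &Pos, int64_t &Size) {
  if (Size <= 32) {
    if (Pos < 32) return "DEXT";
    Pos -= 32;    return "DEXTU";
  }
  assert(Pos < 32 && "cannot have both size and pos > 32");
  Size -= 32;     return "DEXTM";
}

int main() {
  int64_t Sa = 40;
  assert(pickShiftOpc("DSLL", Sa) == "DSLL32" && Sa == 8);  // dsll ..,40 -> dsll32 ..,8
  int64_t Pos = 36, Size = 16;
  assert(pickExtVariant(Pos, Size) == "DEXTU" && Pos == 4);
  Pos = 4; Size = 40;
  assert(pickExtVariant(Pos, Size) == "DEXTM" && Size == 8);
  return 0;
}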
+// +//===----------------------------------------------------------------------===// + +#ifndef MIPSDIRECTOBJLOWER_H +#define MIPSDIRECTOBJLOWER_H +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { + class MCInst; + class MCStreamer; + + namespace Mips { + /// MipsDirectObjLower - This name space is used to lower MCInstr in cases + // where the assembler usually finishes the lowering + // such as large shifts. + void LowerLargeShift(MCInst &Inst); + void LowerDextDins(MCInst &Inst); + } +} + +#endif diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 5a97c17..4205223 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -337,8 +337,9 @@ SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) { // Generate: // lui $2, %hi($CPI1_0) // lwc1 $f0, %lo($CPI1_0)($2) - if (Addr.getOperand(1).getOpcode() == MipsISD::Lo) { - SDValue LoVal = Addr.getOperand(1), Opnd0 = LoVal.getOperand(0); + if (Addr.getOperand(1).getOpcode() == MipsISD::Lo || + Addr.getOperand(1).getOpcode() == MipsISD::GPRel) { + SDValue Opnd0 = Addr.getOperand(1).getOperand(0); if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) || isa<JumpTableSDNode>(Opnd0)) { Base = Addr.getOperand(0); diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index c5207c6..aa7b459 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -1571,15 +1571,15 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) { SDVTList VTs = DAG.getVTList(MVT::i32); - MipsTargetObjectFile &TLOF = (MipsTargetObjectFile&)getObjFileLowering(); + const MipsTargetObjectFile &TLOF = (const MipsTargetObjectFile&)getObjFileLowering(); // %gp_rel relocation if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) { SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, MipsII::MO_GPREL); SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, dl, VTs, &GA, 1); - SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32); - return DAG.getNode(ISD::ADD, dl, MVT::i32, GOT, GPRelNode); + SDValue GPReg = DAG.getRegister(Mips::GP, MVT::i32); + return DAG.getNode(ISD::ADD, dl, MVT::i32, GPReg, GPRelNode); } // %hi/%lo relocation SDValue GAHi = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index 50e3eb5..8ade891 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -262,46 +262,3 @@ unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { } } } - -unsigned -llvm::Mips::loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII, - MachineBasicBlock& MBB, - MachineBasicBlock::iterator II, DebugLoc DL, - bool LastInstrIsADDiu, - MipsAnalyzeImmediate::Inst *LastInst) { - MipsAnalyzeImmediate AnalyzeImm; - unsigned Size = IsN64 ? 64 : 32; - unsigned LUi = IsN64 ? Mips::LUi64 : Mips::LUi; - unsigned ZEROReg = IsN64 ? Mips::ZERO_64 : Mips::ZERO; - unsigned ATReg = IsN64 ? 
Mips::AT_64 : Mips::AT; - - const MipsAnalyzeImmediate::InstSeq &Seq = - AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu); - MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin(); - - if (LastInst && (Seq.size() == 1)) { - *LastInst = *Inst; - return 0; - } - - // The first instruction can be a LUi, which is different from other - // instructions (ADDiu, ORI and SLL) in that it does not have a register - // operand. - if (Inst->Opc == LUi) - BuildMI(MBB, II, DL, TII.get(LUi), ATReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - else - BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - - // Build the remaining instructions in Seq. Skip the last instruction if - // LastInst is not 0. - for (++Inst; Inst != Seq.end() - !!LastInst; ++Inst) - BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - - if (LastInst) - *LastInst = *Inst; - - return Seq.size() - !!LastInst; -} diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index 7d56259..aca2bc7 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -88,18 +88,6 @@ private: const SmallVectorImpl<MachineOperand>& Cond) const; }; -namespace Mips { - /// Emit a series of instructions to load an immediate. All instructions - /// except for the last one are emitted. The function returns the number of - /// MachineInstrs generated. The opcode-immediate pair of the last - /// instruction is returned in LastInst, if it is not 0. - unsigned - loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII, - MachineBasicBlock& MBB, MachineBasicBlock::iterator II, - DebugLoc DL, bool LastInstrIsADDiu, - MipsAnalyzeImmediate::Inst *LastInst); -} - /// Create MipsInstrInfo objects. const MipsInstrInfo *createMips16InstrInfo(MipsTargetMachine &TM); const MipsInstrInfo *createMipsSEInstrInfo(MipsTargetMachine &TM); diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index fd952ef..6b6005f 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -74,9 +74,10 @@ def MipsRet : SDNode<"MipsISD::Ret", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; // These are target-independent nodes, but have target-specific formats. 
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart, - [SDNPHasChain, SDNPOutGlue]>; + [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + [SDNPHasChain, SDNPSideEffect, + SDNPOptInGlue, SDNPOutGlue]>; // MAdd*/MSub* nodes def MipsMAdd : SDNode<"MipsISD::MAdd", SDT_MipsMAddMSub, @@ -110,7 +111,7 @@ def MipsWrapper : SDNode<"MipsISD::Wrapper", SDTIntBinOp>; def MipsDynAlloc : SDNode<"MipsISD::DynAlloc", SDT_MipsDynAlloc, [SDNPHasChain, SDNPInGlue]>; -def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain]>; +def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain,SDNPSideEffect]>; def MipsExt : SDNode<"MipsISD::Ext", SDT_Ext>; def MipsIns : SDNode<"MipsISD::Ins", SDT_Ins>; @@ -1079,6 +1080,26 @@ def EXT : ExtBase<0, "ext", CPURegs>; def INS : InsBase<4, "ins", CPURegs>; //===----------------------------------------------------------------------===// +// Instruction aliases +//===----------------------------------------------------------------------===// +def : InstAlias<"move $dst,$src", (ADD CPURegs:$dst,CPURegs:$src,ZERO)>; +def : InstAlias<"bal $offset", (BGEZAL RA,brtarget:$offset)>; +def : InstAlias<"addu $rs,$rt,$imm", + (ADDiu CPURegs:$rs,CPURegs:$rt,simm16:$imm)>; +def : InstAlias<"add $rs,$rt,$imm", + (ADDi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>; +def : InstAlias<"and $rs,$rt,$imm", + (ANDi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>; +def : InstAlias<"j $rs", (JR CPURegs:$rs)>; +def : InstAlias<"not $rt,$rs", (NOR CPURegs:$rt,CPURegs:$rs,ZERO)>; +def : InstAlias<"neg $rt,$rs", (SUB CPURegs:$rt,ZERO,CPURegs:$rs)>; +def : InstAlias<"negu $rt,$rs", (SUBu CPURegs:$rt,ZERO,CPURegs:$rs)>; +def : InstAlias<"slt $rs,$rt,$imm", + (SLTi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>; +def : InstAlias<"xor $rs,$rt,$imm", + (XORi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>; + +//===----------------------------------------------------------------------===// // Arbitrary patterns that map to one or more instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index f78203f..b9dbd52 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -10,6 +10,10 @@ // This pass expands a branch or jump instruction into a long branch if its // offset is too large to fit into its immediate field. // +// FIXME: +// 1. Fix pc-region jump instructions which cross 256MB segment boundaries. +// 2. If program has inline assembly statements whose size cannot be +// determined accurately, load branch target addresses from the GOT. //===----------------------------------------------------------------------===// #define DEBUG_TYPE "mips-long-branch" @@ -48,7 +52,7 @@ namespace { typedef MachineBasicBlock::reverse_iterator ReverseIter; struct MBBInfo { - uint64_t Size; + uint64_t Size, Address; bool HasLongBranch; MachineInstr *Br; @@ -61,7 +65,10 @@ namespace { static char ID; MipsLongBranch(TargetMachine &tm) : MachineFunctionPass(ID), TM(tm), - TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())) {} + TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())), + IsPIC(TM.getRelocationModel() == Reloc::PIC_), + ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()), + LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 
13 : 9)) {} virtual const char *getPassName() const { return "Mips Long Branch"; @@ -81,6 +88,9 @@ namespace { const MipsInstrInfo *TII; MachineFunction *MF; SmallVector<MBBInfo, 16> MBBInfos; + bool IsPIC; + unsigned ABI; + unsigned LongBranchSeqSize; }; char MipsLongBranch::ID = 0; @@ -230,12 +240,6 @@ void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br, // Expand branch instructions to long branches. void MipsLongBranch::expandToLongBranch(MBBInfo &I) { - I.HasLongBranch = true; - - bool IsPIC = TM.getRelocationModel() == Reloc::PIC_; - unsigned ABI = TM.getSubtarget<MipsSubtarget>().getTargetABI(); - bool N64 = ABI == MipsSubtarget::N64; - MachineBasicBlock::iterator Pos; MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br); DebugLoc DL = I.Br->getDebugLoc(); @@ -248,101 +252,105 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { MBB->addSuccessor(LongBrMBB); if (IsPIC) { - // $longbr: - // addiu $sp, $sp, -regsize * 2 - // sw $ra, 0($sp) - // bal $baltgt - // sw $a3, regsize($sp) - // $baltgt: - // lui $a3, %hi($baltgt) - // lui $at, %hi($tgt) - // addiu $a3, $a3, %lo($baltgt) - // addiu $at, $at, %lo($tgt) - // subu $at, $at, $a3 - // addu $at, $ra, $at - // - // if n64: - // lui $a3, %highest($baltgt) - // lui $ra, %highest($tgt) - // addiu $a3, $a3, %higher($baltgt) - // addiu $ra, $ra, %higher($tgt) - // dsll $a3, $a3, 32 - // dsll $ra, $ra, 32 - // subu $at, $at, $a3 - // addu $at, $at, $ra - // - // lw $ra, 0($sp) - // lw $a3, regsize($sp) - // jr $at - // addiu $sp, $sp, regsize * 2 - // $fallthrough: - // - MF->getInfo<MipsFunctionInfo>()->setEmitNOAT(); MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); MF->insert(FallThroughMBB, BalTgtMBB); LongBrMBB->addSuccessor(BalTgtMBB); BalTgtMBB->addSuccessor(TgtMBB); - int RegSize = N64 ? 8 : 4; - unsigned AT = N64 ? Mips::AT_64 : Mips::AT; - unsigned A3 = N64 ? Mips::A3_64 : Mips::A3; - unsigned SP = N64 ? Mips::SP_64 : Mips::SP; - unsigned RA = N64 ? Mips::RA_64 : Mips::RA; - unsigned Load = N64 ? Mips::LD_P8 : Mips::LW; - unsigned Store = N64 ? Mips::SD_P8 : Mips::SW; - unsigned LUi = N64 ? Mips::LUi64 : Mips::LUi; - unsigned ADDiu = N64 ? Mips::DADDiu : Mips::ADDiu; - unsigned ADDu = N64 ? Mips::DADDu : Mips::ADDu; - unsigned SUBu = N64 ? Mips::SUBu : Mips::SUBu; - unsigned JR = N64 ? 
Mips::JR64 : Mips::JR; - - Pos = LongBrMBB->begin(); - - BuildMI(*LongBrMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP) - .addImm(-RegSize * 2); - BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(RA).addReg(SP) - .addImm(0); - BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB); - BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(A3).addReg(SP) - .addImm(RegSize)->setIsInsideBundle(); - - Pos = BalTgtMBB->begin(); - - BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3) - .addMBB(BalTgtMBB, MipsII::MO_ABS_HI); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), AT) - .addMBB(TgtMBB, MipsII::MO_ABS_HI); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3) - .addMBB(BalTgtMBB, MipsII::MO_ABS_LO); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), AT).addReg(AT) - .addMBB(TgtMBB, MipsII::MO_ABS_LO); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(RA).addReg(AT); - - if (N64) { - BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3) - .addMBB(BalTgtMBB, MipsII::MO_HIGHEST); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), RA) - .addMBB(TgtMBB, MipsII::MO_HIGHEST); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3) - .addMBB(BalTgtMBB, MipsII::MO_HIGHER); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), RA).addReg(RA) - .addMBB(TgtMBB, MipsII::MO_HIGHER); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), A3).addReg(A3) - .addImm(32); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), RA).addReg(RA) - .addImm(32); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(AT).addReg(RA); - I.Size += 4 * 8; + int64_t TgtAddress = MBBInfos[TgtMBB->getNumber()].Address; + int64_t Offset = TgtAddress - (I.Address + I.Size - 20); + int64_t Lo = SignExtend64<16>(Offset & 0xffff); + int64_t Hi = SignExtend64<16>(((Offset + 0x8000) >> 16) & 0xffff); + + if (ABI != MipsSubtarget::N64) { + // $longbr: + // addiu $sp, $sp, -8 + // sw $ra, 0($sp) + // bal $baltgt + // lui $at, %hi($tgt - $baltgt) + // $baltgt: + // addiu $at, $at, %lo($tgt - $baltgt) + // addu $at, $ra, $at + // lw $ra, 0($sp) + // jr $at + // addiu $sp, $sp, 8 + // $fallthrough: + // + + Pos = LongBrMBB->begin(); + + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP) + .addReg(Mips::SP).addImm(-8); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA) + .addReg(Mips::SP).addImm(0); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LUi), Mips::AT).addImm(Hi) + ->setIsInsideBundle(); + + Pos = BalTgtMBB->begin(); + + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::AT) + .addReg(Mips::AT).addImm(Lo); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT) + .addReg(Mips::RA).addReg(Mips::AT); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA) + .addReg(Mips::SP).addImm(0); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP) + .addReg(Mips::SP).addImm(8)->setIsInsideBundle(); + } else { + // $longbr: + // daddiu $sp, $sp, -16 + // sd $ra, 0($sp) + // lui64 $at, %highest($tgt - $baltgt) + // daddiu $at, $at, %higher($tgt - $baltgt) + // dsll $at, $at, 16 + // daddiu $at, $at, %hi($tgt - $baltgt) + // bal $baltgt + // dsll $at, $at, 16 + // $baltgt: + // daddiu $at, $at, %lo($tgt - $baltgt) + // daddu $at, $ra, $at + // ld $ra, 0($sp) + // jr64 $at + // daddiu $sp, 
$sp, 16 + // $fallthrough: + // + + int64_t Higher = SignExtend64<16>(((Offset + 0x80008000) >> 32) & 0xffff); + int64_t Highest = + SignExtend64<16>(((Offset + 0x800080008000LL) >> 48) & 0xffff); + + Pos = LongBrMBB->begin(); + + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64) + .addReg(Mips::SP_64).addImm(-16); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64) + .addReg(Mips::SP_64).addImm(0); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LUi64), Mips::AT_64) + .addImm(Highest); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64) + .addReg(Mips::AT_64).addImm(Higher); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64) + .addReg(Mips::AT_64).addImm(16); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64) + .addReg(Mips::AT_64).addImm(Hi); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64) + .addReg(Mips::AT_64).addImm(16)->setIsInsideBundle(); + + Pos = BalTgtMBB->begin(); + + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64) + .addReg(Mips::AT_64).addImm(Lo); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64) + .addReg(Mips::RA_64).addReg(Mips::AT_64); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64) + .addReg(Mips::SP_64).addImm(0); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64) + .addReg(Mips::SP_64).addImm(16)->setIsInsideBundle(); } - - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), RA).addReg(SP).addImm(0); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), A3).addReg(SP).addImm(RegSize); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(JR)).addReg(AT); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP) - .addImm(RegSize * 2)->setIsInsideBundle(); - I.Size += 4 * 14; } else { // $longbr: // j $tgt @@ -353,7 +361,6 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { LongBrMBB->addSuccessor(TgtMBB); BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::J)).addMBB(TgtMBB); BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::NOP))->setIsInsideBundle(); - I.Size += 4 * 2; } if (I.Br->isUnconditionalBranch()) { @@ -401,19 +408,36 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) { if (!I->Br || I->HasLongBranch) continue; - if (!ForceLongBranch) - // Check if offset fits into 16-bit immediate field of branches. - if (isInt<16>(computeOffset(I->Br) / 4)) - continue; + // Check if offset fits into 16-bit immediate field of branches. + if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / 4)) + continue; - expandToLongBranch(*I); + I->HasLongBranch = true; + I->Size += LongBranchSeqSize * 4; ++LongBranches; EverMadeChange = MadeChange = true; } } - if (EverMadeChange) - MF->RenumberBlocks(); + if (!EverMadeChange) + return true; + + // Compute basic block addresses. + if (TM.getRelocationModel() == Reloc::PIC_) { + MF->getInfo<MipsFunctionInfo>()->setEmitNOAT(); + + uint64_t Address = 0; + + for (I = MBBInfos.begin(); I != E; Address += I->Size, ++I) + I->Address = Address; + } + + // Do the expansion. + for (I = MBBInfos.begin(); I != E; ++I) + if (I->HasLongBranch) + expandToLongBranch(*I); + + MF->RenumberBlocks(); return true; } diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index d4c5e6d..5fa6339 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -11,7 +11,6 @@ // MCInst records. 
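The long-branch sequences above rebuild the branch offset from %lo/%hi (and, on N64, %higher/%highest) pieces; the 0x8000-style addends in the Lo/Hi/Higher/Highest computations round each part so that the sign-extended lower parts compose back without losing carries. A minimal self-contained check of that arithmetic, using a sample offset rather than code from the patch:

#include <cassert>
#include <cstdint>

// Sign-extend the low 16 bits, as SignExtend64<16> does.
static int64_t sext16(uint64_t V) { return (int64_t)((V & 0xffff) ^ 0x8000) - 0x8000; }

int main() {
  // Same formulas as in MipsLongBranch::expandToLongBranch.
  int64_t Off     = 0x12345678abcdLL;                        // sample positive offset
  int64_t Lo      = sext16((uint64_t)Off);
  int64_t Hi      = sext16((uint64_t)(Off + 0x8000) >> 16);
  int64_t Higher  = sext16((uint64_t)(Off + 0x80008000LL) >> 32);
  int64_t Highest = sext16((uint64_t)(Off + 0x800080008000LL) >> 48);

  // The N64 sequence effectively computes
  //   (((((Highest << 16) + Higher) << 16) + Hi) << 16) + Lo
  // which is the flat sum below.
  assert((Highest << 48) + (Higher << 32) + (Hi << 16) + Lo == Off);

  // The 32-bit (non-N64) sequence only needs Hi/Lo.
  int32_t Off32 = -0x7ff01234;
  int64_t Lo32 = sext16((uint32_t)Off32);
  int64_t Hi32 = sext16(((uint32_t)Off32 + 0x8000u) >> 16);
  assert(Hi32 * 0x10000 + Lo32 == Off32);
  return 0;
}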
// //===----------------------------------------------------------------------===// - #include "MipsMCInstLower.h" #include "MipsAsmPrinter.h" #include "MipsInstrInfo.h" @@ -161,31 +160,3 @@ void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { } } -// If the D<shift> instruction has a shift amount that is greater -// than 31 (checked in calling routine), lower it to a D<shift>32 instruction -void MipsMCInstLower::LowerLargeShift(const MachineInstr *MI, - MCInst& Inst, - int64_t Shift) { - // rt - Inst.addOperand(LowerOperand(MI->getOperand(0))); - // rd - Inst.addOperand(LowerOperand(MI->getOperand(1))); - // saminus32 - Inst.addOperand(MCOperand::CreateImm(Shift)); - - switch (MI->getOpcode()) { - default: - // Calling function is not synchronized - llvm_unreachable("Unexpected shift instruction"); - break; - case Mips::DSLL: - Inst.setOpcode(Mips::DSLL32); - break; - case Mips::DSRL: - Inst.setOpcode(Mips::DSRL32); - break; - case Mips::DSRA: - Inst.setOpcode(Mips::DSRA32); - break; - } -} diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h index 0abb996..3eab5a4 100644 --- a/lib/Target/Mips/MipsMCInstLower.h +++ b/lib/Target/Mips/MipsMCInstLower.h @@ -33,7 +33,6 @@ public: MipsMCInstLower(MipsAsmPrinter &asmprinter); void Initialize(Mangler *mang, MCContext *C); void Lower(const MachineInstr *MI, MCInst &OutMI) const; - void LowerLargeShift(const MachineInstr *MI, MCInst &Inst, int64_t Shift); private: MCOperand LowerSymbolOperand(const MachineOperand &MO, diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index ae6ae3a..79a142a 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -22,7 +22,6 @@ #include "llvm/Constants.h" #include "llvm/DebugInfo.h" #include "llvm/Type.h" -#include "llvm/Function.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFunction.h" @@ -43,9 +42,8 @@ using namespace llvm; -MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST, - const TargetInstrInfo &tii) - : MipsGenRegisterInfo(Mips::RA), Subtarget(ST), TII(tii) {} +MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST) + : MipsGenRegisterInfo(Mips::RA), Subtarget(ST) {} unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; } @@ -131,6 +129,12 @@ getReservedRegs(const MachineFunction &MF) const { Reserved.set(Mips::RA_64); } + // Reserve GP if small section is used. 
+ if (Subtarget.useSmallSection()) { + Reserved.set(Mips::GP); + Reserved.set(Mips::GP_64); + } + return Reserved; } @@ -160,7 +164,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, "Instr doesn't have FrameIndex operand!"); } - DEBUG(errs() << "\nFunction : " << MF.getFunction()->getName() << "\n"; + DEBUG(errs() << "\nFunction : " << MF.getName() << "\n"; errs() << "<--------->\n" << MI); int FrameIndex = MI.getOperand(i).getIndex(); diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index 9a05e94..78adf7f 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -22,16 +22,14 @@ namespace llvm { class MipsSubtarget; -class TargetInstrInfo; class Type; class MipsRegisterInfo : public MipsGenRegisterInfo { protected: const MipsSubtarget &Subtarget; - const TargetInstrInfo &TII; public: - MipsRegisterInfo(const MipsSubtarget &Subtarget, const TargetInstrInfo &tii); + MipsRegisterInfo(const MipsSubtarget &Subtarget); /// getRegisterNumbering - Given the enum value for some register, e.g. /// Mips::RA, return the number that it corresponds to (e.g. 31). diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index eeb1de3..e4b44ef 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -260,14 +260,53 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, if (isInt<16>(Amount))// addi sp, sp, amount BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount); else { // Expand immediate that doesn't fit in 16-bit. - unsigned ATReg = STI.isABI_N64() ? Mips::AT_64 : Mips::AT; - MBB.getParent()->getInfo<MipsFunctionInfo>()->setEmitNOAT(); - Mips::loadImmediate(Amount, STI.isABI_N64(), *this, MBB, I, DL, false, 0); - BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(ATReg); + unsigned Reg = loadImmediate(Amount, MBB, I, DL, 0); + BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(Reg); } } +/// This function generates the sequence of instructions needed to get the +/// result of adding register REG and immediate IMM. +unsigned +MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB, + MachineBasicBlock::iterator II, DebugLoc DL, + unsigned *NewImm) const { + MipsAnalyzeImmediate AnalyzeImm; + const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>(); + unsigned Size = STI.isABI_N64() ? 64 : 32; + unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi; + unsigned ZEROReg = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO; + unsigned ATReg = STI.isABI_N64() ? Mips::AT_64 : Mips::AT; + bool LastInstrIsADDiu = NewImm; + + const MipsAnalyzeImmediate::InstSeq &Seq = + AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu); + MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin(); + + assert(Seq.size() && (!LastInstrIsADDiu || (Seq.size() > 1))); + + // The first instruction can be a LUi, which is different from other + // instructions (ADDiu, ORI and SLL) in that it does not have a register + // operand. + if (Inst->Opc == LUi) + BuildMI(MBB, II, DL, get(LUi), ATReg) + .addImm(SignExtend64<16>(Inst->ImmOpnd)); + else + BuildMI(MBB, II, DL, get(Inst->Opc), ATReg).addReg(ZEROReg) + .addImm(SignExtend64<16>(Inst->ImmOpnd)); + + // Build the remaining instructions in Seq. 
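  // Illustration (annotation, not from the patch): for a 32-bit value such as
  // 0x12345678 a minimal sequence is
  //   lui $at, 0x1234        // $at = 0x12340000
  //   ori $at, $at, 0x5678   // $at = 0x12345678
  // Each ImmOpnd is added through SignExtend64<16>, so only its low 16 bits
  // matter to the encoding. When NewImm is non-null the final ADDiu is not
  // emitted; its 16-bit immediate is returned so the caller (eliminateFI) can
  // fold it into the memory operand's offset instead.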
+ for (++Inst; Inst != Seq.end() - LastInstrIsADDiu; ++Inst) + BuildMI(MBB, II, DL, get(Inst->Opc), ATReg).addReg(ATReg) + .addImm(SignExtend64<16>(Inst->ImmOpnd)); + + if (LastInstrIsADDiu) + *NewImm = Inst->ImmOpnd; + + return ATReg; +} + unsigned MipsSEInstrInfo::GetAnalyzableBrOpc(unsigned Opc) const { return (Opc == Mips::BEQ || Opc == Mips::BNE || Opc == Mips::BGTZ || Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ || diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h index 346e74d..55b78b2 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.h +++ b/lib/Target/Mips/MipsSEInstrInfo.h @@ -15,7 +15,6 @@ #define MIPSSEINSTRUCTIONINFO_H #include "MipsInstrInfo.h" -#include "MipsAnalyzeImmediate.h" #include "MipsSERegisterInfo.h" namespace llvm { @@ -70,6 +69,13 @@ public: void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; + /// Emit a series of instructions to load an immediate. If NewImm is a + /// non-NULL parameter, the last instruction is not emitted, but instead + /// its immediate operand is returned in NewImm. + unsigned loadImmediate(int64_t Imm, MachineBasicBlock &MBB, + MachineBasicBlock::iterator II, DebugLoc DL, + unsigned *NewImm) const; + private: virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const; diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp index 043a1ef..d868f73 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -40,8 +40,8 @@ using namespace llvm; MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST, - const TargetInstrInfo &TII) - : MipsRegisterInfo(ST, TII) {} + const MipsSEInstrInfo &I) + : MipsRegisterInfo(ST), TII(I) {} // This function eliminate ADJCALLSTACKDOWN, // ADJCALLSTACKUP pseudo instructions @@ -122,15 +122,14 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, DebugLoc DL = II->getDebugLoc(); unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu; unsigned ATReg = Subtarget.isABI_N64() ? 
Mips::AT_64 : Mips::AT; - MipsAnalyzeImmediate::Inst LastInst(0, 0); + unsigned NewImm; MipsFI->setEmitNOAT(); - Mips::loadImmediate(Offset, Subtarget.isABI_N64(), TII, MBB, II, DL, true, - &LastInst); - BuildMI(MBB, II, DL, TII.get(ADDu), ATReg).addReg(FrameReg).addReg(ATReg); + unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL, &NewImm); + BuildMI(MBB, II, DL, TII.get(ADDu), ATReg).addReg(FrameReg).addReg(Reg); FrameReg = ATReg; - Offset = SignExtend64<16>(LastInst.ImmOpnd); + Offset = SignExtend64<16>(NewImm); } MI.getOperand(OpNo).ChangeToRegister(FrameReg, false); diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h index 4b17b33..b4eab65 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.h +++ b/lib/Target/Mips/MipsSERegisterInfo.h @@ -18,11 +18,14 @@ #include "MipsRegisterInfo.h" namespace llvm { +class MipsSEInstrInfo; class MipsSERegisterInfo : public MipsRegisterInfo { + const MipsSEInstrInfo &TII; + public: MipsSERegisterInfo(const MipsSubtarget &Subtarget, - const TargetInstrInfo &TII); + const MipsSEInstrInfo &TII); void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 11ff809..ac83d83 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -25,7 +25,8 @@ using namespace llvm; void MipsSubtarget::anchor() { } MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool little) : + const std::string &FS, bool little, + Reloc::Model RM) : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little), IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false), @@ -54,6 +55,9 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, // Is the target system Linux ? if (TT.find("linux") == std::string::npos) IsLinux = false; + + // Set UseSmallSection. + UseSmallSection = !IsLinux && (RM == Reloc::Static); } bool diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index ba15362..0595e8d 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -65,6 +65,9 @@ protected: // isLinux - Target system is Linux. Is false we consider ELFOS for now. bool IsLinux; + // UseSmallSection - Small section is used. + bool UseSmallSection; + /// Features related to the presence of specific instructions. // HasSEInReg - SEB and SEH (signext in register) instructions. @@ -109,7 +112,7 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. MipsSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool little); + const std::string &FS, bool little, Reloc::Model RM); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. 
@@ -133,6 +136,7 @@ public: bool inMips16Mode() const { return InMips16Mode; } bool isAndroid() const { return IsAndroid; } bool isLinux() const { return IsLinux; } + bool useSmallSection() const { return UseSmallSection; } bool hasStandardEncoding() const { return !inMips16Mode(); } diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 2928a73..b70542b 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -42,7 +42,7 @@ MipsTargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL, bool isLittle) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, isLittle), + Subtarget(TT, CPU, FS, isLittle, RM), DataLayout(isLittle ? (Subtarget.isABI_N64() ? "e-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" : diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp index 04dc60a..1f5e34f 100644 --- a/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -26,6 +26,7 @@ SSThreshold("mips-ssection-threshold", cl::Hidden, void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){ TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); SmallDataSection = getContext().getELFSection(".sdata", ELF::SHT_PROGBITS, @@ -60,9 +61,10 @@ bool MipsTargetObjectFile:: IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, SectionKind Kind) const { - // Only use small section for non linux targets. const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>(); - if (Subtarget.isLinux()) + + // Return if small section is not available. + if (!Subtarget.useSmallSection()) return false; // Only global variables, not functions. diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index d175e3e..413142e 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -137,7 +137,7 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { char Value = MI->getOperand(OpNo).getImm(); - Value = (Value << (32-5)) >> (32-5); + Value = SignExtend32<5>(Value); O << (int)Value; } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 245b457..b9ea8b5 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -64,7 +64,6 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) { ZeroDirective = "\t.space\t"; Data64bitsDirective = is64Bit ? "\t.quad\t" : 0; - LCOMMDirectiveType = LCOMM::NoAlignment; AssemblerDialect = 0; // Old-Style mnemonics. 
} diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td index b7f1688..cb15dad 100644 --- a/lib/Target/PowerPC/PPC.td +++ b/lib/Target/PowerPC/PPC.td @@ -35,6 +35,10 @@ def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">; def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">; def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">; def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">; +def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective", + "PPC::DIR_E500mc", "">; +def DirectiveE5500 : SubtargetFeature<"", "DarwinDirective", + "PPC::DIR_E5500", "">; def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">; def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">; @@ -94,6 +98,12 @@ def : Processor<"g5", G5Itineraries, [Directive970, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, Feature64Bit /*, Feature64BitRegs */]>; +def : ProcessorModel<"e500mc", PPCE500mcModel, + [DirectiveE500mc, FeatureMFOCRF, + FeatureSTFIWX, FeatureBookE, FeatureISEL]>; +def : ProcessorModel<"e5500", PPCE5500Model, + [DirectiveE5500, FeatureMFOCRF, Feature64Bit, + FeatureSTFIWX, FeatureBookE, FeatureISEL]>; def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE, FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, FeatureISEL, diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index f76b89c..6e0e8bb 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -109,6 +109,8 @@ namespace { bool doFinalization(Module &M); virtual void EmitFunctionEntryLabel(); + + void EmitFunctionBodyEnd(); }; /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac @@ -345,23 +347,32 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { OutStreamer.EmitLabel(PICBase); return; } + case PPC::LDtocJTI: + case PPC::LDtocCPT: case PPC::LDtoc: { // Transform %X3 = LDtoc <ga:@min1>, %X2 LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); - + // Change the opcode to LD, and the global address operand to be a // reference to the TOC entry we will synthesize later. TmpInst.setOpcode(PPC::LD); const MachineOperand &MO = MI->getOperand(1); - assert(MO.isGlobal()); - - // Map symbol -> label of TOC entry. - MCSymbol *&TOCEntry = TOC[Mang->getSymbol(MO.getGlobal())]; + + // Map symbol -> label of TOC entry + assert(MO.isGlobal() || MO.isCPI() || MO.isJTI()); + MCSymbol *MOSymbol = 0; + if (MO.isGlobal()) + MOSymbol = Mang->getSymbol(MO.getGlobal()); + else if (MO.isCPI()) + MOSymbol = GetCPISymbol(MO.getIndex()); + else if (MO.isJTI()) + MOSymbol = GetJTISymbol(MO.getIndex()); + MCSymbol *&TOCEntry = TOC[MOSymbol]; if (TOCEntry == 0) TOCEntry = GetTempSymbol("C", TOCLabelID++); - + const MCExpr *Exp = - MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC, + MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC_ENTRY, OutContext); TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp); OutStreamer.EmitInstruction(TmpInst); @@ -406,9 +417,9 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { OutContext.GetOrCreateSymbol(".L." + Twine(CurrentFnSym->getName())); MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC.@tocbase")); OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext), - Subtarget.isPPC64() ? 
8 : 4/*size*/, 0/*addrspace*/); + 8/*size*/, 0/*addrspace*/); OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2, OutContext), - Subtarget.isPPC64() ? 8 : 4/*size*/, 0/*addrspace*/); + 8/*size*/, 0/*addrspace*/); OutStreamer.SwitchSection(Current); MCSymbol *RealFnSym = OutContext.GetOrCreateSymbol( @@ -441,6 +452,23 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) { return AsmPrinter::doFinalization(M); } +/// EmitFunctionBodyEnd - Print the traceback table before the .size +/// directive. +/// +void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() { + // Only the 64-bit target requires a traceback table. For now, + // we only emit the word of zeroes that GDB requires to find + // the end of the function, and zeroes for the eight-byte + // mandatory fields. + // FIXME: We should fill in the eight-byte mandatory fields as described in + // the PPC64 ELF ABI (this is a low-priority item because GDB does not + // currently make use of these fields). + if (Subtarget.isPPC64()) { + OutStreamer.EmitIntValue(0, 4/*size*/); + OutStreamer.EmitIntValue(0, 8/*size*/); + } +} + void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { static const char *const CPUDirectives[] = { "", @@ -453,6 +481,8 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { "ppc750", "ppc970", "ppcA2", + "ppce500mc", + "ppce5500", "power6", "power7", "ppc64" diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index a00f686..e8f4d16 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -975,6 +975,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { case ISD::AND: { unsigned Imm, Imm2, SH, MB, ME; + uint64_t Imm64; // If this is an and of a value rotated between 0 and 31 bits and then and'd // with a mask, emit rlwinm @@ -993,6 +994,14 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) }; return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); } + // If this is a 64-bit zero-extension mask, emit rldicl. + if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) && + isMask_64(Imm64)) { + SDValue Val = N->getOperand(0); + MB = 64 - CountTrailingOnes_64(Imm64); + SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB) }; + return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops, 3); + } // AND X, 0 -> 0, not "rlwinm 32". if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) { ReplaceUses(SDValue(N, 0), N->getOperand(1)); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 61d44c5..dbb3b14 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -449,6 +449,21 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setSchedulingPreference(Sched::Hybrid); computeRegisterProperties(); + + // The Freescale cores does better with aggressive inlining of memcpy and + // friends. Gcc uses same threshold of 128 bytes (= 32 word stores). 
+ if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc || + Subtarget->getDarwinDirective() == PPC::DIR_E5500) { + maxStoresPerMemset = 32; + maxStoresPerMemsetOptSize = 16; + maxStoresPerMemcpy = 32; + maxStoresPerMemcpyOptSize = 8; + maxStoresPerMemmove = 32; + maxStoresPerMemmoveOptSize = 8; + + setPrefFunctionAlignment(4); + benefitFromCodePlacementOpt = true; + } } /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate @@ -517,6 +532,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; case PPCISD::MTFSF: return "PPCISD::MTFSF"; case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; + case PPCISD::CR6SET: return "PPCISD::CR6SET"; + case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; } } @@ -811,14 +828,13 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { } // Properly sign extend the value. - int ShAmt = (4-ByteSize)*8; - int MaskVal = ((int)Value << ShAmt) >> ShAmt; + int MaskVal = SignExtend32(Value, ByteSize * 8); // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros. if (MaskVal == 0) return SDValue(); // Finally, if this value fits in a 5 bit sext field, return it - if (((MaskVal << (32-5)) >> (32-5)) == MaskVal) + if (SignExtend32<5>(MaskVal) == MaskVal) return DAG.getTargetConstant(MaskVal, MVT::i32); return SDValue(); } @@ -1204,6 +1220,14 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); const Constant *C = CP->getConstVal(); + // 64-bit SVR4 ABI code is always position-independent. + // The actual address of the GlobalValue is stored in the TOC. + if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { + SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); + return DAG.getNode(PPCISD::TOC_ENTRY, CP->getDebugLoc(), MVT::i64, GA, + DAG.getRegister(PPC::X2, MVT::i64)); + } + unsigned MOHiFlag, MOLoFlag; bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); SDValue CPIHi = @@ -1217,6 +1241,14 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + // 64-bit SVR4 ABI code is always position-independent. + // The actual address of the GlobalValue is stored in the TOC. + if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { + SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); + return DAG.getNode(PPCISD::TOC_ENTRY, JT->getDebugLoc(), MVT::i64, GA, + DAG.getRegister(PPC::X2, MVT::i64)); + } + unsigned MOHiFlag, MOLoFlag; bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); @@ -1441,7 +1473,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, MachinePointerInfo(), MVT::i32, false, false, 0); - return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(), + return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(), false, false, false, 0); } @@ -2408,7 +2440,7 @@ static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) { int Addr = C->getZExtValue(); if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero. - (Addr << 6 >> 6) != Addr) + SignExtend32<26>(Addr) != Addr) return 0; // Top 6 bits have to be sext of immediate. 
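This check, like the printS5ImmOperand and get_VSPLTI_elt changes earlier in these PowerPC hunks, replaces open-coded shift pairs such as (Value << 27) >> 27 and (Addr << 6 >> 6) with SignExtend32<N>; besides reading better, the shift form is not well defined for negative values in C++. A portable standalone model of what the helper computes, exercised the same way the patched code uses it (illustrative only, not LLVM's implementation):

#include <cassert>
#include <cstdint>

// Take the low B bits of V and sign-extend them to 32 bits.
template <unsigned B> int32_t signExtend32(uint32_t V) {
  int64_t Low  = V & ((1u << B) - 1);
  int64_t Sign = 1u << (B - 1);
  return (int32_t)((Low ^ Sign) - Sign);   // result always fits in 32 bits
}

int main() {
  // 5-bit field as in printS5ImmOperand: the encoding 0x1f prints as -1.
  assert(signExtend32<5>(0x1f) == -1);
  // "Fits in a 5-bit sext field" test: the value survives the round trip.
  assert(signExtend32<5>(-3) == -3 && signExtend32<5>(17) != 17);
  // 26-bit check from isBLACompatibleAddress: the branch-absolute target must
  // equal its own 26-bit sign extension.
  int32_t Addr = 0x01fffffc;                       // top bits are sign copies
  assert(signExtend32<26>((uint32_t)Addr) == Addr);
  assert(signExtend32<26>((uint32_t)-4) == -4);    // negative target in range
  return 0;
}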
return DAG.getConstant((int)C->getZExtValue() >> 2, @@ -2819,6 +2851,10 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, isTailCall, RegsToPass, Ops, NodeTys, PPCSubTarget); + // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls + if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) + Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); + // When performing tail call optimization the callee pops its arguments off // the stack. Account for this here so these bytes can be pushed back on in // PPCRegisterInfo::eliminateCallFramePseudoInstr. @@ -3116,14 +3152,6 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0], MemOpChains.size()); - // Set CR6 to true if this is a vararg call with floating args passed in - // registers. - if (isVarArg) { - SDValue SetCR(DAG.getMachineNode(seenFloatArg ? PPC::CRSET : PPC::CRUNSET, - dl, MVT::i32), 0); - RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR)); - } - // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; @@ -3133,6 +3161,18 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, InFlag = Chain.getValue(1); } + // Set CR bit 6 to true if this is a vararg call with floating args passed in + // registers. + if (isVarArg) { + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, InFlag }; + + Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, + dl, VTs, Ops, InFlag.getNode() ? 2 : 1); + + InFlag = Chain.getValue(1); + } + if (isTailCall) PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp, false, TailCallArguments); @@ -4126,7 +4166,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, unsigned TypeShiftAmt = i & (SplatBitSize-1); // vsplti + shl self. - if (SextVal == (i << (int)TypeShiftAmt)) { + if (SextVal == (int)((unsigned)i << TypeShiftAmt)) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0, @@ -4171,17 +4211,17 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, } // t = vsplti c, result = vsldoi t, t, 1 - if (SextVal == ((i << 8) | (i < 0 ? 0xFF : 0))) { + if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 2 - if (SextVal == ((i << 16) | (i < 0 ? 0xFFFF : 0))) { + if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 3 - if (SextVal == ((i << 24) | (i < 0 ? 0xFFFFFF : 0))) { + if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 
0xFFFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl); } diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index b0a013b..902b188 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -174,6 +174,10 @@ namespace llvm { /// operand #3 optional in flag TC_RETURN, + /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls + CR6SET, + CR6UNSET, + /// STD_32 - This is the STD instruction for use with "32-bit" registers. STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE, diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 39778a5..cfe71d17 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -29,6 +29,9 @@ def symbolLo64 : Operand<i64> { let PrintMethod = "printSymbolLo"; let EncoderMethod = "getLO16Encoding"; } +def tocentry : Operand<iPTR> { + let MIOperandInfo = (ops i32imm:$imm); +} //===----------------------------------------------------------------------===// // 64-bit transformation functions. @@ -296,12 +299,14 @@ def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs G8RC:$rT), (ins), let PPC970_Unit = 1 in { // FXU Operations. +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def LI8 : DForm_2_r0<14, (outs G8RC:$rD), (ins symbolLo64:$imm), "li $rD, $imm", IntSimple, [(set G8RC:$rD, immSExt16:$imm)]>; def LIS8 : DForm_2_r0<15, (outs G8RC:$rD), (ins symbolHi64:$imm), "lis $rD, $imm", IntSimple, [(set G8RC:$rD, imm16ShiftedSExt:$imm)]>; +} // Logical ops. def NAND8: XForm_6<31, 476, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), @@ -459,7 +464,7 @@ def EXTSW_32_64 : XForm_11<31, 986, (outs G8RC:$rA), (ins GPRC:$rS), let Defs = [CARRY] in { def SRADI : XSForm_1<31, 413, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH), - "sradi $rA, $rS, $SH", IntRotateD, + "sradi $rA, $rS, $SH", IntRotateDI, [(set G8RC:$rA, (sra G8RC:$rS, (i32 imm:$SH)))]>, isPPC64; } def CNTLZD : XForm_11<31, 58, (outs G8RC:$rA), (ins G8RC:$rS), @@ -482,7 +487,7 @@ def MULLD : XOForm_1<31, 233, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB), let isCommutable = 1 in { def RLDIMI : MDForm_1<30, 3, (outs G8RC:$rA), (ins G8RC:$rSi, G8RC:$rS, u6imm:$SH, u6imm:$MB), - "rldimi $rA, $rS, $SH, $MB", IntRotateD, + "rldimi $rA, $rS, $SH, $MB", IntRotateDI, []>, isPPC64, RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">; } @@ -494,11 +499,11 @@ def RLDCL : MDForm_1<30, 0, []>, isPPC64; def RLDICL : MDForm_1<30, 0, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MB), - "rldicl $rA, $rS, $SH, $MB", IntRotateD, + "rldicl $rA, $rS, $SH, $MB", IntRotateDI, []>, isPPC64; def RLDICR : MDForm_1<30, 1, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$ME), - "rldicr $rA, $rS, $SH, $ME", IntRotateD, + "rldicr $rA, $rS, $SH, $ME", IntRotateDI, []>, isPPC64; def RLWINM8 : MForm_2<21, @@ -541,19 +546,19 @@ def LWAX : XForm_1<31, 341, (outs G8RC:$rD), (ins memrr:$src), let mayLoad = 1 in def LHAU8 : DForm_1a<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp, ptr_rc:$rA), - "lhau $rD, $disp($rA)", LdStLoad, + "lhau $rD, $disp($rA)", LdStLHAU, []>, RegConstraint<"$rA = $ea_result">, NoEncode<"$ea_result">; // NO LWAU! 
def LHAUX8 : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "lhaux $rD, $addr", LdStLoad, + "lhaux $rD, $addr", LdStLHAU, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">; def LWAUX : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "lwaux $rD, $addr", LdStLoad, + "lwaux $rD, $addr", LdStLHAU, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">, isPPC64; } @@ -584,31 +589,31 @@ def LWZX8 : XForm_1<31, 23, (outs G8RC:$rD), (ins memrr:$src), // Update forms. let mayLoad = 1 in { def LBZU8 : DForm_1<35, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lbzu $rD, $addr", LdStLoad, + "lbzu $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LHZU8 : DForm_1<41, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lhzu $rD, $addr", LdStLoad, + "lhzu $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lwzu $rD, $addr", LdStLoad, + "lwzu $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LBZUX8 : XForm_1<31, 119, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "lbzux $rD, $addr", LdStLoad, + "lbzux $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">; def LHZUX8 : XForm_1<31, 331, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "lhzux $rD, $addr", LdStLoad, + "lhzux $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">; def LWZUX8 : XForm_1<31, 55, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "lwzux $rD, $addr", LdStLoad, + "lwzux $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">; } @@ -624,6 +629,14 @@ def LDtoc: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg), "", [(set G8RC:$rD, (PPCtoc_entry tglobaladdr:$disp, G8RC:$reg))]>, isPPC64; +def LDtocJTI: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg), + "", + [(set G8RC:$rD, + (PPCtoc_entry tjumptable:$disp, G8RC:$reg))]>, isPPC64; +def LDtocCPT: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg), + "", + [(set G8RC:$rD, + (PPCtoc_entry tconstpool:$disp, G8RC:$reg))]>, isPPC64; let hasSideEffects = 1 in { let RST = 2, DS_RA = 0 in // FIXME: Should be a pseudo. 
@@ -642,13 +655,13 @@ def LDX : XForm_1<31, 21, (outs G8RC:$rD), (ins memrr:$src), let mayLoad = 1 in def LDU : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrix:$addr), - "ldu $rD, $addr", LdStLD, + "ldu $rD, $addr", LdStLDU, []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64, NoEncode<"$ea_result">; def LDUX : XForm_1<31, 53, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "ldux $rD, $addr", LdStLoad, + "ldux $rD, $addr", LdStLDU, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">, isPPC64; } @@ -695,14 +708,14 @@ let PPC970_Unit = 2 in { def STBU8 : DForm_1a<38, (outs ptr_rc:$ea_res), (ins G8RC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "stbu $rS, $ptroff($ptrreg)", LdStStore, + "stbu $rS, $ptroff($ptrreg)", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_truncsti8 G8RC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "sthu $rS, $ptroff($ptrreg)", LdStStore, + "sthu $rS, $ptroff($ptrreg)", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, @@ -710,7 +723,7 @@ def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS, def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "stwu $rS, $ptroff($ptrreg)", LdStStore, + "stwu $rS, $ptroff($ptrreg)", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_truncsti32 G8RC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, @@ -718,7 +731,7 @@ def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS, def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS, s16immX4:$ptroff, ptr_rc:$ptrreg), - "stdu $rS, $ptroff($ptrreg)", LdStSTD, + "stdu $rS, $ptroff($ptrreg)", LdStSTDU, [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">, @@ -727,7 +740,7 @@ def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS, def STBUX8 : XForm_8<31, 247, (outs ptr_rc:$ea_res), (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), - "stbux $rS, $ptroff, $ptrreg", LdStStore, + "stbux $rS, $ptroff, $ptrreg", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_truncsti8 G8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, @@ -736,7 +749,7 @@ def STBUX8 : XForm_8<31, 247, (outs ptr_rc:$ea_res), def STHUX8 : XForm_8<31, 439, (outs ptr_rc:$ea_res), (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), - "sthux $rS, $ptroff, $ptrreg", LdStStore, + "sthux $rS, $ptroff, $ptrreg", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, @@ -745,7 +758,7 @@ def STHUX8 : XForm_8<31, 439, (outs ptr_rc:$ea_res), def STWUX8 : XForm_8<31, 183, (outs ptr_rc:$ea_res), (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), - "stwux $rS, $ptroff, $ptrreg", LdStStore, + "stwux $rS, $ptroff, $ptrreg", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_truncsti32 G8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, @@ -754,7 +767,7 @@ def STWUX8 : XForm_8<31, 183, (outs ptr_rc:$ea_res), def STDUX : XForm_8<31, 181, (outs ptr_rc:$ea_res), (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), - "stdux $rS, $ptroff, $ptrreg", LdStStore, + "stdux $rS, $ptroff, $ptrreg", LdStSTDU, [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 47f09dc..d2df664 100644 --- 
a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -54,7 +54,8 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer( const TargetMachine *TM, const ScheduleDAG *DAG) const { unsigned Directive = TM->getSubtarget<PPCSubtarget>().getDarwinDirective(); - if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2) { + if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 || + Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) { const InstrItineraryData *II = TM->getInstrItineraryData(); return new PPCScoreboardHazardRecognizer(II, DAG); } @@ -70,7 +71,8 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective(); // Most subtargets use a PPC970 recognizer. - if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2) { + if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 && + Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) { const TargetInstrInfo *TII = TM.getInstrInfo(); assert(TII && "No InstrInfo?"); diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index f57f0c9..a503908 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -123,9 +123,11 @@ def PPCnop : SDNode<"PPCISD::NOP", SDT_PPCnop, [SDNPInGlue, SDNPOutGlue]>; def PPCload : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>, - [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; + [SDNPHasChain, SDNPSideEffect, + SDNPInGlue, SDNPOutGlue]>; def PPCtoc_restore : SDNode<"PPCISD::TOC_RESTORE", SDTypeProfile<0, 0, []>, - [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; + [SDNPHasChain, SDNPSideEffect, + SDNPInGlue, SDNPOutGlue]>; def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def PPCbctrl_Darwin : SDNode<"PPCISD::BCTRL_Darwin", SDTNone, @@ -153,6 +155,12 @@ def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx, def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx, [SDNPHasChain, SDNPMayStore]>; +// Instructions to set/unset CR bit 6 for SVR4 vararg calls +def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + // Instructions to support atomic operations def PPClarx : SDNode<"PPCISD::LARX", SDT_PPClarx, [SDNPHasChain, SDNPMayLoad]>; @@ -330,9 +338,6 @@ def memrix : Operand<iPTR> { // memri where the imm is shifted 2 bits. let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg); let EncoderMethod = "getMemRIXEncoding"; } -def tocentry : Operand<iPTR> { - let MIOperandInfo = (ops i32imm:$imm); -} // PowerPC Predicate operand. 20 = (0<<5)|20 = always, CR0 is a dummy reg // that doesn't matter. @@ -673,7 +678,7 @@ def LWZ : DForm_1<32, (outs GPRC:$rD), (ins memri:$src), [(set GPRC:$rD, (load iaddr:$src))]>; def LFS : DForm_1<48, (outs F4RC:$rD), (ins memri:$src), - "lfs $rD, $src", LdStLFDU, + "lfs $rD, $src", LdStLFD, [(set F4RC:$rD, (load iaddr:$src))]>; def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src), "lfd $rD, $src", LdStLFD, @@ -683,32 +688,32 @@ def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src), // Unindexed (r+i) Loads with Update (preinc). 
let mayLoad = 1 in { def LBZU : DForm_1<35, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lbzu $rD, $addr", LdStLoad, + "lbzu $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LHAU : DForm_1<43, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lhau $rD, $addr", LdStLoad, + "lhau $rD, $addr", LdStLHAU, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LHZU : DForm_1<41, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lhzu $rD, $addr", LdStLoad, + "lhzu $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LWZU : DForm_1<33, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lwzu $rD, $addr", LdStLoad, + "lwzu $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LFSU : DForm_1<49, (outs F4RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lfs $rD, $addr", LdStLFDU, + "lfsu $rD, $addr", LdStLFDU, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lfd $rD, $addr", LdStLFD, + "lfdu $rD, $addr", LdStLFDU, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; @@ -716,37 +721,37 @@ def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), // Indexed (r+r) Loads with Update (preinc). def LBZUX : XForm_1<31, 119, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "lbzux $rD, $addr", LdStLoad, + "lbzux $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">; def LHAUX : XForm_1<31, 375, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "lhaux $rD, $addr", LdStLoad, + "lhaux $rD, $addr", LdStLHAU, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">; def LHZUX : XForm_1<31, 331, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "lhzux $rD, $addr", LdStLoad, + "lhzux $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">; def LWZUX : XForm_1<31, 55, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "lwzux $rD, $addr", LdStLoad, + "lwzux $rD, $addr", LdStLoadUpd, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">; def LFSUX : XForm_1<31, 567, (outs F4RC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "lfsux $rD, $addr", LdStLoad, + "lfsux $rD, $addr", LdStLFDU, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">; def LFDUX : XForm_1<31, 631, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memrr:$addr), - "lfdux $rD, $addr", LdStLoad, + "lfdux $rD, $addr", LdStLFDU, []>, RegConstraint<"$addr.offreg = $ea_result">, NoEncode<"$ea_result">; } @@ -778,10 +783,10 @@ def LWBRX : XForm_1<31, 534, (outs GPRC:$rD), (ins memrr:$src), [(set GPRC:$rD, (PPClbrx xoaddr:$src, i32))]>; def LFSX : XForm_25<31, 535, (outs F4RC:$frD), (ins memrr:$src), - "lfsx $frD, $src", LdStLFDU, + "lfsx $frD, $src", LdStLFD, [(set F4RC:$frD, (load xaddr:$src))]>; def LFDX : XForm_25<31, 599, (outs F8RC:$frD), (ins memrr:$src), - "lfdx $frD, $src", LdStLFDU, + "lfdx $frD, $src", LdStLFD, [(set F8RC:$frD, (load xaddr:$src))]>; } @@ -801,10 +806,10 @@ def STW : DForm_1<36, (outs), (ins GPRC:$rS, memri:$src), "stw $rS, $src", LdStStore, [(store GPRC:$rS, iaddr:$src)]>; def STFS : DForm_1<52, (outs), (ins F4RC:$rS, memri:$dst), - "stfs $rS, $dst", LdStUX, + "stfs $rS, $dst", LdStSTFD, [(store F4RC:$rS, iaddr:$dst)]>; def STFD : DForm_1<54, (outs), 
(ins F8RC:$rS, memri:$dst), - "stfd $rS, $dst", LdStUX, + "stfd $rS, $dst", LdStSTFD, [(store F8RC:$rS, iaddr:$dst)]>; } @@ -812,33 +817,33 @@ def STFD : DForm_1<54, (outs), (ins F8RC:$rS, memri:$dst), let PPC970_Unit = 2 in { def STBU : DForm_1a<39, (outs ptr_rc:$ea_res), (ins GPRC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "stbu $rS, $ptroff($ptrreg)", LdStStore, + "stbu $rS, $ptroff($ptrreg)", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_truncsti8 GPRC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; def STHU : DForm_1a<45, (outs ptr_rc:$ea_res), (ins GPRC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "sthu $rS, $ptroff($ptrreg)", LdStStore, + "sthu $rS, $ptroff($ptrreg)", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_truncsti16 GPRC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; def STWU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins GPRC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "stwu $rS, $ptroff($ptrreg)", LdStStore, + "stwu $rS, $ptroff($ptrreg)", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_store GPRC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; def STFSU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F4RC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "stfsu $rS, $ptroff($ptrreg)", LdStStore, + "stfsu $rS, $ptroff($ptrreg)", LdStSTFDU, [(set ptr_rc:$ea_res, (pre_store F4RC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; def STFDU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F8RC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "stfdu $rS, $ptroff($ptrreg)", LdStStore, + "stfdu $rS, $ptroff($ptrreg)", LdStSTFDU, [(set ptr_rc:$ea_res, (pre_store F8RC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; @@ -863,7 +868,7 @@ def STWX : XForm_8<31, 151, (outs), (ins GPRC:$rS, memrr:$dst), def STBUX : XForm_8<31, 247, (outs ptr_rc:$ea_res), (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), - "stbux $rS, $ptroff, $ptrreg", LdStStore, + "stbux $rS, $ptroff, $ptrreg", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_truncsti8 GPRC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, @@ -872,7 +877,7 @@ def STBUX : XForm_8<31, 247, (outs ptr_rc:$ea_res), def STHUX : XForm_8<31, 439, (outs ptr_rc:$ea_res), (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), - "sthux $rS, $ptroff, $ptrreg", LdStStore, + "sthux $rS, $ptroff, $ptrreg", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_truncsti16 GPRC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, @@ -881,7 +886,7 @@ def STHUX : XForm_8<31, 439, (outs ptr_rc:$ea_res), def STWUX : XForm_8<31, 183, (outs ptr_rc:$ea_res), (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), - "stwux $rS, $ptroff, $ptrreg", LdStStore, + "stwux $rS, $ptroff, $ptrreg", LdStStoreUpd, [(set ptr_rc:$ea_res, (pre_store GPRC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, @@ -889,7 +894,7 @@ def STWUX : XForm_8<31, 183, (outs ptr_rc:$ea_res), def STFSUX : XForm_8<31, 695, (outs ptr_rc:$ea_res), (ins F4RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), - "stfsux $rS, $ptroff, $ptrreg", LdStStore, + "stfsux $rS, $ptroff, $ptrreg", LdStSTFDU, [(set ptr_rc:$ea_res, (pre_store F4RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, @@ -897,7 +902,7 @@ def STFSUX : XForm_8<31, 695, (outs ptr_rc:$ea_res), def STFDUX : XForm_8<31, 759, (outs ptr_rc:$ea_res), (ins F8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), - "stfdux 
$rS, $ptroff, $ptrreg", LdStStore, + "stfdux $rS, $ptroff, $ptrreg", LdStSTFDU, [(set ptr_rc:$ea_res, (pre_store F8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, @@ -913,14 +918,14 @@ def STWBRX: XForm_8<31, 662, (outs), (ins GPRC:$rS, memrr:$dst), PPC970_DGroup_Cracked; def STFIWX: XForm_28<31, 983, (outs), (ins F8RC:$frS, memrr:$dst), - "stfiwx $frS, $dst", LdStUX, + "stfiwx $frS, $dst", LdStSTFD, [(PPCstfiwx F8RC:$frS, xoaddr:$dst)]>; def STFSX : XForm_28<31, 663, (outs), (ins F4RC:$frS, memrr:$dst), - "stfsx $frS, $dst", LdStUX, + "stfsx $frS, $dst", LdStSTFD, [(store F4RC:$frS, xaddr:$dst)]>; def STFDX : XForm_28<31, 727, (outs), (ins F8RC:$frS, memrr:$dst), - "stfdx $frS, $dst", LdStUX, + "stfdx $frS, $dst", LdStSTFD, [(store F8RC:$frS, xaddr:$dst)]>; } @@ -964,7 +969,7 @@ def SUBFIC : DForm_2< 8, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm), [(set GPRC:$rD, (subc immSExt16:$imm, GPRC:$rA))]>; } -let isReMaterializable = 1 in { +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def LI : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm), "li $rD, $imm", IntSimple, [(set GPRC:$rD, immSExt16:$imm)]>; @@ -1143,6 +1148,16 @@ def CRUNSET: XLForm_1_ext<19, 193, (outs CRBITRC:$dst), (ins), "crxor $dst, $dst, $dst", BrCR, []>; +let Defs = [CR1EQ], CRD = 6 in { +def CR6SET : XLForm_1_ext<19, 289, (outs), (ins), + "creqv 6, 6, 6", BrCR, + [(PPCcr6set)]>; + +def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins), + "crxor 6, 6, 6", BrCR, + [(PPCcr6unset)]>; +} + // XFX-Form instructions. Instructions that deal with SPRs. // let Uses = [CTR] in { @@ -1233,7 +1248,7 @@ let Uses = [RM] in { PPC970_DGroup_Single, PPC970_Unit_FPU; def FADDrtz: AForm_2<63, 21, (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB), - "fadd $FRT, $FRA, $FRB", FPGeneral, + "fadd $FRT, $FRA, $FRB", FPAddSub, [(set F8RC:$FRT, (PPCfaddrtz F8RC:$FRA, F8RC:$FRB))]>, PPC970_DGroup_Single, PPC970_Unit_FPU; } @@ -1364,7 +1379,7 @@ def FSELS : AForm_1<63, 23, let Uses = [RM] in { def FADD : AForm_2<63, 21, (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB), - "fadd $FRT, $FRA, $FRB", FPGeneral, + "fadd $FRT, $FRA, $FRB", FPAddSub, [(set F8RC:$FRT, (fadd F8RC:$FRA, F8RC:$FRB))]>; def FADDS : AForm_2<59, 21, (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB), @@ -1388,7 +1403,7 @@ let Uses = [RM] in { [(set F4RC:$FRT, (fmul F4RC:$FRA, F4RC:$FRB))]>; def FSUB : AForm_2<63, 20, (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB), - "fsub $FRT, $FRA, $FRB", FPGeneral, + "fsub $FRT, $FRA, $FRB", FPAddSub, [(set F8RC:$FRT, (fsub F8RC:$FRA, F8RC:$FRB))]>; def FSUBS : AForm_2<59, 20, (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB), diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td index 6a6ccb9..660c0c3 100644 --- a/lib/Target/PowerPC/PPCSchedule.td +++ b/lib/Target/PowerPC/PPCSchedule.td @@ -40,6 +40,7 @@ def IntMulHWU : InstrItinClass; def IntMulLI : InstrItinClass; def IntRFID : InstrItinClass; def IntRotateD : InstrItinClass; +def IntRotateDI : InstrItinClass; def IntRotate : InstrItinClass; def IntShift : InstrItinClass; def IntTrapD : InstrItinClass; @@ -52,15 +53,18 @@ def LdStDCBA : InstrItinClass; def LdStDCBF : InstrItinClass; def LdStDCBI : InstrItinClass; def LdStLoad : InstrItinClass; +def LdStLoadUpd : InstrItinClass; def LdStStore : InstrItinClass; +def LdStStoreUpd : InstrItinClass; def LdStDSS : InstrItinClass; def LdStICBI : InstrItinClass; -def LdStUX : InstrItinClass; def LdStLD : InstrItinClass; +def LdStLDU : InstrItinClass; def LdStLDARX : 
InstrItinClass; def LdStLFD : InstrItinClass; def LdStLFDU : InstrItinClass; def LdStLHA : InstrItinClass; +def LdStLHAU : InstrItinClass; def LdStLMW : InstrItinClass; def LdStLVecX : InstrItinClass; def LdStLWA : InstrItinClass; @@ -69,6 +73,9 @@ def LdStSLBIA : InstrItinClass; def LdStSLBIE : InstrItinClass; def LdStSTD : InstrItinClass; def LdStSTDCX : InstrItinClass; +def LdStSTDU : InstrItinClass; +def LdStSTFD : InstrItinClass; +def LdStSTFDU : InstrItinClass; def LdStSTVEBX : InstrItinClass; def LdStSTWCX : InstrItinClass; def LdStSync : InstrItinClass; @@ -86,6 +93,7 @@ def SprMTSRIN : InstrItinClass; def SprRFI : InstrItinClass; def SprSC : InstrItinClass; def FPGeneral : InstrItinClass; +def FPAddSub : InstrItinClass; def FPCompare : InstrItinClass; def FPDivD : InstrItinClass; def FPDivS : InstrItinClass; @@ -110,6 +118,8 @@ include "PPCScheduleG4.td" include "PPCScheduleG4Plus.td" include "PPCScheduleG5.td" include "PPCScheduleA2.td" +include "PPCScheduleE500mc.td" +include "PPCScheduleE5500.td" //===----------------------------------------------------------------------===// // Instruction to itinerary class map - When add new opcodes to the supported @@ -171,7 +181,7 @@ include "PPCScheduleA2.td" // extsh IntSimple // extsw IntSimple // fabs FPGeneral -// fadd FPGeneral +// fadd FPAddSub // fadds FPGeneral // fcfid FPGeneral // fcmpo FPCompare @@ -201,35 +211,35 @@ include "PPCScheduleA2.td" // fsel FPGeneral // fsqrt FPSqrt // fsqrts FPSqrt -// fsub FPGeneral +// fsub FPAddSub // fsubs FPGeneral // icbi LdStICBI // isync SprISYNC // lbz LdStLoad -// lbzu LdStLoad -// lbzux LdStUX +// lbzu LdStLoadUpd +// lbzux LdStLoadUpd // lbzx LdStLoad // ld LdStLD // ldarx LdStLDARX -// ldu LdStLD -// ldux LdStLD +// ldu LdStLDU +// ldux LdStLDU // ldx LdStLD // lfd LdStLFD // lfdu LdStLFDU // lfdux LdStLFDU -// lfdx LdStLFDU -// lfs LdStLFDU +// lfdx LdStLFD +// lfs LdStLFD // lfsu LdStLFDU // lfsux LdStLFDU -// lfsx LdStLFDU +// lfsx LdStLFD // lha LdStLHA -// lhau LdStLHA -// lhaux LdStLHA +// lhau LdStLHAU +// lhaux LdStLHAU // lhax LdStLHA // lhbrx LdStLoad // lhz LdStLoad -// lhzu LdStLoad -// lhzux LdStUX +// lhzu LdStLoadUpd +// lhzux LdStLoadUpd // lhzx LdStLoad // lmw LdStLMW // lswi LdStLMW @@ -243,12 +253,12 @@ include "PPCScheduleA2.td" // lvxl LdStLVecX // lwa LdStLWA // lwarx LdStLWARX -// lwaux LdStLHA +// lwaux LdStLHAU // lwax LdStLHA // lwbrx LdStLoad // lwz LdStLoad -// lwzu LdStLoad -// lwzux LdStUX +// lwzu LdStLoadUpd +// lwzux LdStLoadUpd // lwzx LdStLoad // mcrf BrMCR // mcrfs FPGeneral @@ -292,10 +302,10 @@ include "PPCScheduleA2.td" // rfid IntRFID // rldcl IntRotateD // rldcr IntRotateD -// rldic IntRotateD -// rldicl IntRotateD -// rldicr IntRotateD -// rldimi IntRotateD +// rldic IntRotateDI +// rldicl IntRotateDI +// rldicr IntRotateDI +// rldimi IntRotateDI // rlwimi IntRotate // rlwinm IntGeneral // rlwnm IntGeneral @@ -305,33 +315,33 @@ include "PPCScheduleA2.td" // sld IntRotateD // slw IntGeneral // srad IntRotateD -// sradi IntRotateD +// sradi IntRotateDI // sraw IntShift // srawi IntShift // srd IntRotateD // srw IntGeneral // stb LdStStore -// stbu LdStStore -// stbux LdStStore +// stbu LdStStoreUpd +// stbux LdStStoreUpd // stbx LdStStore // std LdStSTD // stdcx. 
LdStSTDCX -// stdu LdStSTD -// stdux LdStSTD +// stdu LdStSTDU +// stdux LdStSTDU // stdx LdStSTD -// stfd LdStUX -// stfdu LdStUX -// stfdux LdStUX -// stfdx LdStUX -// stfiwx LdStUX -// stfs LdStUX -// stfsu LdStUX -// stfsux LdStUX -// stfsx LdStUX +// stfd LdStSTFD +// stfdu LdStSTFDU +// stfdux LdStSTFDU +// stfdx LdStSTFD +// stfiwx LdStSTFD +// stfs LdStSTFD +// stfsu LdStSTFDU +// stfsux LdStSTFDU +// stfsx LdStSTFD // sth LdStStore // sthbrx LdStStore -// sthu LdStStore -// sthux LdStStore +// sthu LdStStoreUpd +// sthux LdStStoreUpd // sthx LdStStore // stmw LdStLMW // stswi LdStLMW @@ -344,8 +354,8 @@ include "PPCScheduleA2.td" // stw LdStStore // stwbrx LdStStore // stwcx. LdStSTWCX -// stwu LdStStore -// stwux LdStStore +// stwu LdStStoreUpd +// stwux LdStStoreUpd // stwx LdStStore // subf IntGeneral // subfc IntGeneral diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td index cd0fb70..37b6eac 100644 --- a/lib/Target/PowerPC/PPCSchedule440.td +++ b/lib/Target/PowerPC/PPCSchedule440.td @@ -288,6 +288,15 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<2, [LWB]>], [9, 5], [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLoadUpd , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1, DISS2]>, + InstrStage<1, [LRACC]>, + InstrStage<1, [AGEN]>, + InstrStage<1, [CRD]>, + InstrStage<2, [LWB]>], + [9, 5], + [GPR_Bypass, GPR_Bypass]>, InstrItinData<LdStStore , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1, DISS2]>, @@ -297,6 +306,15 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<2, [LWB]>], [8, 5], [NoBypass, GPR_Bypass]>, + InstrItinData<LdStStoreUpd, [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1, DISS2]>, + InstrStage<1, [LRACC]>, + InstrStage<1, [AGEN]>, + InstrStage<1, [CRD]>, + InstrStage<2, [LWB]>], + [8, 5], + [NoBypass, GPR_Bypass]>, InstrItinData<LdStICBI , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1, DISS2]>, @@ -306,7 +324,7 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<1, [LWB]>], [8, 5], [NoBypass, GPR_Bypass]>, - InstrItinData<LdStUX , [InstrStage<1, [IFTH1, IFTH2]>, + InstrItinData<LdStSTFD , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1, DISS2]>, InstrStage<1, [LRACC]>, @@ -315,6 +333,15 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<1, [LWB]>], [8, 5, 5], [NoBypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStSTFDU , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1, DISS2]>, + InstrStage<1, [LRACC]>, + InstrStage<1, [AGEN]>, + InstrStage<1, [CRD]>, + InstrStage<1, [LWB]>], + [8, 5, 5], + [NoBypass, GPR_Bypass, GPR_Bypass]>, InstrItinData<LdStLFD , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1, DISS2]>, @@ -342,6 +369,15 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<1, [LWB]>], [8, 5], [NoBypass, GPR_Bypass]>, + InstrItinData<LdStLHAU , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1, DISS2]>, + InstrStage<1, [LRACC]>, + InstrStage<1, [AGEN]>, + InstrStage<1, [CRD]>, + InstrStage<1, [LWB]>], + [8, 5], + [NoBypass, GPR_Bypass]>, InstrItinData<LdStLMW , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1, DISS2]>, @@ -371,6 +407,15 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<2, [LWB]>], [8, 5], [NoBypass, GPR_Bypass]>, + 
InstrItinData<LdStSTDU , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1, DISS2]>, + InstrStage<1, [LRACC]>, + InstrStage<1, [AGEN]>, + InstrStage<1, [CRD]>, + InstrStage<2, [LWB]>], + [8, 5], + [NoBypass, GPR_Bypass]>, InstrItinData<LdStSTDCX , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1]>, @@ -537,6 +582,19 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<1, [FWB]>], [10, 4, 4], [FPR_Bypass, FPR_Bypass, FPR_Bypass]>, + InstrItinData<FPAddSub , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1, DISS2]>, + InstrStage<1, [FRACC]>, + InstrStage<1, [FEXE1]>, + InstrStage<1, [FEXE2]>, + InstrStage<1, [FEXE3]>, + InstrStage<1, [FEXE4]>, + InstrStage<1, [FEXE5]>, + InstrStage<1, [FEXE6]>, + InstrStage<1, [FWB]>], + [10, 4, 4], + [FPR_Bypass, FPR_Bypass, FPR_Bypass]>, InstrItinData<FPCompare , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1, DISS2]>, diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td index 4d4a5d0..ba63b5c 100644 --- a/lib/Target/PowerPC/PPCScheduleA2.td +++ b/lib/Target/PowerPC/PPCScheduleA2.td @@ -181,6 +181,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [10, 7, 7], [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntRotateDI , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, InstrItinData<IntShift , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -302,7 +313,18 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [14, 7], [GPR_Bypass, GPR_Bypass]>, - InstrItinData<LdStLD , [InstrStage<4, + InstrItinData<LdStLoadUpd , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7], + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLDU , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, IU4_4, IU4_5, IU4_6, IU4_7]>, @@ -324,6 +346,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [13, 7], [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStStoreUpd, [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [13, 7], + [GPR_Bypass, GPR_Bypass]>, InstrItinData<LdStICBI , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -335,7 +368,7 @@ def PPCA2Itineraries : ProcessorItineraries< 
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [14, 7], [NoBypass, GPR_Bypass]>, - InstrItinData<LdStUX , [InstrStage<4, + InstrItinData<LdStSTFD , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, IU4_4, IU4_5, IU4_6, IU4_7]>, @@ -346,6 +379,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [14, 7, 7], [NoBypass, FPR_Bypass, FPR_Bypass]>, + InstrItinData<LdStSTFDU , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7, 7], + [NoBypass, FPR_Bypass, FPR_Bypass]>, InstrItinData<LdStLFD , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -379,6 +423,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [14, 7], [NoBypass, GPR_Bypass]>, + InstrItinData<LdStLHAU , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7], + [NoBypass, GPR_Bypass]>, InstrItinData<LdStLMW , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -412,6 +467,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [13, 7], [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStSTDU , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [13, 7], + [GPR_Bypass, GPR_Bypass]>, InstrItinData<LdStSTDCX , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -593,6 +659,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>], [15, 7, 7], [FPR_Bypass, FPR_Bypass, FPR_Bypass]>, + InstrItinData<FPAddSub , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>, + InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>, + InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>, + InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>], + [15, 7, 7], + [FPR_Bypass, FPR_Bypass, FPR_Bypass]>, InstrItinData<FPCompare , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td new file mode 100644 index 0000000..9bb779a --- /dev/null +++ b/lib/Target/PowerPC/PPCScheduleE500mc.td @@ -0,0 +1,265 @@ +//===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This 
file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the Freescale e500mc 32-bit +// Power processor. +// +// All information is derived from the "e500mc Core Reference Manual", +// Freescale Document Number E500MCRM, Rev. 1, 03/2012. +// +//===----------------------------------------------------------------------===// +// Relevant functional units in the Freescale e500mc core: +// +// * Decode & Dispatch +// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue +// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ). +def DIS0 : FuncUnit; // Dispatch stage - insn 1 +def DIS1 : FuncUnit; // Dispatch stage - insn 2 + +// * Execute +// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX. +// Some instructions can only execute in SFX0 but not SFX1. +// The CFX has a bypass path, allowing non-divide instructions to execute +// while a divide instruction is executed. +def SFX0 : FuncUnit; // Simple unit 0 +def SFX1 : FuncUnit; // Simple unit 1 +def BU : FuncUnit; // Branch unit +def CFX_DivBypass + : FuncUnit; // CFX divide bypass path +def CFX_0 : FuncUnit; // CFX pipeline +def LSU_0 : FuncUnit; // LSU pipeline +def FPU_0 : FuncUnit; // FPU pipeline + +def PPCE500mcItineraries : ProcessorItineraries< + [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, LSU_0, FPU_0], + [CR_Bypass, GPR_Bypass, FPR_Bypass], [ + InstrItinData<IntSimple , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [4, 1, 1], // Latency = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntGeneral , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [4, 1, 1], // Latency = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntCompare , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [5, 1, 1], // Latency = 1 or 2 + [CR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntDivW , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0], 0>, + InstrStage<14, [CFX_DivBypass]>], + [17, 1, 1], // Latency=4..35, Repeat= 4..35 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMFFS , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<8, [FPU_0]>], + [11], // Latency = 8 + [FPR_Bypass]>, + InstrItinData<IntMTFSB0 , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<8, [FPU_0]>], + [11, 1, 1], // Latency = 8 + [NoBypass, NoBypass, NoBypass]>, + InstrItinData<IntMulHW , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0]>], + [7, 1, 1], // Latency = 4, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMulHWU , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0]>], + [7, 1, 1], // Latency = 4, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMulLI , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0]>], + [7, 1, 1], // Latency = 4, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntRotate , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [4, 1, 1], // Latency = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntShift , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [4, 1, 1], // Latency = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntTrapW , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<2, [SFX0]>], + [5, 1], // Latency = 2, Repeat rate = 2 + [GPR_Bypass, GPR_Bypass]>, + 
InstrItinData<BrB , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [BU]>], + [4, 1], // Latency = 1 + [NoBypass, GPR_Bypass]>, + InstrItinData<BrCR , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [BU]>], + [4, 1, 1], // Latency = 1 + [CR_Bypass, CR_Bypass, CR_Bypass]>, + InstrItinData<BrMCR , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [BU]>], + [4, 1], // Latency = 1 + [CR_Bypass, CR_Bypass]>, + InstrItinData<BrMCRX , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [4, 1, 1], // Latency = 1 + [CR_Bypass, GPR_Bypass]>, + InstrItinData<LdStDCBA , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1], // Latency = 3, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStDCBF , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1], // Latency = 3 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStDCBI , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1], // Latency = 3 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLoad , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1], // Latency = 3 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1], // Latency = 3 + [GPR_Bypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStStore , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStICBI , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSTFD , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1, 1], // Latency = 3 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStSTFDU , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1, 1], // Latency = 3 + [GPR_Bypass, GPR_Bypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStLFD , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 1, 1], // Latency = 4 + [FPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLFDU , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 0>, + InstrStage<1, [LSU_0]>], + [7, 1, 1], // Latency = 4 + [FPR_Bypass, GPR_Bypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStLHA , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1], // Latency = 3 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLHAU , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1], // Latency = 3 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLMW , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 1], // Latency = r+3 + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStLWARX , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<3, [LSU_0]>], + [6, 1, 1], // Latency = 3, Repeat rate = 3 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStSTWCX , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSync , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>]>, + InstrItinData<SprMFSR , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<4, [SFX0]>], + [7, 1], + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<SprMTMSR , 
[InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [SFX0, SFX1]>],
+ [5, 1], // Latency = 2, Repeat rate = 4
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMTSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0]>],
+ [5, 1],
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<SprTLBSYNC , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0], 0>]>,
+ InstrItinData<SprMFCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<5, [SFX0]>],
+ [8, 1],
+ [GPR_Bypass, CR_Bypass]>,
+ InstrItinData<SprMFMSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [SFX0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMFSPR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [GPR_Bypass, CR_Bypass]>,
+ InstrItinData<SprMFTB , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [SFX0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<SprMTSPR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [CR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMTSRIN , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0]>],
+ [4, 1],
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<FPGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [FPU_0]>],
+ [11, 1, 1], // Latency = 8, Repeat rate = 2
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [FPU_0]>],
+ [13, 1, 1], // Latency = 10, Repeat rate = 4
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPCompare , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [FPU_0]>],
+ [11, 1, 1], // Latency = 8, Repeat rate = 2
+ [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPDivD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<68, [FPU_0]>],
+ [71, 1, 1], // Latency = 68, Repeat rate = 68
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPDivS , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<38, [FPU_0]>],
+ [41, 1, 1], // Latency = 38, Repeat rate = 38
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPFused , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [FPU_0]>],
+ [13, 1, 1, 1], // Latency = 10, Repeat rate = 4
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPRes , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<38, [FPU_0]>],
+ [41, 1], // Latency = 38, Repeat rate = 38
+ [FPR_Bypass, FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e500mc machine model for scheduling and other instruction cost heuristics.
+
+def PPCE500mcModel : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+ let LoadLatency = 5; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+
+ let Itineraries = PPCE500mcItineraries;
+}
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
new file mode 100644
index 0000000..d7e11ac
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -0,0 +1,309 @@
+//===-- PPCScheduleE5500.td - e5500 Scheduling Defs --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the Freescale e5500 64-bit +// Power processor. +// +// All information is derived from the "e5500 Core Reference Manual", +// Freescale Document Number e5500RM, Rev. 1, 03/2012. +// +//===----------------------------------------------------------------------===// +// Relevant functional units in the Freescale e5500 core +// (These are the same as for the e500mc) +// +// * Decode & Dispatch +// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue +// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ). +// def DIS0 : FuncUnit; +// def DIS1 : FuncUnit; + +// * Execute +// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX. +// The CFX has a bypass path, allowing non-divide instructions to execute +// while a divide instruction is being executed. +// def SFX0 : FuncUnit; // Simple unit 0 +// def SFX1 : FuncUnit; // Simple unit 1 +// def BU : FuncUnit; // Branch unit +// def CFX_DivBypass +// : FuncUnit; // CFX divide bypass path +// def CFX_0 : FuncUnit; // CFX pipeline stage 0 + +def CFX_1 : FuncUnit; // CFX pipeline stage 1 + +// def LSU_0 : FuncUnit; // LSU pipeline +// def FPU_0 : FuncUnit; // FPU pipeline + + +def PPCE5500Itineraries : ProcessorItineraries< + [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, CFX_1, + LSU_0, FPU_0], + [CR_Bypass, GPR_Bypass, FPR_Bypass], [ + InstrItinData<IntSimple , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [5, 2, 2], // Latency = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntGeneral , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [5, 2, 2], // Latency = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntCompare , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [6, 2, 2], // Latency = 1 or 2 + [CR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntDivD , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0], 0>, + InstrStage<26, [CFX_DivBypass]>], + [30, 2, 2], // Latency= 4..26, Repeat rate= 4..26 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntDivW , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0], 0>, + InstrStage<16, [CFX_DivBypass]>], + [20, 2, 2], // Latency= 4..16, Repeat rate= 4..16 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMFFS , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [FPU_0]>], + [11], // Latency = 7, Repeat rate = 1 + [FPR_Bypass]>, + InstrItinData<IntMTFSB0 , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<7, [FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 7 + [NoBypass, NoBypass, NoBypass]>, + InstrItinData<IntMulHD , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0], 0>, + InstrStage<2, [CFX_1]>], + [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMulHW , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0], 0>, + InstrStage<1, [CFX_1]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMulHWU , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0], 0>, + InstrStage<1, [CFX_1]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMulLI , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0], 0>, + InstrStage<2, [CFX_1]>], + [8, 2, 2], // Latency = 4 or 5, Repeat = 2 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + 
InstrItinData<IntRotate , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [5, 2, 2], // Latency = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntRotateD , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<2, [SFX0, SFX1]>], + [6, 2, 2], // Latency = 2, Repeat rate = 2 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntRotateDI , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [5, 2, 2], // Latency = 1, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntShift , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<2, [SFX0, SFX1]>], + [6, 2, 2], // Latency = 2, Repeat rate = 2 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntTrapW , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<2, [SFX0]>], + [6, 2], // Latency = 2, Repeat rate = 2 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<BrB , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [BU]>], + [5, 2], // Latency = 1 + [NoBypass, GPR_Bypass]>, + InstrItinData<BrCR , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [BU]>], + [5, 2, 2], // Latency = 1 + [CR_Bypass, CR_Bypass, CR_Bypass]>, + InstrItinData<BrMCR , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [BU]>], + [5, 2], // Latency = 1 + [CR_Bypass, CR_Bypass]>, + InstrItinData<BrMCRX , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0]>], + [5, 2, 2], // Latency = 1 + [CR_Bypass, GPR_Bypass]>, + InstrItinData<LdStDCBA , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStDCBF , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStDCBI , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLoad , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStLD , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLDARX , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<3, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 3 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLDU , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStStore , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStICBI , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSTFD , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2, 2], // Latency = 3, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStSTFDU , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 
0>, + InstrStage<1, [LSU_0]>], + [7, 2, 2], // Latency = 3, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStLFD , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [FPR_Bypass, GPR_Bypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStLFDU , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 0>, + InstrStage<1, [LSU_0]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [FPR_Bypass, GPR_Bypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStLHA , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLHAU , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [GPR_Bypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStLMW , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<4, [LSU_0]>], + [8, 2], // Latency = r+3, Repeat rate = r+3 + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStLWARX , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<3, [LSU_0]>], + [7, 2, 2], // Latency = 3, Repeat rate = 3 + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStSTD , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSTDCX , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSTDU , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<LdStSTWCX , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSync , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0]>]>, + InstrItinData<SprMTMSR , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<2, [CFX_0]>], + [6, 2], // Latency = 2, Repeat rate = 4 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<SprTLBSYNC , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [LSU_0], 0>]>, + InstrItinData<SprMFCR , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<5, [CFX_0]>], + [9, 2], // Latency = 5, Repeat rate = 5 + [GPR_Bypass, CR_Bypass]>, + InstrItinData<SprMFMSR , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<4, [SFX0]>], + [8, 2], // Latency = 4, Repeat rate = 4 + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<SprMFSPR , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [CFX_0]>], + [5], // Latency = 1, Repeat rate = 1 + [GPR_Bypass]>, + InstrItinData<SprMFTB , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<4, [CFX_0]>], + [8, 2], // Latency = 4, Repeat rate = 4 + [NoBypass, GPR_Bypass]>, + InstrItinData<SprMTSPR , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [SFX0, SFX1]>], + [5], // Latency = 1, Repeat rate = 1 + [GPR_Bypass]>, + InstrItinData<FPGeneral , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 1 + [FPR_Bypass, FPR_Bypass, FPR_Bypass]>, + InstrItinData<FPAddSub , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 1 + [FPR_Bypass, FPR_Bypass, FPR_Bypass]>, + InstrItinData<FPCompare , [InstrStage<1, [DIS0, DIS1], 0>, + InstrStage<1, [FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 1 + [CR_Bypass, FPR_Bypass, FPR_Bypass]>, + 
InstrItinData<FPDivD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<31, [FPU_0]>],
+ [39, 2, 2], // Latency = 35, Repeat rate = 31
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPDivS , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<16, [FPU_0]>],
+ [24, 2, 2], // Latency = 20, Repeat rate = 16
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPFused , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [FPU_0]>],
+ [11, 2, 2, 2], // Latency = 7, Repeat rate = 1
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPRes , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [FPU_0]>],
+ [12, 2], // Latency = 8, Repeat rate = 2
+ [FPR_Bypass, FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e5500 machine model for scheduling and other instruction cost heuristics.
+
+def PPCE5500Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+ let LoadLatency = 6; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+
+ let Itineraries = PPCE5500Itineraries;
+}
diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td
index 61e89ed..72a0a39 100644
--- a/lib/Target/PowerPC/PPCScheduleG3.td
+++ b/lib/Target/PowerPC/PPCScheduleG3.td
@@ -34,12 +34,16 @@ def G3Itineraries : ProcessorItineraries<
 InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
 InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>,
 InstrItinData<LdStLoad , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,
 InstrItinData<LdStStore , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,
 InstrItinData<LdStICBI , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTFD , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTFDU , [InstrStage<2, [SLU]>]>,
 InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>,
 InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>,
 InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLHAU , [InstrStage<2, [SLU]>]>,
 InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>,
 InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
 InstrItinData<LdStSTWCX , [InstrStage<8, [SLU]>]>,
@@ -58,6 +62,7 @@ def G3Itineraries : ProcessorItineraries<
 InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>,
 InstrItinData<SprSC , [InstrStage<2, [SRU]>]>,
 InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [FPU1]>]>,
 InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>,
 InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>,
 InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td
index e19ddfa..fc9120d 100644
--- a/lib/Target/PowerPC/PPCScheduleG4.td
+++ b/lib/Target/PowerPC/PPCScheduleG4.td
@@ -33,13 +33,17 @@ def G4Itineraries : ProcessorItineraries<
 InstrItinData<LdStDCBF , [InstrStage<2, [SLU]>]>,
 InstrItinData<LdStDCBI , [InstrStage<2, [SLU]>]>,
 InstrItinData<LdStLoad , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,
 InstrItinData<LdStStore , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,
 InstrItinData<LdStDSS , [InstrStage<2, [SLU]>]>,
 InstrItinData<LdStICBI , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTFD , [InstrStage<2, [SLU]>]>,
+
InstrItinData<LdStSTFDU , [InstrStage<2, [SLU]>]>, InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>, InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>, InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLHAU , [InstrStage<2, [SLU]>]>, InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>, InstrItinData<LdStLVecX , [InstrStage<2, [SLU]>]>, InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>, @@ -60,6 +64,7 @@ def G4Itineraries : ProcessorItineraries< InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>, InstrItinData<SprSC , [InstrStage<2, [SRU]>]>, InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>, + InstrItinData<FPAddSub , [InstrStage<1, [FPU1]>]>, InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>, InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>, InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>, diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td index e7446cb..a4e82ce 100644 --- a/lib/Target/PowerPC/PPCScheduleG4Plus.td +++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td @@ -36,19 +36,24 @@ def G4PlusItineraries : ProcessorItineraries< InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>, InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>, InstrItinData<LdStLoad , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>, InstrItinData<LdStStore , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>, InstrItinData<LdStDSS , [InstrStage<3, [SLU]>]>, InstrItinData<LdStICBI , [InstrStage<3, [IU2]>]>, - InstrItinData<LdStUX , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStSTFD , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStSTFDU , [InstrStage<3, [SLU]>]>, InstrItinData<LdStLFD , [InstrStage<4, [SLU]>]>, InstrItinData<LdStLFDU , [InstrStage<4, [SLU]>]>, InstrItinData<LdStLHA , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStLHAU , [InstrStage<3, [SLU]>]>, InstrItinData<LdStLMW , [InstrStage<37, [SLU]>]>, InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>, InstrItinData<LdStLWA , [InstrStage<3, [SLU]>]>, InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>, InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>, InstrItinData<LdStSTDCX , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStSTDU , [InstrStage<3, [SLU]>]>, InstrItinData<LdStSTVEBX , [InstrStage<3, [SLU]>]>, InstrItinData<LdStSTWCX , [InstrStage<3, [SLU]>]>, InstrItinData<LdStSync , [InstrStage<35, [SLU]>]>, @@ -66,6 +71,7 @@ def G4PlusItineraries : ProcessorItineraries< InstrItinData<SprRFI , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>, InstrItinData<SprSC , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>, InstrItinData<FPGeneral , [InstrStage<5, [FPU1]>]>, + InstrItinData<FPAddSub , [InstrStage<5, [FPU1]>]>, InstrItinData<FPCompare , [InstrStage<5, [FPU1]>]>, InstrItinData<FPDivD , [InstrStage<35, [FPU1]>]>, InstrItinData<FPDivS , [InstrStage<21, [FPU1]>]>, diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td index 1371499..7c02ea0 100644 --- a/lib/Target/PowerPC/PPCScheduleG5.td +++ b/lib/Target/PowerPC/PPCScheduleG5.td @@ -27,6 +27,7 @@ def G5Itineraries : ProcessorItineraries< InstrItinData<IntMulLI , [InstrStage<4, [IU1, IU2]>]>, InstrItinData<IntRFID , [InstrStage<1, [IU2]>]>, InstrItinData<IntRotateD , [InstrStage<2, [IU1, IU2]>]>, + InstrItinData<IntRotateDI , [InstrStage<2, [IU1, IU2]>]>, InstrItinData<IntRotate , [InstrStage<4, [IU1, IU2]>]>, InstrItinData<IntShift , [InstrStage<2, [IU1, IU2]>]>, InstrItinData<IntTrapD , [InstrStage<1, [IU1, IU2]>]>, @@ -37,15 +38,20 @@ def G5Itineraries : ProcessorItineraries< InstrItinData<BrMCRX , 
[InstrStage<3, [BPU]>]>, InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>, InstrItinData<LdStLoad , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>, InstrItinData<LdStStore , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>, InstrItinData<LdStDSS , [InstrStage<10, [SLU]>]>, InstrItinData<LdStICBI , [InstrStage<40, [SLU]>]>, - InstrItinData<LdStUX , [InstrStage<4, [SLU]>]>, + InstrItinData<LdStSTFD , [InstrStage<4, [SLU]>]>, + InstrItinData<LdStSTFDU , [InstrStage<4, [SLU]>]>, InstrItinData<LdStLD , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStLDU , [InstrStage<3, [SLU]>]>, InstrItinData<LdStLDARX , [InstrStage<11, [SLU]>]>, InstrItinData<LdStLFD , [InstrStage<3, [SLU]>]>, InstrItinData<LdStLFDU , [InstrStage<5, [SLU]>]>, InstrItinData<LdStLHA , [InstrStage<5, [SLU]>]>, + InstrItinData<LdStLHAU , [InstrStage<5, [SLU]>]>, InstrItinData<LdStLMW , [InstrStage<64, [SLU]>]>, InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>, InstrItinData<LdStLWA , [InstrStage<5, [SLU]>]>, @@ -53,6 +59,7 @@ def G5Itineraries : ProcessorItineraries< InstrItinData<LdStSLBIA , [InstrStage<40, [SLU]>]>, // needs work InstrItinData<LdStSLBIE , [InstrStage<2, [SLU]>]>, InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStSTDU , [InstrStage<3, [SLU]>]>, InstrItinData<LdStSTDCX , [InstrStage<11, [SLU]>]>, InstrItinData<LdStSTVEBX , [InstrStage<5, [SLU]>]>, InstrItinData<LdStSTWCX , [InstrStage<11, [SLU]>]>, @@ -69,6 +76,7 @@ def G5Itineraries : ProcessorItineraries< InstrItinData<SprMTSPR , [InstrStage<8, [IU2]>]>, InstrItinData<SprSC , [InstrStage<1, [IU2]>]>, InstrItinData<FPGeneral , [InstrStage<6, [FPU1, FPU2]>]>, + InstrItinData<FPAddSub , [InstrStage<6, [FPU1, FPU2]>]>, InstrItinData<FPCompare , [InstrStage<8, [FPU1, FPU2]>]>, InstrItinData<FPDivD , [InstrStage<33, [FPU1, FPU2]>]>, InstrItinData<FPDivS , [InstrStage<33, [FPU1, FPU2]>]>, diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 0207c83..b8b1614 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -41,6 +41,8 @@ namespace PPC { DIR_750, DIR_970, DIR_A2, + DIR_E500mc, + DIR_E5500, DIR_PWR6, DIR_PWR7, DIR_64 diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index 15541ef..e64c140 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -129,7 +129,7 @@ def retflag : SDNode<"SPISD::RET_FLAG", SDT_SPRet, [SDNPHasChain, SDNPOptInGlue]>; def flushw : SDNode<"SPISD::FLUSHW", SDTNone, - [SDNPHasChain]>; + [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; def getPCX : Operand<i32> { let PrintMethod = "printGetPCX"; diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp index 8e215a7..62f973e 100644 --- a/lib/Target/TargetLibraryInfo.cpp +++ b/lib/Target/TargetLibraryInfo.cpp @@ -24,6 +24,16 @@ void TargetLibraryInfo::anchor() { } const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = { + "_ZdaPv", + "_ZdlPv", + "_Znaj", + "_ZnajRKSt9nothrow_t", + "_Znam", + "_ZnamRKSt9nothrow_t", + "_Znwj", + "_ZnwjRKSt9nothrow_t", + "_Znwm", + "_ZnwmRKSt9nothrow_t", "__cxa_atexit", "__cxa_guard_abort", "__cxa_guard_acquire", @@ -31,16 +41,29 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = "__memcpy_chk", "acos", "acosf", + "acosh", + "acoshf", + "acoshl", "acosl", "asin", "asinf", + "asinh", + "asinhf", + "asinhl", "asinl", "atan", "atan2", "atan2f", "atan2l", "atanf", + "atanh", + "atanhf", + "atanhl", 
"atanl", + "calloc", + "cbrt", + "cbrtf", + "cbrtl", "ceil", "ceilf", "ceill", @@ -54,6 +77,9 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = "coshl", "cosl", "exp", + "exp10", + "exp10f", + "exp10l", "exp2", "exp2f", "exp2l", @@ -74,6 +100,7 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = "fmodl", "fputc", "fputs", + "free", "fwrite", "iprintf", "log", @@ -86,8 +113,12 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = "log2", "log2f", "log2l", + "logb", + "logbf", + "logbl", "logf", "logl", + "malloc", "memchr", "memcmp", "memcpy", @@ -97,11 +128,14 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = "nearbyint", "nearbyintf", "nearbyintl", + "posix_memalign", "pow", "powf", "powl", "putchar", "puts", + "realloc", + "reallocf", "rint", "rintf", "rintl", @@ -121,10 +155,12 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = "strcat", "strchr", "strcpy", + "strdup", "strlen", "strncat", "strncmp", "strncpy", + "strndup", "strnlen", "tan", "tanf", @@ -134,7 +170,8 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = "tanl", "trunc", "truncf", - "truncl" + "truncl", + "valloc" }; /// initialize - Initialize the set of available library functions based on the @@ -205,6 +242,21 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T, TLI.setUnavailable(LibFunc::tanhl); // Win32 only has C89 math + TLI.setUnavailable(LibFunc::acosh); + TLI.setUnavailable(LibFunc::acoshf); + TLI.setUnavailable(LibFunc::acoshl); + TLI.setUnavailable(LibFunc::asinh); + TLI.setUnavailable(LibFunc::asinhf); + TLI.setUnavailable(LibFunc::asinhl); + TLI.setUnavailable(LibFunc::atanh); + TLI.setUnavailable(LibFunc::atanhf); + TLI.setUnavailable(LibFunc::atanhl); + TLI.setUnavailable(LibFunc::cbrt); + TLI.setUnavailable(LibFunc::cbrtf); + TLI.setUnavailable(LibFunc::cbrtl); + TLI.setUnavailable(LibFunc::exp10); + TLI.setUnavailable(LibFunc::exp10f); + TLI.setUnavailable(LibFunc::exp10l); TLI.setUnavailable(LibFunc::exp2); TLI.setUnavailable(LibFunc::exp2f); TLI.setUnavailable(LibFunc::exp2l); @@ -217,6 +269,9 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T, TLI.setUnavailable(LibFunc::log1p); TLI.setUnavailable(LibFunc::log1pf); TLI.setUnavailable(LibFunc::log1pl); + TLI.setUnavailable(LibFunc::logb); + TLI.setUnavailable(LibFunc::logbf); + TLI.setUnavailable(LibFunc::logbl); TLI.setUnavailable(LibFunc::nearbyint); TLI.setUnavailable(LibFunc::nearbyintf); TLI.setUnavailable(LibFunc::nearbyintl); diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 73a0095..c89e738 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -67,12 +67,19 @@ private: SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out); - bool MatchInstruction(SMLoc IDLoc, + bool MatchInstruction(SMLoc IDLoc, unsigned &Kind, SmallVectorImpl<MCParsedAsmOperand*> &Operands, SmallVectorImpl<MCInst> &MCInsts, unsigned &OrigErrorInfo, bool matchingInlineAsm = false); + unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands, + unsigned OperandNum, unsigned &NumMCOperands) { + return getMCInstOperandNumImpl(Kind, Inst, Operands, OperandNum, + NumMCOperands); + } + /// isSrcOp - Returns true if operand is either (%rsi) or %ds:%(rsi) /// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode. 
bool isSrcOp(X86Operand &Op); @@ -514,12 +521,13 @@ bool X86AsmParser::isDstOp(X86Operand &Op) { bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { RegNo = 0; - if (!isParsingIntelSyntax()) { - const AsmToken &TokPercent = Parser.getTok(); - assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!"); - StartLoc = TokPercent.getLoc(); + const AsmToken &PercentTok = Parser.getTok(); + StartLoc = PercentTok.getLoc(); + + // If we encounter a %, ignore it. This code handles registers with and + // without the prefix, unprefixed registers can occur in cfi directives. + if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent)) Parser.Lex(); // Eat percent token. - } const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) { @@ -1516,9 +1524,12 @@ bool X86AsmParser:: MatchAndEmitInstruction(SMLoc IDLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out) { - SmallVector<MCInst, 2> Insts; + unsigned Kind; unsigned ErrorInfo; - bool Error = MatchInstruction(IDLoc, Operands, Insts, ErrorInfo); + SmallVector<MCInst, 2> Insts; + + bool Error = MatchInstruction(IDLoc, Kind, Operands, Insts, + ErrorInfo); if (!Error) for (unsigned i = 0, e = Insts.size(); i != e; ++i) Out.EmitInstruction(Insts[i]); @@ -1526,7 +1537,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, } bool X86AsmParser:: -MatchInstruction(SMLoc IDLoc, +MatchInstruction(SMLoc IDLoc, unsigned &Kind, SmallVectorImpl<MCParsedAsmOperand*> &Operands, SmallVectorImpl<MCInst> &MCInsts, unsigned &OrigErrorInfo, bool matchingInlineAsm) { @@ -1537,7 +1548,7 @@ MatchInstruction(SMLoc IDLoc, // First, handle aliases that expand to multiple instructions. // FIXME: This should be replaced with a real .td file alias mechanism. - // Also, MatchInstructionImpl should do actually *do* the EmitInstruction + // Also, MatchInstructionImpl should actually *do* the EmitInstruction // call. if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" || Op->getToken() == "fstsww" || Op->getToken() == "fstcww" || @@ -1568,7 +1579,7 @@ MatchInstruction(SMLoc IDLoc, MCInst Inst; // First, try a direct match. 
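// The Kind value produced by MatchInstructionImpl identifies the matched
// table entry; MatchInstruction threads it back to its caller so that
// getMCInstOperandNum, declared above, can translate a parsed-operand index
// into the corresponding MCInst operand (presumably for the matchingInlineAsm
// path).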
- switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo, + switch (MatchInstructionImpl(Operands, Kind, Inst, OrigErrorInfo, isParsingIntelSyntax())) { default: break; case Match_Success: @@ -1585,9 +1596,6 @@ MatchInstruction(SMLoc IDLoc, Error(IDLoc, "instruction requires a CPU feature not currently enabled", EmptyRanges, matchingInlineAsm); return true; - case Match_ConversionFail: - return Error(IDLoc, "unable to convert operands to instruction", - EmptyRanges, matchingInlineAsm); case Match_InvalidOperand: WasOriginallyInvalidOperand = true; break; @@ -1619,14 +1627,19 @@ MatchInstruction(SMLoc IDLoc, Tmp[Base.size()] = Suffixes[0]; unsigned ErrorInfoIgnore; unsigned Match1, Match2, Match3, Match4; + unsigned tKind; - Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + Match1 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); + if (Match1 == Match_Success) Kind = tKind; Tmp[Base.size()] = Suffixes[1]; - Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + Match2 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); + if (Match2 == Match_Success) Kind = tKind; Tmp[Base.size()] = Suffixes[2]; - Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + Match3 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); + if (Match3 == Match_Success) Kind = tKind; Tmp[Base.size()] = Suffixes[3]; - Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + Match4 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); + if (Match4 == Match_Success) Kind = tKind; // Restore the old token. Op->setTokenValue(Base); @@ -1677,8 +1690,10 @@ MatchInstruction(SMLoc IDLoc, if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) && (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) { if (!WasOriginallyInvalidOperand) { + ArrayRef<SMRange> Ranges = matchingInlineAsm ? EmptyRanges : + Op->getLocRange(); return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", - Op->getLocRange(), matchingInlineAsm); + Ranges, matchingInlineAsm); } // Recover location info for the operand if we know which was the problem. @@ -1730,7 +1745,10 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return ParseDirectiveWord(2, DirectiveID.getLoc()); else if (IDVal.startswith(".code")) return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); - else if (IDVal.startswith(".intel_syntax")) { + else if (IDVal.startswith(".att_syntax")) { + getParser().setAssemblerDialect(0); + return false; + } else if (IDVal.startswith(".intel_syntax")) { getParser().setAssemblerDialect(1); if (getLexer().isNot(AsmToken::EndOfStatement)) { if(Parser.getTok().getString() == "noprefix") { diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 5039887..f136927 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -44,7 +44,7 @@ void x86DisassemblerDebug(const char *file, dbgs() << file << ":" << line << ": " << s; } -const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii) { +const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) { const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii); return MII->getName(Opcode); } @@ -95,8 +95,8 @@ const EDInstInfo *X86GenericDisassembler::getEDInfo() const { /// be a pointer to a MemoryObject. /// @param byte - A pointer to the byte to be read. /// @param address - The address to be read. 
-static int regionReader(void* arg, uint8_t* byte, uint64_t address) { - MemoryObject* region = static_cast<MemoryObject*>(arg); +static int regionReader(const void* arg, uint8_t* byte, uint64_t address) { + const MemoryObject* region = static_cast<const MemoryObject*>(arg); return region->readByte(address, byte); } @@ -135,10 +135,10 @@ X86GenericDisassembler::getInstruction(MCInst &instr, int ret = decodeInstruction(&internalInstr, regionReader, - (void*)®ion, + (const void*)®ion, loggerFn, (void*)&vStream, - (void*)MII, + (const void*)MII, address, fMode); @@ -379,6 +379,8 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, } switch (type) { + case TYPE_XMM32: + case TYPE_XMM64: case TYPE_XMM128: mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4))); return; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index 0c92912..af444d1 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -200,7 +200,7 @@ static void unconsumeByte(struct InternalInstruction* insn) { insn->readerCursor + offset); \ if (ret) \ return ret; \ - combined = combined | ((type)byte << ((type)offset * 8)); \ + combined = combined | ((uint64_t)byte << (offset * 8)); \ } \ *ptr = combined; \ insn->readerCursor += sizeof(type); \ @@ -719,7 +719,7 @@ static BOOL is16BitEquvalent(const char* orig, const char* equiv) { * @return - 0 if the ModR/M could be read when needed or was not needed; * nonzero otherwise. */ -static int getID(struct InternalInstruction* insn, void *miiArg) { +static int getID(struct InternalInstruction* insn, const void *miiArg) { uint8_t attrMask; uint16_t instructionID; @@ -1621,10 +1621,10 @@ static int readOperands(struct InternalInstruction* insn) { */ int decodeInstruction(struct InternalInstruction* insn, byteReader_t reader, - void* readerArg, + const void* readerArg, dlog_t logger, void* loggerArg, - void* miiArg, + const void* miiArg, uint64_t startLoc, DisassemblerMode mode) { memset(insn, 0, sizeof(struct InternalInstruction)); diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 797703f..05cbb4c 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -403,7 +403,7 @@ typedef uint8_t BOOL; * be read from. * @return - -1 if the byte cannot be read for any reason; 0 otherwise. 
*/ -typedef int (*byteReader_t)(void* arg, uint8_t* byte, uint64_t address); +typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address); /* * dlog_t - Type for the logging function that the consumer can provide to @@ -422,7 +422,7 @@ struct InternalInstruction { /* Reader interface (C) */ byteReader_t reader; /* Opaque value passed to the reader */ - void* readerArg; + const void* readerArg; /* The address of the next byte to read via the reader */ uint64_t readerCursor; @@ -561,10 +561,10 @@ struct InternalInstruction { */ int decodeInstruction(struct InternalInstruction* insn, byteReader_t reader, - void* readerArg, + const void* readerArg, dlog_t logger, void* loggerArg, - void* miiArg, + const void* miiArg, uint64_t startLoc, DisassemblerMode mode); @@ -579,7 +579,7 @@ void x86DisassemblerDebug(const char *file, unsigned line, const char *s); -const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii); +const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii); #ifdef __cplusplus } diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 624e56f..4011035 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -941,3 +941,15 @@ and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the cost of reduced accuracy. //===---------------------------------------------------------------------===// + +This function should be matched to haddpd when the appropriate CPU is enabled: + +#include <x86intrin.h> +double f (__m128d p) { + return p[0] + p[1]; +} + +similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should +turn into hsubpd also. + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 18e6b7c..d078a7b 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -120,6 +120,9 @@ def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", "Support BMI2 instructions">; def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; +def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb", + "HasSlowDivide", "true", + "Use small divide for positive values less than 256">; //===----------------------------------------------------------------------===// // X86 processors supported. 
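// "idiv-to-divb" marks chips (Atom, below) where a full 32-bit divide is slow
// enough that it pays to guard it and use the 8-bit divider when both
// operands fit in a byte. Roughly, for an unsigned divide:
//   if (a < 256 && b < 256)  q = (uint8_t)a / (uint8_t)b;   // DIV r/m8
//   else                     q = a / b;                     // 32-bit DIV
// This is only an illustration; the actual guard is produced by the
// slow-division bypass hooked up through addBypassSlowDivType in
// X86ISelLowering.cpp further down.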
@@ -160,7 +163,8 @@ def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B, def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; def : AtomProc<"atom", [ProcIntelAtom, FeatureSSE3, FeatureCMPXCHG16B, - FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP]>; + FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, + FeatureSlowDivide]>; // "Arrandale" along with corei3 and corei5 def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureFastUAMem, diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index db71e27..a4785c9 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -233,12 +233,14 @@ void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O, const char *Modifier) { + raw_ostream &O, const char *Modifier, + unsigned AsmVariant) { const MachineOperand &MO = MI->getOperand(OpNo); switch (MO.getType()) { default: llvm_unreachable("unknown operand type!"); case MachineOperand::MO_Register: { - O << '%'; + // FIXME: Enumerating AsmVariant, so we can remove magic number. + if (AsmVariant == 0) O << '%'; unsigned Reg = MO.getReg(); if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { EVT VT = (strcmp(Modifier+6,"64") == 0) ? @@ -471,7 +473,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } } - printOperand(MI, OpNo, O); + printOperand(MI, OpNo, O, /*Modifier*/ 0, AsmVariant); return false; } diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 35386cd..0062387 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -50,7 +50,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // These methods are used by the tablegen'erated instruction printer. void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = 0, unsigned AsmVariant = 0); void print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index d705049..e202321 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -26,7 +26,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/Function.h" #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCExpr.h" @@ -134,8 +133,7 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { IsPIC = TM.getRelocationModel() == Reloc::PIC_; do { - DEBUG(dbgs() << "JITTing function '" - << MF.getFunction()->getName() << "'\n"); + DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n"); MCE.startFunction(MF); for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB) { diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index e5952aa..54704d8 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -2014,13 +2014,17 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { MVT VT; if (!isTypeLegal(C->getType(), VT)) - return false; + return 0; + + // Can't handle alternate code models yet. 
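// (Constants are materialized below by loading from the constant pool or with
// LEA, using addressing that is only known to be valid in the small code
// model. Returning 0 rather than false also matches the return type: the
// function yields a register number, 0 means "no register", and fast-isel
// then falls back to the normal selection path.)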
+ if (TM.getCodeModel() != CodeModel::Small) + return 0; // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = NULL; switch (VT.SimpleTy) { - default: return false; + default: return 0; case MVT::i8: Opc = X86::MOV8rm; RC = &X86::GR8RegClass; @@ -2058,7 +2062,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { break; case MVT::f80: // No f80 support yet. - return false; + return 0; } // Materialize addresses with LEA instructions. diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 955c75a..9d5de81 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -171,6 +171,7 @@ namespace { // Shuffle live registers to match the expectations of successor blocks. void finishBlockStack(); +#ifndef NDEBUG void dumpStack() const { dbgs() << "Stack contents:"; for (unsigned i = 0; i != StackTop; ++i) { @@ -181,6 +182,7 @@ namespace { dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]); dbgs() << "\n"; } +#endif /// getSlot - Return the stack slot number a particular register number is /// in. diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 27195b4..5fdc61e 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -100,6 +100,7 @@ namespace { Base_Reg = Reg; } +#ifndef NDEBUG void dump() { dbgs() << "X86ISelAddressMode " << this << '\n'; dbgs() << "Base_Reg "; @@ -133,6 +134,7 @@ namespace { dbgs() << "nul"; dbgs() << " JT" << JT << " Align" << Align << '\n'; } +#endif }; } @@ -1011,7 +1013,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, AM.IndexReg = ShVal.getNode()->getOperand(0); ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); - uint64_t Disp = AddVal->getSExtValue() << Val; + uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; if (!FoldOffsetIntoAddress(Disp, AM)) return false; } @@ -2116,7 +2118,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Make sure that we don't change the operation by removing bits. // This only matters for OR and XOR, AND is unaffected. 
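// The mask test below is equivalent to the previous round-trip check
// ((Val >> ShlVal) << ShlVal) != Val for shift amounts in (0, 64): e.g. with
// ShlVal == 8 the mask is 0xFF, and any nonzero low byte means bits feeding
// the OR/XOR would be dropped, so the fold has to be skipped.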
- if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val) + uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1; + if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) break; unsigned ShlOp, Op; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 7954170..5c525ae 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -85,7 +85,7 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) * ElemsPerChunk); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -118,7 +118,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) * ElemsPerChunk); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } @@ -182,6 +182,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); + // Bypass i32 with i8 on Atom when compiling with O2 + if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) + addBypassSlowDivType(Type::getInt32Ty(getGlobalContext()), Type::getInt8Ty(getGlobalContext())); + if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); @@ -735,6 +739,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); @@ -824,6 +829,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); + setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); @@ -857,6 +863,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v2f64, Legal); setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::SETCC, MVT::v2i64, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); @@ -925,6 +932,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); } if (Subtarget->hasSSE41()) { @@ -939,6 +948,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FRINT, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, 
MVT::f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); + // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -1016,19 +1028,25 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FMUL, MVT::v8f32, Legal); setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); + setOperationAction(ISD::FABS, MVT::v8f32, Custom); setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); setOperationAction(ISD::FNEG, MVT::v4f64, Custom); + setOperationAction(ISD::FABS, MVT::v4f64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); + setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); @@ -1052,7 +1070,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); - if (Subtarget->hasFMA()) { + if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { setOperationAction(ISD::FMA, MVT::v8f32, Custom); setOperationAction(ISD::FMA, MVT::v4f64, Custom); setOperationAction(ISD::FMA, MVT::v4f32, Custom); @@ -2832,7 +2850,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = - ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); + ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3506,25 +3524,26 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) MatchOddMask = false; } - static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1}; - static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1}; - const int *CompactionMask; - if (MatchEvenMask) - CompactionMask = CompactionMaskEven; - else if (MatchOddMask) - CompactionMask = CompactionMaskOdd; - else + if (!MatchEvenMask && !MatchOddMask) return SDValue(); - + SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); - SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0), - UndefNode, CompactionMask); - SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1), - UndefNode, CompactionMask); - static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13}; - return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask); + SDValue Op0 = SVOp->getOperand(0); + SDValue Op1 = SVOp->getOperand(1); + + if (MatchEvenMask) { + // Shift the second operand right to 32 bits. + static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; + Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); + } else { + // Shift the first operand left to 32 bits. 
+ static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; + Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); + } + static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; + return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); } /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand @@ -4977,6 +4996,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as LDBase in + // terms of dependency. We create a TokenFactor for LDBase and ResNode, and + // update uses of LDBase's output chain to use the TokenFactor. + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(ResNode.getNode(), 1)); + } + return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); } return SDValue(); @@ -5881,8 +5912,6 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DebugLoc dl = SVOp->getDebugLoc(); ArrayRef<int> MaskVals = SVOp->getMask(); - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; - // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. @@ -5906,7 +5935,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &pshufbMask[0], 16)); - if (V2IsUndef) + + // As PSHUFB will zero elements with negative indices, it's safe to ignore + // the 2nd operand if it's undefined or zero. + if (V2.getOpcode() == ISD::UNDEF || + ISD::isBuildVectorAllZeros(V2.getNode())) return V1; // Calculate the shuffle mask for the second input, shuffle it, and @@ -5992,6 +6025,51 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); } +// v32i8 shuffles - Translate to VPSHUFB if possible. +static +SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, + SelectionDAG &DAG, + const X86TargetLowering &TLI) { + EVT VT = SVOp->getValueType(0); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); + + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + // VPSHUFB may be generated if + // (1) one of input vector is undefined or zeroinitializer. + // The mask value 0x80 puts 0 in the corresponding slot of the vector. + // And (2) the mask indexes don't cross the 128-bit lane. + if (VT != MVT::v32i8 || !TLI.getSubtarget()->hasAVX2() || + (!V2IsUndef && !V2IsAllZero && !V1IsAllZero)) + return SDValue(); + + if (V1IsAllZero && !V2IsAllZero) { + CommuteVectorShuffleMask(MaskVals, 32); + V1 = V2; + } + SmallVector<SDValue, 32> pshufbMask; + for (unsigned i = 0; i != 32; i++) { + int EltIdx = MaskVals[i]; + if (EltIdx < 0 || EltIdx >= 32) + EltIdx = 0x80; + else { + if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16)) + // Cross lane is not allowed. 
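// (VPSHUFB on AVX2 shuffles bytes independently within each 128-bit lane, so
// an index that reaches into the other lane cannot be expressed and the
// lowering gives up.)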
+ return SDValue(); + EltIdx &= 0xf; + } + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); + } + return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v32i8, &pshufbMask[0], 32)); +} + /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be /// done when every pair / quad of shuffle mask elements point to elements in @@ -6818,6 +6896,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return NewOp; } + if (VT == MVT::v32i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, DAG, *this); + if (NewOp.getNode()) + return NewOp; + } + // Handle all 128-bit wide vectors with 4 elements, and match them with // several different shuffle types. if (NumElems == 4 && VT.is128BitVector()) @@ -8115,26 +8199,35 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, return FIST; } -SDValue X86TargetLowering::LowerFABS(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); EVT EltVT = VT; - if (VT.isVector()) + unsigned NumElts = VT == MVT::f64 ? 2 : 4; + if (VT.isVector()) { EltVT = VT.getVectorElementType(); - Constant *C; - if (EltVT == MVT::f64) { - C = ConstantVector::getSplat(2, - ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); - } else { - C = ConstantVector::getSplat(4, - ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); + NumElts = VT.getVectorNumElements(); } - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + Constant *C; + if (EltVT == MVT::f64) + C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); + else + C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); + C = ConstantVector::getSplat(NumElts, C); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, false, 16); + false, false, false, Alignment); + if (VT.isVector()) { + MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(ISD::AND, dl, ANDVT, + DAG.getNode(ISD::BITCAST, dl, ANDVT, + Op.getOperand(0)), + DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask))); + } return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } @@ -8154,10 +8247,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { else C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); C = ConstantVector::getSplat(NumElts, C); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, false, 16); + false, false, false, Alignment); if (VT.isVector()) { MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, @@ -9943,62 +10037,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const Op.getOperand(1), Op.getOperand(2), DAG); } - // Fix vector shift instructions where the last operand is a non-immediate - // i32 value. 
- case Intrinsic::x86_mmx_pslli_w: - case Intrinsic::x86_mmx_pslli_d: - case Intrinsic::x86_mmx_pslli_q: - case Intrinsic::x86_mmx_psrli_w: - case Intrinsic::x86_mmx_psrli_d: - case Intrinsic::x86_mmx_psrli_q: - case Intrinsic::x86_mmx_psrai_w: - case Intrinsic::x86_mmx_psrai_d: { - SDValue ShAmt = Op.getOperand(2); - if (isa<ConstantSDNode>(ShAmt)) - return SDValue(); - - unsigned NewIntNo; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_mmx_pslli_w: - NewIntNo = Intrinsic::x86_mmx_psll_w; - break; - case Intrinsic::x86_mmx_pslli_d: - NewIntNo = Intrinsic::x86_mmx_psll_d; - break; - case Intrinsic::x86_mmx_pslli_q: - NewIntNo = Intrinsic::x86_mmx_psll_q; - break; - case Intrinsic::x86_mmx_psrli_w: - NewIntNo = Intrinsic::x86_mmx_psrl_w; - break; - case Intrinsic::x86_mmx_psrli_d: - NewIntNo = Intrinsic::x86_mmx_psrl_d; - break; - case Intrinsic::x86_mmx_psrli_q: - NewIntNo = Intrinsic::x86_mmx_psrl_q; - break; - case Intrinsic::x86_mmx_psrai_w: - NewIntNo = Intrinsic::x86_mmx_psra_w; - break; - case Intrinsic::x86_mmx_psrai_d: - NewIntNo = Intrinsic::x86_mmx_psra_d; - break; - } - - // The vector shift intrinsics with scalars uses 32b shift amounts but - // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits - // to be zero. - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt, - DAG.getConstant(0, MVT::i32)); -// FIXME this must be lowered to get rid of the invalid type. - - EVT VT = Op.getValueType(); - ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(NewIntNo, MVT::i32), - Op.getOperand(1), ShAmt); - } case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: @@ -10077,6 +10115,74 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); } + case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
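// Each FMA intrinsic, 128- and 256-bit, ps and pd, is mapped to a generic
// X86ISD node here; the node form is what the FMA3 and FMA4 patterns select,
// which is why the per-intrinsic patterns are dropped from X86InstrFMA.td
// further down.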
+ case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + Opc = X86ISD::FMADD; + break; + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + Opc = X86ISD::FMSUB; + break; + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + Opc = X86ISD::FNMADD; + break; + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + Opc = X86ISD::FNMSUB; + break; + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + Opc = X86ISD::FMADDSUB; + break; + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: + Opc = X86ISD::FMSUBADD; + break; + } + + return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + } } } @@ -10918,7 +11024,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);; + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); } // fall through case MVT::v4i32: @@ -14020,7 +14126,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // // where Op could be BRCOND or CMOV. // -static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { +static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // Quit if not CMP and SUB with its value result used. if (Cmp.getOpcode() != X86ISD::CMP && (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) @@ -14056,40 +14162,133 @@ static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { if (SetCC.getOpcode() == ISD::ZERO_EXTEND) SetCC = SetCC.getOperand(0); - // Quit if not SETCC. - // FIXME: So far we only handle the boolean value generated from SETCC. If - // there is other ways to generate boolean values, we need handle them here - // as well. - if (SetCC.getOpcode() != X86ISD::SETCC) + switch (SetCC.getOpcode()) { + case X86ISD::SETCC: + // Set the condition code or opposite one if necessary. + CC = X86::CondCode(SetCC.getConstantOperandVal(0)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(1); + case X86ISD::CMOV: { + // Check whether false/true value has canonical one, i.e. 0 or 1. + ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); + ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); + // Quit if true value is not a constant. + if (!TVal) + return SDValue(); + // Quit if false value is not a constant. + if (!FVal) { + // A special case for rdrand, where 0 is set if false cond is found. + SDValue Op = SetCC.getOperand(0); + if (Op.getOpcode() != X86ISD::RDRAND) + return SDValue(); + } + // Quit if false value is not the constant 0 or 1. + bool FValIsFalse = true; + if (FVal && FVal->getZExtValue() != 0) { + if (FVal->getZExtValue() != 1) + return SDValue(); + // If FVal is 1, opposite cond is needed. 
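// (A CMOV that yields 1 on the false path and 0 on the true path materializes
// the negation of its condition, so the combined user must test the opposite
// condition code.)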
+ needOppositeCond = !needOppositeCond; + FValIsFalse = false; + } + // Quit if TVal is not the constant opposite of FVal. + if (FValIsFalse && TVal->getZExtValue() != 1) + return SDValue(); + if (!FValIsFalse && TVal->getZExtValue() != 0) + return SDValue(); + CC = X86::CondCode(SetCC.getConstantOperandVal(2)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(3); + } + } + + return SDValue(); +} + +/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS +/// updated. If only flag result is used and the result is evaluated from a +/// series of element extraction, try to combine it into a PTEST. +static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC, + SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDNode *N = Or.getNode(); + DebugLoc DL = N->getDebugLoc(); + + // Only SSE4.1 and beyond supports PTEST or like. + if (!Subtarget->hasSSE41()) return SDValue(); - // Set the condition code or opposite one if necessary. - CC = X86::CondCode(SetCC.getConstantOperandVal(0)); - if (needOppositeCond) - CC = X86::GetOppositeBranchCondition(CC); + if (N->getOpcode() != X86ISD::OR) + return SDValue(); - return SetCC.getOperand(1); -} + // Quit if the value result of OR is used. + if (N->hasAnyUseOfValue(0)) + return SDValue(); -static bool IsValidFCMOVCondition(X86::CondCode CC) { - switch (CC) { - default: - return false; - case X86::COND_B: - case X86::COND_BE: - case X86::COND_E: - case X86::COND_P: - case X86::COND_AE: - case X86::COND_A: - case X86::COND_NE: - case X86::COND_NP: - return true; + // Quit if not used as a boolean value. + if (CC != X86::COND_E && CC != X86::COND_NE) + return SDValue(); + + SmallVector<SDValue, 8> Opnds; + SDValue VecIn; + EVT VT = MVT::Other; + unsigned Mask = 0; + + // Recognize a special case where a vector is casted into wide integer to + // test all 0s. + Opnds.push_back(N->getOperand(0)); + Opnds.push_back(N->getOperand(1)); + + for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { + SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot; + // BFS traverse all OR'd operands. + if (I->getOpcode() == ISD::OR) { + Opnds.push_back(I->getOperand(0)); + Opnds.push_back(I->getOperand(1)); + // Re-evaluate the number of nodes to be traversed. + e += 2; // 2 more nodes (LHS and RHS) are pushed. + continue; + } + + // Quit if a non-EXTRACT_VECTOR_ELT + if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Quit if without a constant index. + SDValue Idx = I->getOperand(1); + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + // Check if all elements are extracted from the same vector. + SDValue ExtractedFromVec = I->getOperand(0); + if (VecIn.getNode() == 0) { + VT = ExtractedFromVec.getValueType(); + // FIXME: only 128-bit vector is supported so far. + if (!VT.is128BitVector()) + return SDValue(); + VecIn = ExtractedFromVec; + } else if (VecIn != ExtractedFromVec) + return SDValue(); + + // Record the constant index. + Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); } + + assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far."); + + // Quit if not all elements are used. + if (Mask != (1U << VT.getVectorNumElements()) - 1U) + return SDValue(); + + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn); } /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. 
X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { DebugLoc DL = N->getDebugLoc(); // If the flag operand isn't dead, don't touch this CMOV. @@ -14114,10 +14313,18 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, SDValue Flags; - Flags = BoolTestSetCCCombine(Cond, CC); + Flags = checkBoolTestSetCCCombine(Cond, CC); if (Flags.getNode() && // Extra check as FCMOV only supports a subset of X86 cond. - (FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) { + (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { + SDValue Ops[] = { FalseOp, TrueOp, + DAG.getConstant(CC, MVT::i8), Flags }; + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), + Ops, array_lengthof(Ops)); + } + + Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget); + if (Flags.getNode()) { SDValue Ops[] = { FalseOp, TrueOp, DAG.getConstant(CC, MVT::i8), Flags }; return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), @@ -15384,7 +15591,7 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes - // into FMINC and MMAXC, which are Commutative operations. + // into FMINC and FMAXC, which are Commutative operations. unsigned NewOp = 0; switch (N->getOpcode()) { default: llvm_unreachable("unknown opcode"); @@ -15502,8 +15709,13 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA()) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || + (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) return SDValue(); SDValue A = N->getOperand(0); @@ -15525,9 +15737,10 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode; if (!NegMul) - Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB; + Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; else - Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB; + Opcode = (!NegC) ? 
X86ISD::FNMADD : X86ISD::FNMSUB; + return DAG.getNode(Opcode, dl, VT, A, B, C); } @@ -15625,7 +15838,9 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT -static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { DebugLoc DL = N->getDebugLoc(); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); @@ -15641,7 +15856,13 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { SDValue Flags; - Flags = BoolTestSetCCCombine(EFLAGS, CC); + Flags = checkBoolTestSetCCCombine(EFLAGS, CC); + if (Flags.getNode()) { + SDValue Cond = DAG.getConstant(CC, MVT::i8); + return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); + } + + Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget); if (Flags.getNode()) { SDValue Cond = DAG.getConstant(CC, MVT::i8); return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); @@ -15663,7 +15884,14 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, SDValue Flags; - Flags = BoolTestSetCCCombine(EFLAGS, CC); + Flags = checkBoolTestSetCCCombine(EFLAGS, CC); + if (Flags.getNode()) { + SDValue Cond = DAG.getConstant(CC, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, + Flags); + } + + Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget); if (Flags.getNode()) { SDValue Cond = DAG.getConstant(CC, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, @@ -15858,7 +16086,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); - case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); + case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); @@ -15888,7 +16116,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); - case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); + case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index b0c27c8..bfe9541 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -16,15 +16,18 @@ // // Return instructions. +// +// The X86retflag return instructions are variadic because we may add ST0 and +// ST1 arguments when returning values on the x87 stack. 
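// (variable_ops lets calling-convention lowering append those extra register
// uses to RET/RETI without a fixed operand list.)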
let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, FPForm = SpecialFP in { - def RET : I <0xC3, RawFrm, (outs), (ins), + def RET : I <0xC3, RawFrm, (outs), (ins variable_ops), "ret", [(X86retflag 0)], IIC_RET>; def RETW : I <0xC3, RawFrm, (outs), (ins), "ret{w}", [], IIC_RET>, OpSize; - def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt), + def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), "ret\t$amt", [(X86retflag timm:$amt)], IIC_RET_IMM>; def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt), diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 95ee7e5..5663800 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -19,7 +19,8 @@ let Constraints = "$src1 = $dst" in { multiclass fma3p_rm<bits<8> opc, string OpcodeStr, PatFrag MemFrag128, PatFrag MemFrag256, ValueType OpVT128, ValueType OpVT256, - SDPatternOperator Op = null_frag, bit MayLoad = 1> { + SDPatternOperator Op = null_frag> { + let isCommutable = 1 in def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -27,7 +28,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, VR128:$src3)))]>; - let mayLoad = MayLoad in + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, f128mem:$src3), !strconcat(OpcodeStr, @@ -35,6 +36,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, (MemFrag128 addr:$src3))))]>; + let isCommutable = 1 in def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, @@ -42,7 +44,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, VR256:$src3)))]>; - let mayLoad = MayLoad in + let mayLoad = 1 in def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, @@ -59,7 +61,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, SDNode Op, ValueType OpTy128, ValueType OpTy256> { defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy)), - MemFrag128, MemFrag256, OpTy128, OpTy256, Op, 0>; + MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; let neverHasSideEffects = 1 in { defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy)), @@ -112,148 +114,18 @@ let ExeDomain = SSEPackedDouble in { v4f64>, VEX_W; } -let Predicates = [HasFMA] in { - def : Pat<(int_x86_fma_vfmadd_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFMADDPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmadd_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFMADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsub_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFMSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmsub_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFMSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFMADDSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFMADDSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFMSUBADDPSr213r 
VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFMSUBADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfmadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmadd_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFMADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmsub_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFMSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMADDSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFMADDSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMSUBADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFMSUBADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfmadd_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFMADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmadd_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFMADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsub_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFMSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmsub_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFMSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFMADDSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFMADDSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFMSUBADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFMSUBADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfmadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmadd_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFMADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmsub_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFMSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMADDSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFMADDSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMSUBADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : 
Pat<(int_x86_fma_vfmsubadd_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFMSUBADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfnmadd_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFNMADDPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfnmadd_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFNMADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFNMSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFNMSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfnmadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFNMADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfnmadd_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFNMADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFNMSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFNMSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfnmadd_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFNMADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfnmadd_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFNMADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFNMSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFNMSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfnmadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFNMADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfnmadd_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFNMADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFNMSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFNMSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - -} // Predicates = [HasFMA] - let Constraints = "$src1 = $dst" in { multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC, ValueType OpVT, PatFrag mem_frag, - SDPatternOperator OpNode = null_frag, bit MayLoad = 1> { + SDPatternOperator OpNode = null_frag> { + let isCommutable = 1 in def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; - let mayLoad = MayLoad in + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, @@ -266,6 +138,7 @@ multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop, ComplexPattern mem_cpat, Intrinsic IntId, RegisterClass RC> { + let isCommutable = 1 in def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -294,7 +167,7 @@ let 
neverHasSideEffects = 1 in { } defm r213 : fma3s_rm<opc213, !strconcat(OpStr, !strconcat("213", PackTy)), - x86memop, RC, OpVT, mem_frag, OpNode, 0>, + x86memop, RC, OpVT, mem_frag, OpNode>, fma3s_rm_int<opc213, !strconcat(OpStr, !strconcat("213", PackTy)), memop, mem_cpat, Int, RC>; } @@ -324,73 +197,102 @@ defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss, //===----------------------------------------------------------------------===// -multiclass fma4s<bits<8> opc, string OpcodeStr, Operand memop, - ComplexPattern mem_cpat, Intrinsic Int> { - def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3), +multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType OpVT, SDNode OpNode, + PatFrag mem_frag> { + let isCommutable = 1 in + def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, - (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4; - def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, memop:$src3), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4; + def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, - (Int VR128:$src1, VR128:$src2, mem_cpat:$src3))]>, VEX_W, MemOp4; - def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, memop:$src2, VR128:$src3), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2, + (mem_frag addr:$src3)))]>, VEX_W, MemOp4; + def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, - (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>; + [(set RC:$dst, + (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>; // For disassembler let isCodeGenOnly = 1 in - def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3), + def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; } -multiclass fma4p<bits<8> opc, string OpcodeStr, - Intrinsic Int128, Intrinsic Int256, +multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, + ComplexPattern mem_cpat, Intrinsic Int> { + let isCommutable = 1 in + def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4; + def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, + mem_cpat:$src3))]>, VEX_W, MemOp4; + def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, memop:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>; +} + +multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT128, ValueType OpVT256, PatFrag ld_frag128, PatFrag 
ld_frag256> { + let isCommutable = 1 in def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4; + (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, + VEX_W, MemOp4; def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, f128mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, + [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2, (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4; def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int128 VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>; + (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>; + let isCommutable = 1 in def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, - (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>, VEX_W, MemOp4; + (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>, + VEX_W, MemOp4; def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, + [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2, (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4; def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, - (Int256 VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>; + (OpNode VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>; // For disassembler let isCodeGenOnly = 1 in { def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), @@ -406,45 +308,58 @@ let isCodeGenOnly = 1 in { let Predicates = [HasFMA4] in { -defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma_vfmadd_ss>; -defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfmadd_sd>; -defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma_vfmadd_ps, - int_x86_fma_vfmadd_ps_256, memopv4f32, memopv8f32>; -defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma_vfmadd_pd, - int_x86_fma_vfmadd_pd_256, memopv2f64, memopv4f64>; -defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma_vfmsub_ss>; -defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfmsub_sd>; -defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma_vfmsub_ps, - int_x86_fma_vfmsub_ps_256, memopv4f32, memopv8f32>; -defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma_vfmsub_pd, - int_x86_fma_vfmsub_pd_256, memopv2f64, memopv4f64>; -defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma_vfnmadd_ss>; -defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfnmadd_sd>; -defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma_vfnmadd_ps, - int_x86_fma_vfnmadd_ps_256, memopv4f32, memopv8f32>; -defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma_vfnmadd_pd, - int_x86_fma_vfnmadd_pd_256, memopv2f64, memopv4f64>; -defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", 
ssmem, sse_load_f32, - int_x86_fma_vfnmsub_ss>; -defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfnmsub_sd>; -defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma_vfnmsub_ps, - int_x86_fma_vfnmsub_ps_256, memopv4f32, memopv8f32>; -defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma_vfnmsub_pd, - int_x86_fma_vfnmsub_pd_256, memopv2f64, memopv4f64>; -defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma_vfmaddsub_ps, - int_x86_fma_vfmaddsub_ps_256, memopv4f32, memopv8f32>; -defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma_vfmaddsub_pd, - int_x86_fma_vfmaddsub_pd_256, memopv2f64, memopv4f64>; -defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma_vfmsubadd_ps, - int_x86_fma_vfmsubadd_ps_256, memopv4f32, memopv8f32>; -defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma_vfmsubadd_pd, - int_x86_fma_vfmsubadd_pd_256, memopv2f64, memopv4f64>; +defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; +defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; +defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; +defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; +defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; +defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; +defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; +defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + +defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, + memopv2f64, memopv4f64>; +defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, + memopv2f64, memopv4f64>; +defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, + memopv2f64, memopv4f64>; +defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, + memopv2f64, memopv4f64>; +defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64, + memopv2f64, memopv4f64>; +defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64, + memopv2f64, memopv4f64>; } // HasFMA4 diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 81b4f81..55ad2ec 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ 
b/lib/Target/X86/X86InstrFormats.td @@ -287,12 +287,14 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, let CodeSize = 3; } +def __xs : XS; + // SI - SSE 1 & 2 scalar instructions class SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin> { let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2])); + !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); @@ -303,7 +305,7 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin> { let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2])); + !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); @@ -314,18 +316,25 @@ class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin, Domain d> : I<o, F, outs, ins, asm, pattern, itin, d> { let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1])); + !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); } +// MMXPI - SSE 1 & 2 packed instructions with MMX operands +class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin, Domain d> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]); +} + // PIi8 - SSE 1 & 2 packed instructions with immediate class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin, Domain d> : Ii8<o, F, outs, ins, asm, pattern, itin, d> { let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX], - !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1])); + !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm); @@ -341,18 +350,18 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>; + : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>; class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>; + : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>; class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB, - Requires<[HasSSE1]>; + Requires<[UseSSE1]>; class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB, - Requires<[HasSSE1]>; + Requires<[UseSSE1]>; class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = 
IIC_DEFAULT> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS, @@ -372,27 +381,31 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, // PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes. // VSDI - SSE2 instructions with XD prefix in AVX form. // VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form. +// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as +// MMX operands. +// MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as +// MMX operands. class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; + : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>; class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>; class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>; + : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>; class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>; class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, - Requires<[HasSSE2]>; + Requires<[UseSSE2]>; class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, - Requires<[HasSSE2]>; + Requires<[UseSSE2]>; class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD, @@ -405,6 +418,12 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB, OpSize, Requires<[HasAVX]>; +class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; +class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; // SSE3 Instruction Templates: // @@ -415,21 +434,23 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS, - Requires<[HasSSE3]>; + Requires<[UseSSE3]>; class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD, - Requires<[HasSSE3]>; + Requires<[UseSSE3]>; class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, 
ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, - Requires<[HasSSE3]>; + Requires<[UseSSE3]>; // SSSE3 Instruction Templates: // // SS38I - SSSE3 instructions with T8 prefix. // SS3AI - SSSE3 instructions with TA prefix. +// MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands. +// MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands. // // Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version // uses the MMX registers. The 64-bit versions are grouped with the MMX @@ -438,10 +459,18 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, - Requires<[HasSSSE3]>; + Requires<[UseSSSE3]>; class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, + Requires<[UseSSSE3]>; +class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, + Requires<[HasSSSE3]>; +class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, Requires<[HasSSSE3]>; // SSE4.1 Instruction Templates: @@ -452,11 +481,11 @@ class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, - Requires<[HasSSE41]>; + Requires<[UseSSE41]>; class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, - Requires<[HasSSE41]>; + Requires<[UseSSE41]>; // SSE4.2 Instruction Templates: // @@ -464,9 +493,10 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, - Requires<[HasSSE42]>; + Requires<[UseSSE42]>; // SS42FI - SSE 4.2 instructions with T8XD prefix. +// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns. 
class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>; @@ -475,7 +505,7 @@ class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, - Requires<[HasSSE42]>; + Requires<[UseSSE42]>; // AVX Instruction Templates: // Instructions introduced in AVX (no SSE equivalent forms) diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index ee2d3c4..9035435 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -183,8 +183,8 @@ def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>; -def X86Fmaddsub : SDNode<"X86ISD::FMSUBADD", SDTFma>; -def X86Fmsubadd : SDNode<"X86ISD::FMADDSUB", SDTFma>; +def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>; +def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -240,6 +240,10 @@ def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; +// 128-/256-bit extload pattern fragments +def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; +def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; + // Like 'store', but always requires 128-bit vector alignment. 
def alignedstore : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 459f01a..4f3d824 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -1110,6 +1110,36 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, TB_ALIGN_32 }, { X86::VPXORYrr, X86::VPXORYrm, TB_ALIGN_32 }, // FIXME: add AVX 256-bit foldable instructions + + // FMA4 foldable patterns + { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_16 }, + { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_16 }, + { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_16 }, + { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_16 }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_32 }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_32 }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_16 }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_16 }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_32 }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_32 }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_16 }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_16 }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_16 }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_16 }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_32 }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_32 }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_16 }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_16 }, + { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_32 }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_32 }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_16 }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_16 }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_32 }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_32 }, + { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_16 }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_16 }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_32 }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { @@ -1237,6 +1267,36 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 }, { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 }, { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 }, + + // FMA4 foldable patterns + { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_16 }, + { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_16 }, + { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 }, + { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_16 }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_16 }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 }, + { 
X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_32 }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 }, + { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) { @@ -1786,10 +1846,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstr *MI = MBBI; MachineFunction &MF = *MI->getParent()->getParent(); // All instructions input are two-addr instructions. Get the known operands. - unsigned Dest = MI->getOperand(0).getReg(); - unsigned Src = MI->getOperand(1).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isKill = MI->getOperand(1).isKill(); + const MachineOperand &Dest = MI->getOperand(0); + const MachineOperand &Src = MI->getOperand(1); MachineInstr *NewMI = NULL; // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When @@ -1807,11 +1865,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); if (B != C) return 0; - unsigned A = MI->getOperand(0).getReg(); unsigned M = MI->getOperand(3).getImm(); NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) - .addReg(A, RegState::Define | getDeadRegState(isDead)) - .addReg(B, getKillRegState(isKill)).addImm(M); + .addOperand(Dest).addOperand(Src).addImm(M); break; } case X86::SHUFPDrri: { @@ -1821,15 +1877,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); if (B != C) return 0; - unsigned A = MI->getOperand(0).getReg(); unsigned M = MI->getOperand(3).getImm(); // Convert to PSHUFD mask. M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) - .addReg(A, RegState::Define | getDeadRegState(isDead)) - .addReg(B, getKillRegState(isKill)).addImm(M); + .addOperand(Dest).addOperand(Src).addImm(M); break; } case X86::SHL64ri: { @@ -1840,15 +1894,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (ShAmt == 0 || ShAmt >= 4) return 0; // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, &X86::GR64_NOSPRegClass)) + if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + !MF.getRegInfo().constrainRegClass(Src.getReg(), + &X86::GR64_NOSPRegClass)) return 0; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) - .addReg(Dest, RegState::Define | getDeadRegState(isDead)) - .addReg(0).addImm(1 << ShAmt) - .addReg(Src, getKillRegState(isKill)) - .addImm(0).addReg(0); + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); break; } case X86::SHL32ri: { @@ -1859,15 +1912,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (ShAmt == 0 || ShAmt >= 4) return 0; // LEA can't handle ESP. 
- if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, &X86::GR32_NOSPRegClass)) + if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + !MF.getRegInfo().constrainRegClass(Src.getReg(), + &X86::GR32_NOSPRegClass)) return 0; unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(Dest, RegState::Define | getDeadRegState(isDead)) - .addReg(0).addImm(1 << ShAmt) - .addReg(Src, getKillRegState(isKill)).addImm(0).addReg(0); + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); break; } case X86::SHL16ri: { @@ -1880,10 +1933,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (DisableLEA16) return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addReg(Dest, RegState::Define | getDeadRegState(isDead)) - .addReg(0).addImm(1 << ShAmt) - .addReg(Src, getKillRegState(isKill)) - .addImm(0).addReg(0); + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); break; } default: { @@ -1906,14 +1957,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, RC)) + if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + !MF.getRegInfo().constrainRegClass(Src.getReg(), RC)) return 0; - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, 1); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest).addOperand(Src), 1); break; } case X86::INC16r: @@ -1921,10 +1970,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (DisableLEA16) return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, 1); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), 1); break; case X86::DEC64r: case X86::DEC32r: @@ -1936,14 +1983,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, (const TargetRegisterClass*)&X86::GR64_NOSPRegClass : (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, RC)) + if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + !MF.getRegInfo().constrainRegClass(Src.getReg(), RC)) return 0; - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, -1); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest).addOperand(Src), -1); break; } case X86::DEC16r: @@ -1951,10 +1996,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (DisableLEA16) return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, -1); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), -1); break; case X86::ADD64rr: case X86::ADD64rr_DB: @@ -1981,9 +2024,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return 0; NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, Src2, isKill2); + .addOperand(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); // Preserve undefness of the operands. bool isUndef = MI->getOperand(1).isUndef(); @@ -2003,9 +2045,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Src2 = MI->getOperand(2).getReg(); bool isKill2 = MI->getOperand(2).isKill(); NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, Src2, isKill2); + .addOperand(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); + + // Preserve undefness of the operands. + bool isUndef = MI->getOperand(1).isUndef(); + bool isUndef2 = MI->getOperand(2).isUndef(); + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + if (LV && isKill2) LV->replaceKillInstruction(Src2, MI, NewMI); break; @@ -2015,10 +2063,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD64ri32_DB: case X86::ADD64ri8_DB: assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, MI->getOperand(2).getImm()); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); break; case X86::ADD32ri: case X86::ADD32ri8: @@ -2026,10 +2073,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD32ri8_DB: { assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, MI->getOperand(2).getImm()); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); break; } case X86::ADD16ri: @@ -2039,10 +2085,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (DisableLEA16) return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, MI->getOperand(2).getImm()); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); break; } } @@ -2051,10 +2096,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (!NewMI) return 0; if (LV) { // Update live variables - if (isKill) - LV->replaceKillInstruction(Src, MI, NewMI); - if (isDead) - LV->replaceKillInstruction(Dest, MI, NewMI); + if (Src.isKill()) + LV->replaceKillInstruction(Src.getReg(), MI, NewMI); + if (Dest.isDead()) + LV->replaceKillInstruction(Dest.getReg(), MI, NewMI); } MFI->insert(MBBI, NewMI); // Insert the new inst @@ -3444,6 +3489,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::FsFLD0SS: case X86::FsFLD0SD: return Expand2AddrUndef(MI, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); + case X86::AVX_SET0: + assert(HasAVX && "AVX not supported"); + return Expand2AddrUndef(MI, get(X86::VXORPSYrr)); + case X86::V_SETALLONES: + return Expand2AddrUndef(MI, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); + case X86::AVX2_SETALLONES: + return Expand2AddrUndef(MI, get(X86::VPCMPEQDYrr)); case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; @@ -3557,14 +3609,16 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, OpcodeTablePtr = &RegOp2MemOpTable2Addr; isTwoAddrFold = true; } else if (i == 0) { // If operand 0 - if (MI->getOpcode() == X86::MOV64r0) - NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI); - else if (MI->getOpcode() == X86::MOV32r0) - NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI); - else if (MI->getOpcode() == X86::MOV16r0) - NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI); - else if (MI->getOpcode() == X86::MOV8r0) - NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI); + unsigned Opc = 0; + switch (MI->getOpcode()) { + default: break; + case X86::MOV64r0: Opc = X86::MOV64mi32; break; + case X86::MOV32r0: Opc = X86::MOV32mi; break; + case X86::MOV16r0: Opc = X86::MOV16mi; break; + case X86::MOV8r0: Opc = X86::MOV8mi; break; + } + if (Opc) + NewMI = MakeM0Inst(*this, Opc, MOs, MI); if (NewMI) return NewMI; @@ -3793,15 +3847,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = (*LoadMI->memoperands_begin())->getAlignment(); else switch (LoadMI->getOpcode()) { - case X86::AVX_SET0PSY: - case X86::AVX_SET0PDY: case X86::AVX2_SETALLONES: - case X86::AVX2_SET0: + case X86::AVX_SET0: Alignment = 32; break; case X86::V_SET0: case X86::V_SETALLONES: - case X86::AVX_SETALLONES: Alignment = 16; break; case X86::FsFLD0SD: @@ -3837,11 +3888,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, switch (LoadMI->getOpcode()) { case X86::V_SET0: case X86::V_SETALLONES: - case X86::AVX_SET0PSY: - case X86::AVX_SET0PDY: - case X86::AVX_SETALLONES: case X86::AVX2_SETALLONES: - case X86::AVX2_SET0: + case X86::AVX_SET0: case X86::FsFLD0SD: case X86::FsFLD0SS: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. 
@@ -3873,15 +3921,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Ty = Type::getFloatTy(MF.getFunction()->getContext()); else if (Opc == X86::FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); - else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY) - Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8); - else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX2_SET0) + else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); - bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES || - Opc == X86::AVX2_SETALLONES); + bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); @@ -3956,6 +4001,8 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, OpcodeTablePtr = &RegOp2MemOpTable1; } else if (OpNum == 2) { OpcodeTablePtr = &RegOp2MemOpTable2; + } else if (OpNum == 3) { + OpcodeTablePtr = &RegOp2MemOpTable3; } if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index d293156..304676d 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -114,7 +114,7 @@ def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, - [SDNPHasChain]>; + [SDNPHasChain,SDNPSideEffect]>; def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, [SDNPHasChain]>; def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER, @@ -552,14 +552,21 @@ def HasMMX : Predicate<"Subtarget->hasMMX()">; def Has3DNow : Predicate<"Subtarget->has3DNow()">; def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; +def UseSSE1 : Predicate<"Subtarget->hasSSE1() && Subtarget->hasNoAVX()">; def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; +def UseSSE2 : Predicate<"Subtarget->hasSSE2() && Subtarget->hasNoAVX()">; def HasSSE3 : Predicate<"Subtarget->hasSSE3()">; +def UseSSE3 : Predicate<"Subtarget->hasSSE3() && Subtarget->hasNoAVX()">; def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; +def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && Subtarget->hasNoAVX()">; def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; +def UseSSE41 : Predicate<"Subtarget->hasSSE41() && Subtarget->hasNoAVX()">; def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; +def UseSSE42 : Predicate<"Subtarget->hasSSE42() && Subtarget->hasNoAVX()">; def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; +def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index c8f40bb..bd54858 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -118,11 +118,11 @@ let Constraints = "$src1 = $dst" in { /// Unary MMX instructions requiring SSSE3. 
multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, Intrinsic IntId64, OpndItins itins> { - def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>; - def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, (IntId64 (bitconvert (memopmmx addr:$src))))], @@ -134,11 +134,11 @@ let ImmT = NoImm, Constraints = "$src1 = $dst" in { multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, Intrinsic IntId64, OpndItins itins> { let isCommutable = 0 in - def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), + def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>; - def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), + def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, @@ -149,11 +149,11 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, /// PALIGN MMX instructions (require SSSE3). multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { - def R64irr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), + def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>; - def R64irm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), + def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, @@ -163,12 +163,10 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, OpndItins itins, Domain d> { - def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, - [(set DstRC:$dst, (Int SrcRC:$src))], - itins.rr, d>; - def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, - [(set DstRC:$dst, (Int (ld_frag addr:$src)))], - itins.rm, d>; + def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>; + def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>; } multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, @@ -243,29 +241,30 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), [(store (x86mmx VR64:$src), addr:$dst)], IIC_MMX_MOVQ_RM>; -def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), - (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, - (x86mmx (bitconvert - (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))))], - IIC_MMX_MOVQ_RR>; - -def MMX_MOVQ2DQrr : S2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst), - (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v2i64 (scalar_to_vector - (i64 (bitconvert (x86mmx VR64:$src))))))], - IIC_MMX_MOVQ_RR>; +def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, 
MRMSrcReg, (outs VR64:$dst), + (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (bitconvert + (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))))))], + IIC_MMX_MOVQ_RR>; + +def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 + (scalar_to_vector + (i64 (bitconvert (x86mmx VR64:$src))))))], + IIC_MMX_MOVQ_RR>; let neverHasSideEffects = 1 in -def MMX_MOVQ2FR64rr: S2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), - (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [], - IIC_MMX_MOVQ_RR>; +def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RR>; -def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), - (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [], - IIC_MMX_MOVQ_RR>; +def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RR>; def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movntq\t{$src, $dst|$dst, $src}", @@ -577,6 +576,7 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), IIC_MMX_MASKMOV>; // 64-bit bit convert. +let Predicates = [HasSSE2] in { def : Pat<(x86mmx (bitconvert (i64 GR64:$src))), (MMX_MOVD64to64rr GR64:$src)>; def : Pat<(i64 (bitconvert (x86mmx VR64:$src))), @@ -585,5 +585,6 @@ def : Pat<(f64 (bitconvert (x86mmx VR64:$src))), (MMX_MOVQ2FR64rr VR64:$src)>; def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), (MMX_MOVFR642Qrr FR64:$src)>; +} diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 220c06d..17e91a6 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -251,35 +251,37 @@ def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), // A 128-bit subvector extract from the first 256-bit vector position // is a subregister copy that needs no instruction. -def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))), +def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))), (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>; -def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))), +def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))), (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>; -def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))), +def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))), (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>; -def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))), +def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))), (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>; -def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))), +def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))), (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>; -def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))), +def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))), (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>; // A 128-bit subvector insert to the first 256-bit vector position // is a subregister copy that needs no instruction. 
-def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)), +let AddedComplexity = 25 in { // to give priority over vinsertf128rm +def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)), +def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)), +def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)), +def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)), +def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)), +def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +} // Implicitly promote a 32-bit scalar to a vector. def : Pat<(v4f32 (scalar_to_vector FR32:$src)), @@ -362,7 +364,7 @@ let Predicates = [HasAVX] in { def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>; } -// Alias instructions that map fld0 to pxor for sse. +// Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1 in { @@ -382,11 +384,11 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, neverHasSideEffects = 1 in { -def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>; + isPseudo = 1 in { +def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; } -def : Pat<(v4f32 immAllZerosV), (V_SET0)>; def : Pat<(v2f64 immAllZerosV), (V_SET0)>; def : Pat<(v4i32 immAllZerosV), (V_SET0)>; def : Pat<(v2i64 immAllZerosV), (V_SET0)>; @@ -394,35 +396,29 @@ def : Pat<(v8i16 immAllZerosV), (V_SET0)>; def : Pat<(v16i8 immAllZerosV), (V_SET0)>; -// The same as done above but for AVX. The 256-bit ISA does not support PI, +// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI, // and doesn't need it because on sandy bridge the register is set to zero // at the rename stage without using any execution unit, so SET0PSY // and SET0PDY can be used for vector int instructions without penalty -// FIXME: Change encoding to pseudo! This is blocked right now by the x86 -// JIT implementatioan, it does not expand the instructions below like -// X86MCInstLower does. 
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1 in { -let Predicates = [HasAVX] in { -def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", - [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V; -def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", - [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V; -} -let Predicates = [HasAVX2], neverHasSideEffects = 1 in -def AVX2_SET0 : PDI<0xef, MRMInitReg, (outs VR256:$dst), (ins), "", - []>, VEX_4V; + isPseudo = 1, Predicates = [HasAVX] in { +def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8f32 immAllZerosV))]>; } -let Predicates = [HasAVX2], AddedComplexity = 5 in { - def : Pat<(v4i64 immAllZerosV), (AVX2_SET0)>; - def : Pat<(v8i32 immAllZerosV), (AVX2_SET0)>; - def : Pat<(v16i16 immAllZerosV), (AVX2_SET0)>; - def : Pat<(v32i8 immAllZerosV), (AVX2_SET0)>; +let Predicates = [HasAVX] in + def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>; + +let Predicates = [HasAVX2] in { + def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>; + def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>; + def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>; + def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>; } -// AVX has no support for 256-bit integer instructions, but since the 128-bit +// AVX1 has no support for 256-bit integer instructions, but since the 128-bit // VPXOR instruction writes zero to its upper part, it's safe build zeros. +let Predicates = [HasAVX1Only] in { def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>; def : Pat<(bc_v32i8 (v8f32 immAllZerosV)), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>; @@ -438,22 +434,17 @@ def : Pat<(bc_v8i32 (v8f32 immAllZerosV)), def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>; def : Pat<(bc_v4i64 (v8f32 immAllZerosV)), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>; +} // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-ones value if folding it would be beneficial. -// FIXME: Change encoding to pseudo! This is blocked right now by the x86 -// JIT implementation, it does not expand the instructions below like -// X86MCInstLower does. 
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1, ExeDomain = SSEPackedInt in { - let Predicates = [HasAVX] in - def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", - [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V; - def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", - [(set VR128:$dst, (v4i32 immAllOnesV))]>; + isPseudo = 1 in { + def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4i32 immAllOnesV))]>; let Predicates = [HasAVX2] in - def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "", - [(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V; + def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>; } @@ -605,27 +596,27 @@ let Predicates = [HasAVX] in { // Represent the same patterns above but in the form they appear for // 256-bit types def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, - (v4i32 (scalar_to_vector (loadi32 addr:$src))), (i32 0)))), + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))), + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))), + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; } def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector FR32:$src)), (i32 0)))), + (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector FR64:$src)), (i32 0)))), + (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector (loadi64 addr:$src))), (i32 0)))), + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>; // Move low f64 and clear high bits. @@ -704,7 +695,7 @@ let Predicates = [HasAVX] in { (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { let AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSS to the lower bits. @@ -738,7 +729,7 @@ let Predicates = [HasSSE1] in { (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { let AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSD to the lower bits. 
@@ -916,16 +907,16 @@ let isCodeGenOnly = 1 in { let Predicates = [HasAVX] in { def : Pat<(v8i32 (X86vzmovl - (insert_subvector undef, (v4i32 VR128:$src), (i32 0)))), + (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl - (insert_subvector undef, (v2i64 VR128:$src), (i32 0)))), + (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; def : Pat<(v8f32 (X86vzmovl - (insert_subvector undef, (v4f32 VR128:$src), (i32 0)))), + (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl - (insert_subvector undef, (v2f64 VR128:$src), (i32 0)))), + (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; } @@ -975,10 +966,10 @@ let Predicates = [HasAVX] in { (VMOVUPDmr addr:$dst, VR128:$src)>; } -let Predicates = [HasSSE1] in +let Predicates = [UseSSE1] in def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), (MOVUPSmr addr:$dst, VR128:$src)>; -let Predicates = [HasSSE2] in +let Predicates = [UseSSE2] in def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), (MOVUPDmr addr:$dst, VR128:$src)>; @@ -1028,12 +1019,52 @@ let Predicates = [HasAVX] in { (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v32i8 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; + + // Special patterns for storing subvector extracts of lower 128-bits + // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr + def : Pat<(alignedstore (v2f64 (extract_subvector + (v4f64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v4f32 (extract_subvector + (v8f32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v4i32 (extract_subvector + (v8i32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + + def : Pat<(store (v2f64 (extract_subvector + (v4f64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v4f32 (extract_subvector + (v8f32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v4i32 (extract_subvector + (v8i32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), + 
(VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; } // Use movaps / movups for SSE integer load / store (one byte shorter). // The instructions selected below are then converted to MOVDQA/MOVDQU // during the SSE domain pass. -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { def : Pat<(alignedloadv2i64 addr:$src), (MOVAPSrm addr:$src)>; def : Pat<(loadv2i64 addr:$src), @@ -1180,7 +1211,7 @@ let Predicates = [HasAVX] in { (VMOVLPDmr addr:$src1, VR128:$src2)>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)), (iPTR 0))), addr:$src1), @@ -1205,7 +1236,7 @@ let Predicates = [HasSSE1] in { (MOVLPSmr addr:$src1, VR128:$src2)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { // Shuffle with MOVLPD def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; @@ -1279,7 +1310,7 @@ let Predicates = [HasAVX] in { (VMOVHPDrm VR128:$src1, addr:$src2)>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { // MOVHPS patterns def : Pat<(X86Movlhps VR128:$src1, (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), @@ -1289,7 +1320,7 @@ let Predicates = [HasSSE1] in { (MOVHPSrm VR128:$src1, addr:$src2)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold // cause it has two uses through a bitcast. One use disappears at isel time @@ -1346,7 +1377,7 @@ let Predicates = [HasAVX] in { (VMOVHLPSrr VR128:$src1, VR128:$src2)>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { // MOVLHPS patterns def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), (MOVLHPSrr VR128:$src1, VR128:$src2)>; @@ -1456,7 +1487,7 @@ def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}", def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>; -let Predicates = [HasAVX], AddedComplexity = 1 in { +let Predicates = [HasAVX] in { def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), @@ -1633,7 +1664,7 @@ defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem, defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, "cvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, - TB, Requires<[HasSSE2]>; + TB, Requires<[UseSSE2]>; /// SSE 2 Only @@ -1663,7 +1694,7 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), [(set FR32:$dst, (fround (loadf64 addr:$src)))], IIC_SSE_CVT_Scalar_RM>, XD, - Requires<[HasSSE2, OptForSize]>; + Requires<[UseSSE2, OptForSize]>; def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -1684,13 +1715,13 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XD, Requires<[HasSSE2]>; + IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>; def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, sse_load_f64:$src2))], - IIC_SSE_CVT_Scalar_RM>, XD, 
Requires<[HasSSE2]>; + IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>; } // Convert scalar single to scalar double @@ -1709,30 +1740,28 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>; } -let AddedComplexity = 1 in { // give AVX priority - def : Pat<(f64 (fextend FR32:$src)), - (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>; - def : Pat<(fextend (loadf32 addr:$src)), - (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>; +def : Pat<(f64 (fextend FR32:$src)), + (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>; +def : Pat<(fextend (loadf32 addr:$src)), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>; - def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; - def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, - Requires<[HasAVX, OptForSpeed]>; -} // AddedComplexity = 1 +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, + Requires<[HasAVX, OptForSpeed]>; def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (fextend FR32:$src))], IIC_SSE_CVT_Scalar_RR>, XS, - Requires<[HasSSE2]>; + Requires<[UseSSE2]>; def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (extloadf32 addr:$src))], IIC_SSE_CVT_Scalar_RM>, XS, - Requires<[HasSSE2, OptForSize]>; + Requires<[UseSSE2, OptForSize]>; // extload f32 -> f64. This matches load+fextend because we have a hack in // the isel (PreprocessForFPConvert) that can introduce loads after dag @@ -1740,9 +1769,9 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), // Since these loads aren't folded into the fextend, we have to match it // explicitly here. 
def : Pat<(fextend (loadf32 addr:$src)), - (CVTSS2SDrm addr:$src)>, Requires<[HasSSE2]>; + (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>; def : Pat<(extloadf32 addr:$src), - (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>; + (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -1762,13 +1791,13 @@ def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, "cvtss2sd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XS, Requires<[HasSSE2]>; + IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>; def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "cvtss2sd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], - IIC_SSE_CVT_Scalar_RM>, XS, Requires<[HasSSE2]>; + IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>; } // Convert packed single/double fp to doubleword @@ -1904,7 +1933,7 @@ let Predicates = [HasAVX] in { (VCVTTPS2DQYrm addr:$src)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), (CVTDQ2PSrr VR128:$src)>; def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), @@ -1978,10 +2007,10 @@ def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], IIC_SSE_CVT_PD_RR>, TB, VEX; -let neverHasSideEffects = 1, mayLoad = 1 in def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RM>, TB, VEX; + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, TB, VEX; def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, @@ -1994,15 +2023,15 @@ def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), IIC_SSE_CVT_PD_RM>, TB, VEX; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], IIC_SSE_CVT_PD_RR>, TB; -let neverHasSideEffects = 1, mayLoad = 1 in def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "cvtps2pd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RM>, TB; + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, TB; } // Convert Packed DW Integers to Packed Double FP @@ -2105,11 +2134,11 @@ let Predicates = [HasAVX] in { (VCVTPS2PDrr VR128:$src)>; def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), (VCVTPS2PDYrr VR128:$src)>; - def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))), + def : Pat<(v4f64 (extloadv4f32 addr:$src)), (VCVTPS2PDYrm addr:$src)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { // Match fextend for 128 conversions def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), (CVTPS2PDrr VR128:$src)>; @@ -2336,14 +2365,14 @@ def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)), (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), (CMPPSrri (v4f32 VR128:$src1), (v4f32 
VR128:$src2), imm:$cc)>; def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), @@ -2420,7 +2449,7 @@ let Predicates = [HasAVX] in { (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { def : Pat<(v4i32 (X86Shufp VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; @@ -2428,7 +2457,7 @@ let Predicates = [HasSSE1] in { (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { // Generic SHUFPD patterns def : Pat<(v2i64 (X86Shufp VR128:$src1, (memopv2i64 addr:$src2), (i8 imm:$imm))), @@ -2500,7 +2529,27 @@ let Constraints = "$src1 = $dst" in { SSEPackedDouble>, TB, OpSize; } // Constraints = "$src1 = $dst" -let Predicates = [HasAVX], AddedComplexity = 1 in { +let Predicates = [HasAVX1Only] in { + def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; +} + +let Predicates = [HasAVX] in { // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the // problem is during lowering, where it's not possible to recognize the load // fold cause it has two uses through a bitcast. One use disappears at isel @@ -2509,7 +2558,7 @@ let Predicates = [HasAVX], AddedComplexity = 1 in { (VUNPCKLPDrr VR128:$src, VR128:$src)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the // problem is during lowering, where it's not possible to recognize the load // fold cause it has two uses through a bitcast. 
One use disappears at isel @@ -2578,16 +2627,16 @@ defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", def : Pat<(i32 (X86fgetsign FR32:$src)), (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>, - Requires<[HasSSE1]>; + Requires<[UseSSE1]>; def : Pat<(i64 (X86fgetsign FR32:$src)), (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>, - Requires<[HasSSE1]>; + Requires<[UseSSE1]>; def : Pat<(i32 (X86fgetsign FR64:$src)), (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>, - Requires<[HasSSE2]>; + Requires<[UseSSE2]>; def : Pat<(i64 (X86fgetsign FR64:$src)), (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>, - Requires<[HasSSE2]>; + Requires<[UseSSE2]>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Logical Instructions @@ -2683,14 +2732,12 @@ multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, } // Alias bitwise logical operations using SSE logical ops on packed FP values. -let mayLoad = 0 in { - defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand, - SSE_BIT_ITINS_P>; - defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, - SSE_BIT_ITINS_P>; - defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, - SSE_BIT_ITINS_P>; -} +defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand, + SSE_BIT_ITINS_P>; +defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, + SSE_BIT_ITINS_P>; +defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, + SSE_BIT_ITINS_P>; let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef, @@ -2794,27 +2841,23 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, SizeItins itins, bit Is2Addr = 1> { - let mayLoad = 0 in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, memopv4f32, SSEPackedSingle, itins.s, Is2Addr>, TB; defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, memopv2f64, SSEPackedDouble, itins.d, Is2Addr>, TB, OpSize; - } } multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode, SizeItins itins> { - let mayLoad = 0 in { - defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, + defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>, TB; - defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, + defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>, TB, OpSize; - } } multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, @@ -2924,7 +2967,7 @@ let Constraints = "$src1 = $dst" in { } } -let isCommutable = 1, isCodeGenOnly = 1 in { +let isCodeGenOnly = 1 in { defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>, VEX_4V, VEX_LIG; defm VMAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P, 0>, @@ -2978,7 +3021,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, - Requires<[HasSSE1, OptForSize]>; + Requires<[UseSSE1, OptForSize]>; def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>; @@ -2992,7 +3035,7 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - let mayLoad = 1 in + let mayLoad = 1 in { def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; @@ -3000,6 +3043,7 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { (ins VR128:$src1, ssmem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + } } /// sse1_fp_unop_p - SSE1 unops in packed form. @@ -3062,7 +3106,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD, - Requires<[HasSSE2, OptForSize]>; + Requires<[UseSSE2, OptForSize]>; def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>; @@ -3072,20 +3116,20 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, } /// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form. +let hasSideEffects = 0 in multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { - let neverHasSideEffects = 1 in { def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - let mayLoad = 1 in + let mayLoad = 1 in { def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - } def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + } } /// sse2_fp_unop_p - SSE2 unops in vector forms. 
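Note on the predicate churn in the hunks above and below: the legacy-encoded patterns move from HasSSE1/HasSSE2/HasSSE3/HasSSSE3/HasSSE41/HasSSE42 to UseSSE*, and the 256-bit integer workarounds gain HasAVX1Only guards. The working assumption is that UseSSEn means "SSEn is available and AVX is not", so the VEX-encoded patterns always win on AVX-capable subtargets, and that HasAVX1Only means "AVX without AVX2", so the float-domain workarounds drop out once native 256-bit integer instructions exist. A minimal standalone model of those relationships, with the predicate bodies assumed rather than quoted from this diff (SubtargetModel, useSSE2 and hasAVX1Only here are illustrative names, not LLVM code):

#include <cassert>

struct SubtargetModel {
  bool HasSSE2;
  bool HasAVX;
  bool HasAVX2;

  // UseSSE2: pick the legacy SSE2 encoding only when AVX is unavailable.
  bool useSSE2() const { return HasSSE2 && !HasAVX; }
  // HasAVX1Only: AVX is available but AVX2's 256-bit integer ops are not.
  bool hasAVX1Only() const { return HasAVX && !HasAVX2; }
};

int main() {
  SubtargetModel SandyBridge{/*HasSSE2=*/true, /*HasAVX=*/true, /*HasAVX2=*/false};
  SubtargetModel Penryn{/*HasSSE2=*/true, /*HasAVX=*/false, /*HasAVX2=*/false};
  assert(!SandyBridge.useSSE2() && SandyBridge.hasAVX1Only());
  assert(Penryn.useSSE2() && !Penryn.hasAVX1Only());
  return 0;
}

Under that reading, the UseSSE* patterns simply cannot match once AVX is enabled, which is why the AddedComplexity bumps that used to give the AVX patterns priority are being dropped elsewhere in this diff.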
@@ -3176,7 +3220,6 @@ let Predicates = [HasAVX] in { SSE_RCPP>, VEX; } -let AddedComplexity = 1 in { def : Pat<(f32 (fsqrt FR32:$src)), (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; def : Pat<(f32 (fsqrt (load addr:$src))), @@ -3199,9 +3242,8 @@ def : Pat<(f32 (X86frcp FR32:$src)), def : Pat<(f32 (X86frcp (load addr:$src))), (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX, OptForSize]>; -} -let Predicates = [HasAVX], AddedComplexity = 1 in { +let Predicates = [HasAVX] in { def : Pat<(int_x86_sse_sqrt_ss VR128:$src), (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS VR128:$src, FR32)), @@ -3322,7 +3364,7 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), IIC_SSE_MOVNT>; def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), - (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>; // There is no AVX form for instructions below this point def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), @@ -3482,7 +3524,7 @@ def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", - [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>; + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; // For Disassembler let isCodeGenOnly = 1 in { @@ -3492,7 +3534,7 @@ def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", - [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>; + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; } let canFoldAsLoad = 1, mayLoad = 1 in { @@ -3504,7 +3546,7 @@ def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqu\t{$src, $dst|$dst, $src}", [/*(set VR128:$dst, (loadv2i64 addr:$src))*/], IIC_SSE_MOVU_P_RM>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; } let mayStore = 1 in { @@ -3516,7 +3558,7 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [/*(store (v2i64 VR128:$src), addr:$dst)*/], IIC_SSE_MOVU_P_MR>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; } // Intrinsic forms of MOVDQU load and store @@ -3530,7 +3572,7 @@ def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)], IIC_SSE_MOVU_P_MR>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; } // ExeDomain = SSEPackedInt @@ -4028,7 +4070,7 @@ let Predicates = [HasAVX2] in { (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), @@ -4210,7 +4252,7 @@ let Predicates = [HasAVX2] in { defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>, XD, VEX; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { let AddedComplexity = 5 in defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, X86PShufd>, TB, OpSize; @@ -4325,28 +4367,6 @@ let Constraints = "$src1 = $dst" in { } } // ExeDomain = SSEPackedInt -// Patterns for using AVX1 instructions with integer vectors -// Here to give AVX2 priority -let Predicates = [HasAVX] in { - def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 
addr:$src2)))), - (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), - (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), - (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), - (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; - - def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))), - (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), - (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))), - (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), - (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; -} - //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Extract and Insert //===---------------------------------------------------------------------===// @@ -4395,7 +4415,7 @@ let Predicates = [HasAVX] in { } let Constraints = "$src1 = $dst" in - defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[HasSSE2]>; + defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>; } // ExeDomain = SSEPackedInt @@ -4556,7 +4576,7 @@ def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), // Move Packed Doubleword Int first element to Doubleword Int // def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "vmov{d|q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, @@ -4672,14 +4692,14 @@ let Predicates = [HasAVX] in { } // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. 
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, - (v4i32 (scalar_to_vector GR32:$src)),(i32 0)))), + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector GR64:$src)),(i32 0)))), + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>; } -let Predicates = [HasSSE2], AddedComplexity = 20 in { +let Predicates = [UseSSE2], AddedComplexity = 20 in { def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), (MOVZDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), @@ -4719,7 +4739,7 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), [(set VR128:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))], IIC_SSE_MOVDQ>, XS, - Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix + Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix //===---------------------------------------------------------------------===// // Move Packed Quadword Int to Quadword Int @@ -4762,7 +4782,7 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))))], IIC_SSE_MOVDQ>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; let Predicates = [HasAVX], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), @@ -4773,7 +4793,7 @@ let Predicates = [HasAVX], AddedComplexity = 20 in { (VMOVZQI2PQIrm addr:$src)>; } -let Predicates = [HasSSE2], AddedComplexity = 20 in { +let Predicates = [UseSSE2], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), (MOVZQI2PQIrm addr:$src)>; def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), @@ -4803,7 +4823,7 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], IIC_SSE_MOVQ_RR>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; let AddedComplexity = 20 in def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), @@ -4818,7 +4838,7 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), [(set VR128:$dst, (v2i64 (X86vzmovl (loadv2i64 addr:$src))))], IIC_SSE_MOVDQ>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; } let AddedComplexity = 20 in { @@ -4828,7 +4848,7 @@ let AddedComplexity = 20 in { def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), (VMOVZPQILo2PQIrr VR128:$src)>; } - let Predicates = [HasSSE2] in { + let Predicates = [UseSSE2] in { def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), (MOVZPQILo2PQIrm addr:$src)>; def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), @@ -4908,7 +4928,7 @@ let Predicates = [HasAVX] in { (VMOVSLDUPYrm addr:$src)>; } -let Predicates = [HasSSE3] in { +let Predicates = [UseSSE3] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (MOVSHDUPrr VR128:$src)>; def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), @@ -4977,7 +4997,7 @@ let Predicates = [HasAVX] in { (VMOVDDUPYrr VR256:$src)>; } -let Predicates = [HasSSE3] in { +let Predicates = [UseSSE3] in { def : Pat<(X86Movddup (memopv2f64 addr:$src)), (MOVDDUPrm addr:$src)>; def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), @@ -5041,7 +5061,7 @@ let Predicates = [HasAVX] in { f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V; } } -let Constraints = "$src1 = $dst", Predicates = [HasSSE3] in { +let Constraints 
= "$src1 = $dst", Predicates = [UseSSE3] in { let ExeDomain = SSEPackedSingle in defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, f128mem, SSE_ALU_F32P>, TB, XD; @@ -5424,7 +5444,7 @@ let Predicates = [HasAVX] in defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V; let Predicates = [HasAVX2] in defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V; -let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in +let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in defm PALIGN : ssse3_palign<"palignr">; let Predicates = [HasAVX2] in { @@ -5449,7 +5469,7 @@ def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; } -let Predicates = [HasSSSE3] in { +let Predicates = [UseSSSE3] in { def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), @@ -5583,7 +5603,7 @@ let Predicates = [HasAVX] in { (VPMOVZXDQrm addr:$src)>; } -let Predicates = [HasSSE41] in { +let Predicates = [UseSSE41] in { // Common patterns involving scalar load. def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), (PMOVSXBWrm addr:$src)>; @@ -5633,7 +5653,7 @@ let Predicates = [HasAVX] in { def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; } -let Predicates = [HasSSE41] in { +let Predicates = [UseSSE41] in { def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; } @@ -5704,7 +5724,7 @@ let Predicates = [HasAVX] in { (VPMOVZXWQrm addr:$src)>; } -let Predicates = [HasSSE41] in { +let Predicates = [UseSSE41] in { // Common patterns involving scalar load def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), (PMOVSXBDrm addr:$src)>; @@ -5772,7 +5792,7 @@ let Predicates = [HasAVX] in { (VPMOVZXBQrm addr:$src)>; } -let Predicates = [HasSSE41] in { +let Predicates = [UseSSE41] in { // Common patterns involving scalar load def : Pat<(int_x86_sse41_pmovsxbq (bitconvert (v4i32 (X86vzmovl @@ -5918,7 +5938,7 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))), addr:$dst), (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, - Requires<[HasSSE41]>; + Requires<[UseSSE41]>; //===----------------------------------------------------------------------===// // SSE4.1 - Insert Instructions @@ -6190,6 +6210,15 @@ let Predicates = [HasAVX] in { (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; def : Pat<(f64 (ftrunc FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + + def : Pat<(v4f32 (ffloor VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x1))>; + def : Pat<(v2f64 (ffloor VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x1))>; + def : Pat<(v8f32 (ffloor VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0x1))>; + def : Pat<(v4f64 (ffloor VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x1))>; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, @@ -6199,26 +6228,33 @@ let Constraints = "$src1 = $dst" in defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", int_x86_sse41_round_ss, int_x86_sse41_round_sd>; -def : Pat<(ffloor FR32:$src), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; -def : Pat<(f64 (ffloor FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; -def : Pat<(f32 (fnearbyint FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; -def : Pat<(f64 (fnearbyint FR64:$src)), - 
(ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; -def : Pat<(f32 (fceil FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; -def : Pat<(f64 (fceil FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; -def : Pat<(f32 (frint FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; -def : Pat<(f64 (frint FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; -def : Pat<(f32 (ftrunc FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; -def : Pat<(f64 (ftrunc FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; +let Predicates = [UseSSE41] in { + def : Pat<(ffloor FR32:$src), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + def : Pat<(f64 (ffloor FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + def : Pat<(f64 (fceil FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + def : Pat<(f32 (frint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f64 (frint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + + def : Pat<(v4f32 (ffloor VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x1))>; + def : Pat<(v2f64 (ffloor VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x1))>; +} //===----------------------------------------------------------------------===// // SSE4.1 - Packed Bit Test @@ -6356,7 +6392,7 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; } -/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator +/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, Intrinsic IntId256> { let isCommutable = 1 in @@ -6705,7 +6741,7 @@ def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; -let Predicates = [HasSSE41] in { +let Predicates = [UseSSE41] in { def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), (v16i8 VR128:$src2))), (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; @@ -6802,9 +6838,8 @@ multiclass pseudo_pcmpistrm<string asm> { } let Defs = [EFLAGS], usesCustomInserter = 1 in { - let AddedComplexity = 1 in - defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; - defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[HasSSE42]>; + defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; + defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>; } let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in { @@ -6840,9 +6875,8 @@ multiclass pseudo_pcmpestrm<string asm> { } let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { - let AddedComplexity = 1 in - defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; - defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[HasSSE42]>; + defm VPCMPESTRM128 : 
pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; + defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>; } let Predicates = [HasAVX], @@ -7237,40 +7271,59 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), let Predicates = [HasAVX] in { def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; + +def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +} + +let Predicates = [HasAVX1Only] in { def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2), - (i32 imm)), +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), + (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2), - (i32 imm)), +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), + (bc_v4i32 (memopv2i64 addr:$src2)), + (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), - (i32 imm)), +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), + (bc_v16i8 (memopv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), + (bc_v8i16 (memopv2i64 addr:$src2)), + (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; } @@ -7290,56 +7343,61 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), []>, VEX; } -// Extract and store. 
-let Predicates = [HasAVX] in { - def : Pat<(alignedstore (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; - def : Pat<(alignedstore (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; - def : Pat<(alignedstore (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; - - def : Pat<(int_x86_sse_storeu_ps addr:$dst, (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2)), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; - def : Pat<(int_x86_sse2_storeu_pd addr:$dst, (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2)), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; - def : Pat<(int_x86_sse2_storeu_dq addr:$dst, (bc_v16i8 (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2))), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; -} - // AVX1 patterns let Predicates = [HasAVX] in { -def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), - (VEXTRACTF128rr VR256:$src1, imm:$src2)>; -def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), - (VEXTRACTF128rr VR256:$src1, imm:$src2)>; -def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), - (VEXTRACTF128rr VR256:$src1, imm:$src2)>; - -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v4f32 (VEXTRACTF128rr (v8f32 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v2f64 (VEXTRACTF128rr (v4f64 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + +def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v2i64 (VEXTRACTF128rr - (v4i64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v4i32 (VEXTRACTF128rr - (v8i32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v8i16 (VEXTRACTF128rr - (v16i16 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v16i16 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v16i8 (VEXTRACTF128rr - (v32i8 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + (v32i8 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + +def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, 
VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; } //===----------------------------------------------------------------------===// @@ -7456,29 +7514,29 @@ def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), } let Predicates = [HasAVX] in { +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, + (memopv4f64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + +let Predicates = [HasAVX1Only] in { def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1, - (memopv8f32 addr:$src2), (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; -def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, - (memopv4f64 addr:$src2), (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; @@ -7665,19 +7723,22 @@ let Predicates = [HasAVX2] in { } // AVX1 broadcast patterns -let Predicates = [HasAVX] in { +let Predicates = [HasAVX1Only] in { def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), (VBROADCASTSSYrm addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), (VBROADCASTSDYrm addr:$src)>; +def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; +} + +let Predicates = [HasAVX] in { def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), (VBROADCASTSSYrm addr:$src)>; def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), (VBROADCASTSDYrm addr:$src)>; def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), (VBROADCASTSSrm addr:$src)>; -def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VBROADCASTSSrm addr:$src)>; // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. 
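Both the VEXTRACTF128 store patterns above and the VINSERTI128/VEXTRACTI128 patterns further down take a subvector position, now written as an iPTR element index, and map it onto the instruction's one-bit lane immediate through INSERT_get_vinsertf128_imm / EXTRACT_get_vextractf128_imm. A rough standalone model of that index-to-immediate mapping (laneImmediate is an illustrative helper, not the TableGen fragment itself):

#include <cassert>

// EltBits is the scalar element width of the 256-bit vector type.
static unsigned laneImmediate(unsigned EltIndex, unsigned EltBits) {
  unsigned EltsPerLane = 128 / EltBits;  // e.g. 4 for v8i32, 2 for v4i64
  return EltIndex / EltsPerLane;         // 0 = low half, 1 = high half
}

int main() {
  assert(laneImmediate(0, 32) == 0);  // insert v4i32 at element 0 -> low lane
  assert(laneImmediate(4, 32) == 1);  // insert v4i32 at element 4 -> high lane
  assert(laneImmediate(2, 64) == 1);  // insert v2i64 at element 2 -> high lane
  return 0;
}

The switch from (i32 imm) to (iPTR imm) in these patterns follows the same idea: the subvector index in the selection DAG is pointer-sized, so the patterns spell it as iPTR rather than hard-coding i32.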
@@ -7757,7 +7818,6 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W;
 //===----------------------------------------------------------------------===//
 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
 //
-let AddedComplexity = 1 in {
 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -7768,9 +7828,8 @@ def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
                             (i8 imm:$src3)))]>, VEX_4V;
-}

-let Predicates = [HasAVX2], AddedComplexity = 1 in {
+let Predicates = [HasAVX2] in {
 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
@@ -7805,23 +7864,43 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          []>, VEX_4V;
 }

-let Predicates = [HasAVX2], AddedComplexity = 1 in {
+let Predicates = [HasAVX2] in {
 def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
-                                   (i32 imm)),
+                                   (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+                                   (bc_v4i32 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+                                   (bc_v16i8 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+                                   (bc_v8i16 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
 }

 //===----------------------------------------------------------------------===//
@@ -7838,23 +7917,40 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, VEX;
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+let Predicates = [HasAVX2] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v2i64 (VEXTRACTI128rr
                     (v4i64 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v4i32 (VEXTRACTI128rr
                     (v8i32 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v8i16 (VEXTRACTI128rr
                     (v16i16 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
           (v16i8 (VEXTRACTI128rr
                     (v32i8 VR256:$src1),
                     (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+                          (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+                          (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+                          (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+                          (EXTRACT_get_vextractf128_imm VR128:$ext))>;
 }

 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index 7ac4cec..764aa5d 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -532,7 +532,7 @@ uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) {
 #endif
 }

-template<typename T> void addUnaligned(void *Pos, T Delta) {
+template<typename T> static void addUnaligned(void *Pos, T Delta) {
   T Value;
   std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos),
               sizeof(T));
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 9c0ce4e..1c2ef25 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -377,12 +377,6 @@ ReSimplify:
   case X86::SETB_C64r: LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break;
   case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
   case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
-  case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
-  case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break;
-  case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
-  case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
-  case X86::AVX2_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDYrr);break;
-  case X86::AVX2_SET0: LowerUnaryToTwoAddr(OutMI, X86::VPXORYrr); break;

   case X86::MOV16r0:
     LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV16r0 -> MOV32r0
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 877b8f6..3b4cfc4 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -522,7 +522,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,

 void
 X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
-                                     int SPAdj, RegScavenger *RS) const{
+                                     int SPAdj, RegScavenger *RS) const {
   assert(SPAdj == 0 && "Unexpected");

   unsigned i = 0;
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 9087852..0d7b664 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -346,6 +346,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
   , HasVectorUAMem(false)
   , HasCmpxchg16b(false)
   , UseLeaForSP(false)
+  , HasSlowDivide(false)
   , PostRAScheduler(false)
   , stackAlignment(4)
   // FIXME: this is a known good value for Yonah. How about others?
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 6841c5b..dde7e24 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -136,6 +136,10 @@ protected:
   /// the stack pointer. This is an optimization for Intel Atom processors.
   bool UseLeaForSP;

+  /// HasSlowDivide - True if smaller divides are significantly faster than
+  /// full divides and should be used when possible.
+  bool HasSlowDivide;
+
   /// PostRAScheduler - True if using post-register-allocation scheduler.
   bool PostRAScheduler;

@@ -198,6 +202,7 @@ public:
   bool hasSSE42() const { return X86SSELevel >= SSE42; }
   bool hasAVX() const { return X86SSELevel >= AVX; }
   bool hasAVX2() const { return X86SSELevel >= AVX2; }
+  bool hasNoAVX() const { return X86SSELevel < AVX; }
   bool hasSSE4A() const { return HasSSE4A; }
   bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
   bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
@@ -205,7 +210,8 @@ public:
   bool hasAES() const { return HasAES; }
   bool hasPCLMUL() const { return HasPCLMUL; }
   bool hasFMA() const { return HasFMA; }
-  bool hasFMA4() const { return HasFMA4; }
+  // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
+  bool hasFMA4() const { return HasFMA4 && !HasFMA; }
   bool hasXOP() const { return HasXOP; }
   bool hasMOVBE() const { return HasMOVBE; }
   bool hasRDRAND() const { return HasRDRAND; }
@@ -219,6 +225,7 @@ public:
   bool hasVectorUAMem() const { return HasVectorUAMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
+  bool hasSlowDivide() const { return HasSlowDivide; }

   bool isAtom() const { return X86ProcFamily == IntelAtom; }

diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 80b75dc..449eed3 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -42,7 +42,6 @@ namespace {

   private:
     const TargetInstrInfo *TII; // Machine instruction info.
-    MachineBasicBlock *MBB;     // Current basic block

     // Any YMM register live-in to this function?
     bool FnHasLiveInYmm;
@@ -84,7 +83,7 @@ namespace {
     // 2) All states must be clean for the result to be clean
     // 3) If none above and one unknown, the result state is also unknown
     //
-    unsigned computeState(unsigned PrevState, unsigned CurState) {
+    static unsigned computeState(unsigned PrevState, unsigned CurState) {
       if (PrevState == ST_INIT)
         return CurState;

@@ -122,7 +121,7 @@ static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
 }

 static bool hasYmmReg(MachineInstr *MI) {
-  for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
     if (!MO.isReg())
       continue;
@@ -189,7 +188,6 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
                                            MachineBasicBlock &BB) {
   bool Changed = false;
   unsigned BBNum = BB.getNumber();
-  MBB = &BB;

   // Don't process already solved BBs
   if (BBSolved[BBNum])
@@ -207,7 +205,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,

   // The entry MBB for the function may set the initial state to dirty if
   // the function receives any YMM incoming arguments
-  if (MBB == MF.begin()) {
+  if (&BB == MF.begin()) {
     EntryState = ST_CLEAN;
     if (FnHasLiveInYmm)
       EntryState = ST_DIRTY;
@@ -253,7 +251,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
     // When unknown, only compute the information within the block to have
     // it available in the exit if possible, but don't change the block.
     if (EntryState != ST_UNKNOWN) {
-      BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
+      BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
       ++NumVZU;
     }

diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index ae646a2..3e7666b 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -33,7 +33,7 @@ def XCoreBranchLink : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink,
                             SDNPVariadic]>;

 def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind,
-                      [SDNPHasChain, SDNPOptInGlue]>;
+                      [SDNPHasChain, SDNPOptInGlue, SDNPMayLoad]>;

 def SDT_XCoreBR_JT : SDTypeProfile<0, 2,
                                    [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
@@ -58,7 +58,7 @@ def cprelwrapper : SDNode<"XCoreISD::CPRelativeWrapper", SDT_XCoreAddress,
 def SDT_XCoreStwsp : SDTypeProfile<0, 2, [SDTCisInt<1>]>;

 def XCoreStwsp : SDNode<"XCoreISD::STWSP", SDT_XCoreStwsp,
-                        [SDNPHasChain]>;
+                        [SDNPHasChain, SDNPMayStore]>;

 // These are target-independent nodes, but have target-specific formats.
 def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index cdd0a08..be5855a 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -176,7 +176,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,

   #ifndef NDEBUG
   DEBUG(errs() << "\nFunction : "
-        << MF.getFunction()->getName() << "\n");
+        << MF.getName() << "\n");
   DEBUG(errs() << "<--------->\n");
   DEBUG(MI.print(errs()));
   DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n");